# File-viewer header residue: Python source, 366 lines, 12 KiB.
import argparse
|
|
import json
|
|
import re
|
|
from html import unescape
|
|
from html.parser import HTMLParser
|
|
from pathlib import Path
|
|
from typing import Any
|
|
from urllib.parse import urlparse
|
|
|
|
import requests
|
|
|
|
# REST endpoint that lists the "meninger" posts of the old WordPress site.
WP_API_BASE = "https://teeoff.no/wp-json/wp/v2/meninger"
# User-Agent header sent with every outgoing HTTP request.
DEFAULT_USER_AGENT = "TeeOff-Meninger-Importer/1.0"
# Default destination of the generated JSON file (overridable with --output).
DEFAULT_OUTPUT = Path("/opt/teeoff/frontend/src/content/importedMeninger.json")
# Default directory for downloaded images (overridable with --media-dir).
DEFAULT_MEDIA_DIR = Path("/opt/teeoff/frontend/public/media/meninger")

# Captures <slug> from links of the form https://teeoff.no/golfbaner/<slug>.
# The slug has to end at quotes, whitespace and angle brackets as well as at
# "/", "?" and "#"; otherwise an href without a trailing slash would swallow
# the closing quote and the markup that follows, up to the next "/".
INTERNAL_GOLF_COURSE_PATTERN = re.compile(
    r"https?://teeoff\.no/golfbaner/([^/?#\"'\s<>]+)/?",
    re.IGNORECASE,
)
# Captures the path of any teeoff.no link, ending at the first quote, "#",
# "?", whitespace character or angle bracket.
INTERNAL_TEEOFF_LINK_PATTERN = re.compile(
    r"https?://teeoff\.no/([^\"'#?\s<>]+)",
    re.IGNORECASE,
)
# Captures the value of the src attribute of <img> tags.
IMG_SRC_PATTERN = re.compile(r"<img\b[^>]*\bsrc=['\"]([^'\"]+)['\"]", re.IGNORECASE)

# Leading path segments of internal links that are never considered when
# looking for golf facility slugs.
DISALLOWED_INTERNAL_SEGMENTS = {
    "wp-content",
    "wp-json",
    "meninger",
    "category",
    "author",
    "tag",
    "feed",
}
|
|
|
|
|
|
class TextExtractor(HTMLParser):
|
|
def __init__(self) -> None:
|
|
super().__init__()
|
|
self.parts: list[str] = []
|
|
|
|
def handle_data(self, data: str) -> None:
|
|
if data:
|
|
self.parts.append(data)
|
|
|
|
def get_text(self) -> str:
|
|
return " ".join(part.strip() for part in self.parts if part.strip())
|
|
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Build the command-line interface and parse the process arguments.

    Returns:
        The parsed namespace with ``output``, ``media_dir``, ``per_page``,
        ``category``, ``download_media``, ``draft`` and ``limit``.
    """
    cli = argparse.ArgumentParser(
        description="Importer Meninger-artikler fra gammel TeeOff WordPress-instans."
    )

    # (flag, keyword arguments) pairs, registered in this order.
    option_specs: tuple[tuple[str, dict[str, Any]], ...] = (
        (
            "--output",
            {
                "default": str(DEFAULT_OUTPUT),
                "help": f"Sti til JSON-filen som skal skrives. Standard: {DEFAULT_OUTPUT}",
            },
        ),
        (
            "--media-dir",
            {
                "default": str(DEFAULT_MEDIA_DIR),
                "help": f"Mappe for nedlastede bilder. Standard: {DEFAULT_MEDIA_DIR}",
            },
        ),
        (
            "--per-page",
            {
                "type": int,
                "default": 100,
                "help": "Antall artikler per API-kall. Standard: 100",
            },
        ),
        (
            "--category",
            {
                "default": None,
                "help": "Filtrer på kategorislug, f.eks. 'banebesok'.",
            },
        ),
        (
            "--download-media",
            {
                "action": "store_true",
                "help": "Last ned featured media og inline-bilder lokalt og skriv om URL-er i HTML.",
            },
        ),
        (
            "--draft",
            {
                "action": "store_true",
                "help": "Ta med artikler som ikke er publisert dersom API-et returnerer dem.",
            },
        ),
        (
            "--limit",
            {
                "type": int,
                "default": None,
                "help": "Maks antall artikler som skal skrives etter filtrering.",
            },
        ),
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)

    return cli.parse_args()
|
|
|
|
|
|
def fetch_json(url: str, params: dict[str, Any] | None = None) -> Any:
    """GET ``url`` and return the decoded JSON body.

    Args:
        url: Address to request.
        params: Optional query-string parameters.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    request_headers = {"User-Agent": DEFAULT_USER_AGENT}
    reply = requests.get(url, params=params, timeout=30, headers=request_headers)
    reply.raise_for_status()
    return reply.json()
|
|
|
|
|
|
def fetch_all_posts(per_page: int, limit: int | None = None) -> list[dict[str, Any]]:
    """Page through ``WP_API_BASE`` and return the collected posts.

    Args:
        per_page: Number of posts requested per API call.
        limit: When given, stop as soon as this many posts have been
            collected and return exactly the first ``limit`` of them.

    Raises:
        requests.HTTPError: For any HTTP error other than a 400 on a page
            after the first one.
    """
    collected: list[dict[str, Any]] = []
    page_number = 1

    while True:
        query = {"per_page": per_page, "page": page_number, "_embed": "1"}
        try:
            batch = fetch_json(WP_API_BASE, params=query)
        except requests.HTTPError as error:
            reply = error.response
            # A 400 on any page after the first is taken to mean that the
            # requested page lies past the end of the collection.
            past_last_page = (
                reply is not None and reply.status_code == 400 and page_number > 1
            )
            if not past_last_page:
                raise
            break

        if not batch:
            break

        collected.extend(batch)
        if limit is not None and len(collected) >= limit:
            return collected[:limit]
        page_number += 1

    return collected
|
|
|
|
|
|
def strip_tags(value: str | None) -> str:
    """Return the plain text of an HTML snippet; empty or ``None`` gives ``""``.

    Character references are decoded before the markup is parsed.
    """
    if value:
        extractor = TextExtractor()
        extractor.feed(unescape(value))
        extractor.close()
        return extractor.get_text()
    return ""
|
|
|
|
|
|
def ensure_directory(path: Path) -> None:
    """Create ``path`` and any missing parents; an existing directory is fine."""
    path.mkdir(exist_ok=True, parents=True)
|
|
|
|
|
|
def choose_media_url(media_entry: dict[str, Any]) -> str | None:
|
|
media_details = media_entry.get("media_details") or {}
|
|
sizes = media_details.get("sizes") or {}
|
|
for key in ("full", "1536x1536", "large", "medium"):
|
|
candidate = sizes.get(key, {}).get("source_url")
|
|
if candidate:
|
|
return candidate
|
|
return media_entry.get("source_url")
|
|
|
|
|
|
def _public_url_path(target_path: Path) -> str | None:
    """Return the site-absolute URL path of a file stored below a ``public`` directory."""
    default_root = Path("/opt/teeoff/frontend/public")
    try:
        return "/" + target_path.relative_to(default_root).as_posix()
    except ValueError:
        pass
    # Custom --media-dir: fall back to the nearest ancestor named "public".
    for ancestor in target_path.parents:
        if ancestor.name == "public":
            return "/" + target_path.relative_to(ancestor).as_posix()
    return None


def download_file(url: str, target_dir: Path, basename: str) -> str | None:
    """Download ``url`` into ``target_dir`` and return its public URL path.

    The file is stored as ``<basename><suffix>``. The suffix is taken from the
    URL path when it is a known image extension, otherwise ``.jpg`` is used.
    A file that already exists is not downloaded again.

    Args:
        url: Remote address of the image.
        target_dir: Directory to store the file in; created when missing.
        basename: File name without extension.

    Returns:
        The path of the file relative to the ``public`` directory, with a
        leading ``/`` and forward slashes. ``None`` is returned when the
        download fails, or when ``target_dir`` is not located below a
        ``public`` directory so no URL can be derived; callers then keep the
        original remote URL.
    """
    ensure_directory(target_dir)

    suffix = Path(urlparse(url).path).suffix.lower()
    if suffix not in {".jpg", ".jpeg", ".png", ".webp", ".gif", ".avif"}:
        suffix = ".jpg"
    target_path = target_dir / f"{basename}{suffix}"

    if not target_path.exists():
        try:
            response = requests.get(
                url,
                timeout=60,
                headers={"User-Agent": DEFAULT_USER_AGENT},
            )
            response.raise_for_status()
        except requests.RequestException as exc:
            # One unreachable image must not abort the whole import.
            print(f"⚠️ Kunne ikke laste ned {url}: {exc}")
            return None
        target_path.write_bytes(response.content)

    return _public_url_path(target_path)
|
|
|
|
|
|
def extract_categories(post: dict[str, Any]) -> list[dict[str, str]]:
    """List the category terms embedded in ``post``.

    Reads ``post["_embedded"]["wp:term"]``, skips groups that are not lists
    and terms whose taxonomy is not ``category``, and returns one dict per
    remaining term with string-valued ``id``, ``name`` and ``slug``. Missing
    fields become empty strings.
    """
    term_groups = (post.get("_embedded") or {}).get("wp:term") or []
    return [
        {
            "id": str(term.get("id", "")),
            "name": str(term.get("name", "")),
            "slug": str(term.get("slug", "")),
        }
        for group in term_groups
        if isinstance(group, list)
        for term in group
        if term.get("taxonomy") == "category"
    ]
|
|
|
|
|
|
def detect_facility_slugs(html: str) -> list[str]:
    """Find golf facility slugs referenced by internal links in ``html``.

    Two sources are combined, in this order:

    1. Slugs captured by ``INTERNAL_GOLF_COURSE_PATTERN``
       (``/golfbaner/<slug>`` links).
    2. The last path segment of any other teeoff.no link, provided the first
       segment is not in ``DISALLOWED_INTERNAL_SEGMENTS`` and the last
       segment contains ``"golf"``.

    A link to the ``golfbaner`` section index itself is ignored: its last
    segment contains "golf" but names the listing page, not a facility.

    Returns:
        The slugs without duplicates, in order of first appearance.
    """
    if not html:
        return []

    found = INTERNAL_GOLF_COURSE_PATTERN.findall(html)

    for raw_path in INTERNAL_TEEOFF_LINK_PATTERN.findall(html):
        segments = [segment for segment in raw_path.split("/") if segment]
        if not segments or segments[0] in DISALLOWED_INTERNAL_SEGMENTS:
            continue
        candidate = segments[-1]
        if candidate.lower() == "golfbaner":
            continue
        if "golf" not in candidate:
            continue
        found.append(candidate)

    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(found))
|
|
|
|
|
|
def collect_inline_image_urls(html: str) -> list[str]:
    """Return the distinct ``<img src>`` values in ``html`` that start with "http".

    Order of first appearance is preserved; other sources are ignored.
    """
    absolute_sources = (
        src for src in IMG_SRC_PATTERN.findall(html or "") if src.startswith("http")
    )
    return list(dict.fromkeys(absolute_sources))
|
|
|
|
|
|
def rewrite_html_media(
    html: str,
    post_slug: str,
    target_dir: Path,
    featured_url: str | None = None,
) -> tuple[str, list[str], str | None]:
    """Download a post's images and point its HTML at the local copies.

    The featured image is stored as ``<post_slug>-featured`` and inline images
    as ``<post_slug>-inline-NN``, numbered from 01 in document order. An image
    whose download yields no local path keeps its number reserved and its
    original URL in the HTML.

    Args:
        html: Rendered post content.
        post_slug: Prefix for the generated file names.
        target_dir: Directory the files are written to.
        featured_url: Remote URL of the featured image, if any.

    Returns:
        ``(rewritten_html, local_paths, local_featured)``: the HTML with
        downloaded URLs replaced, every local path produced (featured image
        first), and the local path of the featured image or ``None``.
    """
    downloaded_urls: list[str] = []
    rewrite_map: dict[str, str] = {}
    local_featured: str | None = None

    if featured_url:
        local_featured = download_file(featured_url, target_dir, f"{post_slug}-featured")
        if local_featured:
            downloaded_urls.append(local_featured)
            rewrite_map[featured_url] = local_featured

    for image_index, url in enumerate(collect_inline_image_urls(html), start=1):
        local_path = download_file(url, target_dir, f"{post_slug}-inline-{image_index:02d}")
        if not local_path:
            continue
        rewrite_map[url] = local_path
        downloaded_urls.append(local_path)

    rewritten_html = html or ""
    # Replace longer URLs first: when one URL is a prefix of another (for
    # example "a.jpg" and "a.jpg?w=300"), replacing the shorter one first
    # would rewrite the start of the longer one and leave its tail behind.
    for original in sorted(rewrite_map, key=len, reverse=True):
        rewritten_html = rewritten_html.replace(original, rewrite_map[original])

    return rewritten_html, downloaded_urls, local_featured
|
|
|
|
|
|
def normalize_post(
    post: dict[str, Any],
    category_filter: str | None,
    download_media: bool,
    media_dir: Path,
) -> dict[str, Any] | None:
    """Convert one WordPress post into the record written to the output file.

    Args:
        post: Post object as returned by the API, including ``_embedded``.
        category_filter: When set, posts without this category slug are
            rejected.
        download_media: Whether images are downloaded and the content HTML is
            rewritten to reference the local copies.
        media_dir: Target directory for downloaded images.

    Returns:
        The normalized record, or ``None`` when the category filter rejects
        the post.
    """
    categories = extract_categories(post)
    category_slugs = [item["slug"] for item in categories if item.get("slug")]
    if category_filter and category_filter not in category_slugs:
        return None

    def rendered(field: str) -> str:
        # WordPress wraps these fields as {"rendered": "<html>"}.
        return str((post.get(field) or {}).get("rendered") or "")

    title_html = rendered("title")
    excerpt_html = rendered("excerpt")
    content_html = rendered("content")

    embedded = post.get("_embedded") or {}

    def first_embedded(key: str) -> dict[str, Any]:
        # First element of an embedded list, or {} when absent or empty.
        return ((embedded.get(key) or [None])[0]) or {}

    author_entry = first_embedded("author")
    featured_entry = first_embedded("wp:featuredmedia")

    if featured_entry:
        featured_url = choose_media_url(featured_entry)
        featured_alt = str(featured_entry.get("alt_text") or "")
    else:
        featured_url = None
        featured_alt = ""
    caption_html = str((featured_entry.get("caption") or {}).get("rendered") or "")
    featured_caption = strip_tags(caption_html)

    downloaded_media: list[str] = []
    featured_image = featured_url
    if download_media:
        file_prefix = str(post.get("slug") or "mening")
        content_html, downloaded_media, local_featured = rewrite_html_media(
            content_html,
            file_prefix,
            media_dir,
            featured_url,
        )
        featured_image = local_featured or featured_url

    facility_slugs = detect_facility_slugs(content_html)
    primary_facility = facility_slugs[0] if facility_slugs else None

    featured_record: dict[str, Any] | None = None
    if featured_url or featured_image:
        featured_record = {
            "url": featured_image,
            "originalUrl": featured_url,
            "alt": featured_alt,
            "caption": featured_caption,
        }

    author_record = {
        "id": author_entry.get("id"),
        "name": author_entry.get("name"),
        "slug": author_entry.get("slug"),
        "link": author_entry.get("link"),
    }

    return {
        "id": post.get("id"),
        "slug": post.get("slug"),
        "status": str(post.get("status") or ""),
        "type": post.get("type"),
        "link": post.get("link"),
        "title": strip_tags(title_html),
        "titleHtml": title_html,
        "excerpt": strip_tags(excerpt_html),
        "excerptHtml": excerpt_html,
        "contentHtml": content_html,
        "publishedAt": post.get("date"),
        "updatedAt": post.get("modified"),
        "author": author_record,
        "featuredImage": featured_record,
        "inlineMedia": downloaded_media,
        "categories": categories,
        "categorySlugs": category_slugs,
        "facilitySlugs": facility_slugs,
        "primaryFacilitySlug": primary_facility,
    }
|
|
|
|
|
|
def main() -> None:
    """Run the import: fetch the posts, normalize them and write the JSON file.

    Unpublished posts are skipped unless ``--draft`` is given, and posts
    rejected by the category filter are skipped as well. ``--limit`` caps the
    number of posts written *after* that filtering, as its help text states,
    so it is applied here rather than while fetching.
    """
    args = parse_args()
    output_path = Path(args.output)
    media_dir = Path(args.media_dir)

    print("🚀 Starter import av Meninger fra WordPress...")
    # Fetch without a limit: truncating before the status and category
    # filters could leave fewer posts than requested even though more
    # matching posts exist on later pages.
    posts = fetch_all_posts(args.per_page)
    print(f"📦 Hentet {len(posts)} artikler fra {WP_API_BASE}")

    normalized_posts: list[dict[str, Any]] = []
    for post in posts:
        # Stop before normalizing further posts so no media is downloaded
        # for posts that would not be written anyway.
        if args.limit is not None and len(normalized_posts) >= args.limit:
            break
        if not args.draft and str(post.get("status") or "") != "publish":
            continue
        normalized = normalize_post(
            post,
            category_filter=args.category,
            download_media=args.download_media,
            media_dir=media_dir,
        )
        if normalized is None:
            continue
        normalized_posts.append(normalized)

    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(
        json.dumps(normalized_posts, ensure_ascii=False, indent=2) + "\n",
        encoding="utf-8",
    )

    print(f"✅ Skrev {len(normalized_posts)} artikler til {output_path}")
    if args.download_media:
        print(f"🖼️ Bilder ble lagret under {media_dir}")

    if args.category:
        print(f"🏷️ Kategorifilter brukt: {args.category}")

    linked_count = sum(1 for post in normalized_posts if post.get("primaryFacilitySlug"))
    print(f"⛳ {linked_count} artikler fikk koblet minst én golfbane-slug fra internlenker.")


# Run the importer only when the file is executed as a script.
if __name__ == "__main__":
    main()
|