Nye-TeeOff/backend/import_meninger.py

367 lines
12 KiB
Python
Raw Permalink Normal View History

2026-04-13 15:29:43 +02:00
import argparse
import json
import re
from html import unescape
from html.parser import HTMLParser
from pathlib import Path
from typing import Any
from urllib.parse import urlparse
import requests
# WordPress REST API collection endpoint the articles are fetched from.
WP_API_BASE = "https://teeoff.no/wp-json/wp/v2/meninger"
# User-Agent header sent with every outgoing HTTP request.
DEFAULT_USER_AGENT = "TeeOff-Meninger-Importer/1.0"
# Default destinations; overridable with --output and --media-dir.
DEFAULT_OUTPUT = Path("/opt/teeoff/frontend/src/content/importedMeninger.json")
DEFAULT_MEDIA_DIR = Path("/opt/teeoff/frontend/public/media/meninger")

# Captures <slug> from links of the form https://teeoff.no/golfbaner/<slug>.
# Quotes, whitespace and angle brackets are excluded from the slug so that an
# href without a trailing slash (href=".../golfbaner/oslo-gk">Oslo GK</a>)
# does not swallow the closing quote and the link text, and so that a link to
# the bare /golfbaner/ index does not produce a bogus slug.
INTERNAL_GOLF_COURSE_PATTERN = re.compile(
    r"https?://teeoff\.no/golfbaner/([^/?#\"'\s<>]+)/?",
    re.IGNORECASE,
)
# Captures the path (without query string or fragment) of any teeoff.no link.
# Angle brackets and all whitespace terminate the path so that URLs written
# as plain text directly before a tag are not captured together with markup.
INTERNAL_TEEOFF_LINK_PATTERN = re.compile(
    r"https?://teeoff\.no/([^\"'#?\s<>]+)",
    re.IGNORECASE,
)
# Captures the value of the src attribute of every <img> tag.
IMG_SRC_PATTERN = re.compile(r"<img\b[^>]*\bsrc=['\"]([^'\"]+)['\"]", re.IGNORECASE)
# First path segments of internal links that are never treated as facility pages.
DISALLOWED_INTERNAL_SEGMENTS = {
    "wp-content",
    "wp-json",
    "meninger",
    "category",
    "author",
    "tag",
    "feed",
}
class TextExtractor(HTMLParser):
    """HTML parser that keeps only the character data of the markup it is fed."""

    def __init__(self) -> None:
        super().__init__()
        # Text fragments in the order the parser reported them; trimming is
        # deferred to get_text().
        self.parts: list[str] = []

    def handle_data(self, data: str) -> None:
        """Record one non-empty run of character data."""
        if not data:
            return
        self.parts.append(data)

    def get_text(self) -> str:
        """Return the recorded fragments, trimmed and joined by single spaces.

        Fragments that consist only of whitespace are dropped.
        """
        trimmed = (fragment.strip() for fragment in self.parts)
        return " ".join(fragment for fragment in trimmed if fragment)
def parse_args() -> argparse.Namespace:
    """Parse the importer's command-line options.

    Returns:
        The namespace produced by argparse with the attributes ``output``,
        ``media_dir``, ``per_page``, ``category``, ``download_media``,
        ``draft`` and ``limit``.
    """
    cli = argparse.ArgumentParser(
        description="Importer Meninger-artikler fra gammel TeeOff WordPress-instans."
    )
    # One (flag, keyword arguments) pair per option, registered in this order.
    option_specs: list[tuple[str, dict[str, Any]]] = [
        (
            "--output",
            {
                "default": str(DEFAULT_OUTPUT),
                "help": f"Sti til JSON-filen som skal skrives. Standard: {DEFAULT_OUTPUT}",
            },
        ),
        (
            "--media-dir",
            {
                "default": str(DEFAULT_MEDIA_DIR),
                "help": f"Mappe for nedlastede bilder. Standard: {DEFAULT_MEDIA_DIR}",
            },
        ),
        (
            "--per-page",
            {
                "type": int,
                "default": 100,
                "help": "Antall artikler per API-kall. Standard: 100",
            },
        ),
        (
            "--category",
            {
                "default": None,
                "help": "Filtrer på kategorislug, f.eks. 'banebesok'.",
            },
        ),
        (
            "--download-media",
            {
                "action": "store_true",
                "help": "Last ned featured media og inline-bilder lokalt og skriv om URL-er i HTML.",
            },
        ),
        (
            "--draft",
            {
                "action": "store_true",
                "help": "Ta med artikler som ikke er publisert dersom API-et returnerer dem.",
            },
        ),
        (
            "--limit",
            {
                "type": int,
                "default": None,
                "help": "Maks antall artikler som skal skrives etter filtrering.",
            },
        ),
    ]
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
def fetch_json(url: str, params: dict[str, Any] | None = None) -> Any:
    """Perform a GET request and return the decoded JSON body.

    Args:
        url: Address to request.
        params: Optional query-string parameters.

    Returns:
        Whatever the response body decodes to as JSON.

    Raises:
        requests.HTTPError: If ``raise_for_status()`` rejects the response.
    """
    request_headers = {"User-Agent": DEFAULT_USER_AGENT}
    reply = requests.get(url, params=params, headers=request_headers, timeout=30)
    reply.raise_for_status()
    return reply.json()
def fetch_all_posts(per_page: int, limit: int | None = None) -> list[dict[str, Any]]:
    """Fetch posts from ``WP_API_BASE`` page by page.

    Args:
        per_page: Number of posts requested per API call.
        limit: Optional cap; fetching stops as soon as this many posts have
            been collected and the result is truncated to exactly ``limit``.

    Returns:
        The collected posts in the order the API delivered them.

    Raises:
        requests.HTTPError: For any HTTP error other than a 400 response on a
            page after the first.
    """
    collected: list[dict[str, Any]] = []
    page_number = 1
    while True:
        query = {"per_page": per_page, "page": page_number, "_embed": "1"}
        try:
            batch = fetch_json(WP_API_BASE, params=query)
        except requests.HTTPError as error:
            reply = error.response
            # A 400 on a follow-up page is treated as "no more pages"
            # (presumably how the API reports an out-of-range page number).
            past_last_page = (
                page_number > 1 and reply is not None and reply.status_code == 400
            )
            if not past_last_page:
                raise
            break
        if not batch:
            break
        collected.extend(batch)
        if limit is not None and len(collected) >= limit:
            return collected[:limit]
        page_number += 1
    return collected
def strip_tags(value: str | None) -> str:
if not value:
return ""
parser = TextExtractor()
parser.feed(unescape(value))
parser.close()
return parser.get_text()
def ensure_directory(path: Path) -> None:
    """Create *path* and any missing parents; do nothing if it already exists."""
    path.mkdir(exist_ok=True, parents=True)
def choose_media_url(media_entry: dict[str, Any]) -> str | None:
media_details = media_entry.get("media_details") or {}
sizes = media_details.get("sizes") or {}
for key in ("full", "1536x1536", "large", "medium"):
candidate = sizes.get(key, {}).get("source_url")
if candidate:
return candidate
return media_entry.get("source_url")
def download_file(url: str, target_dir: Path, basename: str) -> str | None:
    """Download *url* into *target_dir* and return its site-relative URL.

    The file is named ``<basename><suffix>``, where the suffix is taken from
    the URL path when it is a known image extension and defaults to ``.jpg``
    otherwise. A file that already exists is not downloaded again.

    Args:
        url: Remote address of the image.
        target_dir: Directory the file is stored in (created if missing).
        basename: File name without extension.

    Returns:
        The path of the file relative to the frontend's ``public`` directory,
        with a leading ``/`` and forward slashes, or ``None`` when the
        download fails or the target is not located below a ``public``
        directory. Callers treat ``None`` as "keep the original URL".
    """
    ensure_directory(target_dir)
    suffix = Path(urlparse(url).path).suffix.lower()
    if suffix not in {".jpg", ".jpeg", ".png", ".webp", ".gif", ".avif"}:
        suffix = ".jpg"
    target_path = target_dir / f"{basename}{suffix}"

    if not target_path.exists():
        try:
            response = requests.get(
                url,
                timeout=60,
                headers={"User-Agent": DEFAULT_USER_AGENT},
            )
            response.raise_for_status()
        except requests.RequestException as exc:
            # One unreachable image must not abort the whole import.
            print(f"⚠️ Kunne ikke laste ned {url}: {exc}")
            return None
        target_path.write_bytes(response.content)

    public_root = Path("/opt/teeoff/frontend/public")
    try:
        relative = target_path.relative_to(public_root)
    except ValueError:
        # A custom --media-dir outside the default root: fall back to the
        # innermost "public" directory in the path instead of crashing.
        parts = target_path.parts
        if "public" not in parts:
            print(f"⚠️ {target_path} ligger ikke under en 'public'-mappe; beholder original URL.")
            return None
        last_public = len(parts) - 1 - parts[::-1].index("public")
        relative = Path(*parts[last_public + 1:])
    return "/" + relative.as_posix()
def extract_categories(post: dict[str, Any]) -> list[dict[str, str]]:
    """Collect the category terms embedded in a post.

    Reads ``post["_embedded"]["wp:term"]``, ignores groups that are not lists
    and terms whose ``taxonomy`` is not ``"category"``.

    Returns:
        One dict per category with the string-valued keys ``id``, ``name``
        and ``slug``; missing fields become ``""``.
    """
    embedded = post.get("_embedded") or {}
    term_groups = embedded.get("wp:term") or []
    all_terms = (
        term
        for group in term_groups
        if isinstance(group, list)
        for term in group
    )
    return [
        {
            "id": str(term.get("id", "")),
            "name": str(term.get("name", "")),
            "slug": str(term.get("slug", "")),
        }
        for term in all_terms
        if term.get("taxonomy") == "category"
    ]
def detect_facility_slugs(html: str) -> list[str]:
    """Find golf-facility slugs referenced by internal links in *html*.

    Two sources are combined:

    1. Links matching ``INTERNAL_GOLF_COURSE_PATTERN``; the captured slug is
       taken as is.
    2. Any other teeoff.no link whose first path segment is not listed in
       ``DISALLOWED_INTERNAL_SEGMENTS`` and whose last path segment contains
       ``"golf"``.

    Args:
        html: Article markup; empty or ``None`` yields an empty list.

    Returns:
        The slugs in order of first appearance, without duplicates.
    """
    if not html:
        return []

    found = INTERNAL_GOLF_COURSE_PATTERN.findall(html)
    for raw_path in INTERNAL_TEEOFF_LINK_PATTERN.findall(html):
        segments = [segment for segment in raw_path.split("/") if segment]
        if not segments or segments[0] in DISALLOWED_INTERNAL_SEGMENTS:
            continue
        candidate = segments[-1]
        # A link to the /golfbaner/ listing page itself contains "golf" but
        # names no facility; without this check it would be reported as one.
        if candidate.lower() == "golfbaner":
            continue
        if "golf" not in candidate:
            continue
        found.append(candidate)

    # dict preserves insertion order, so this de-duplicates while keeping the
    # first occurrence of every slug in place.
    return list(dict.fromkeys(found))
def collect_inline_image_urls(html: str) -> list[str]:
    """Return the ``src`` values of ``<img>`` tags that start with ``http``.

    Duplicates are removed; the order of first appearance is kept. Empty or
    ``None`` input yields an empty list.
    """
    sources = IMG_SRC_PATTERN.findall(html or "")
    absolute_sources = (source for source in sources if source.startswith("http"))
    return list(dict.fromkeys(absolute_sources))
def rewrite_html_media(
    html: str,
    post_slug: str,
    target_dir: Path,
    featured_url: str | None = None,
) -> tuple[str, list[str], str | None]:
    """Download a post's images and point its HTML at the local copies.

    The featured image (if any) is stored as ``<post_slug>-featured`` and the
    inline images as ``<post_slug>-inline-NN``, numbered by their position
    among the inline image URLs of the document.

    Args:
        html: Rendered article markup.
        post_slug: Prefix for the local file names.
        target_dir: Directory the images are written to.
        featured_url: Remote URL of the featured image, if the post has one.

    Returns:
        A tuple ``(rewritten_html, downloaded_urls, local_featured)``: the
        markup with every successfully downloaded URL replaced by its local
        path, the list of local paths, and the local path of the featured
        image (``None`` when there is none or its download failed).
    """
    downloaded_urls: list[str] = []
    rewrite_map: dict[str, str] = {}

    local_featured: str | None = None
    if featured_url:
        local_featured = download_file(featured_url, target_dir, f"{post_slug}-featured")
        if local_featured:
            downloaded_urls.append(local_featured)
            rewrite_map[featured_url] = local_featured

    # The index advances for every inline URL, including skipped ones, so
    # file names stay stable across runs and match earlier downloads.
    for image_index, url in enumerate(collect_inline_image_urls(html), start=1):
        if url in rewrite_map:
            # Same image as the featured one: reuse that copy rather than
            # downloading it a second time under another name.
            continue
        local_path = download_file(url, target_dir, f"{post_slug}-inline-{image_index:02d}")
        if not local_path:
            continue
        rewrite_map[url] = local_path
        downloaded_urls.append(local_path)

    rewritten_html = html or ""
    # Replace longer URLs first: if one mapped URL is a prefix of another
    # (e.g. the same file with and without a query string), replacing the
    # shorter one first would corrupt the longer one.
    for original in sorted(rewrite_map, key=len, reverse=True):
        rewritten_html = rewritten_html.replace(original, rewrite_map[original])
    return rewritten_html, downloaded_urls, local_featured
def normalize_post(
    post: dict[str, Any],
    category_filter: str | None,
    download_media: bool,
    media_dir: Path,
) -> dict[str, Any] | None:
    """Convert one raw API post into the record written to the output file.

    Args:
        post: Post object as returned by the API, including ``_embedded``.
        category_filter: When set, posts without this category slug are
            rejected.
        download_media: Whether images are downloaded and the HTML rewritten
            to reference the local copies.
        media_dir: Directory for downloaded images.

    Returns:
        The normalized record, or ``None`` when the post does not carry the
        requested category.
    """
    categories = extract_categories(post)
    category_slugs = [category["slug"] for category in categories if category.get("slug")]
    if category_filter and category_filter not in category_slugs:
        return None

    def rendered(field: str) -> str:
        # The API wraps these fields as {"rendered": "<html>"}.
        return str((post.get(field) or {}).get("rendered") or "")

    status = str(post.get("status") or "")
    title_html = rendered("title")
    excerpt_html = rendered("excerpt")
    content_html = rendered("content")

    embedded = post.get("_embedded") or {}
    author_entry = ((embedded.get("author") or [None])[0]) or {}
    featured_entry = ((embedded.get("wp:featuredmedia") or [None])[0]) or {}

    if featured_entry:
        featured_url = choose_media_url(featured_entry)
        featured_alt = str(featured_entry.get("alt_text") or "")
    else:
        featured_url = None
        featured_alt = ""
    caption_html = str((featured_entry.get("caption") or {}).get("rendered") or "")
    featured_caption = strip_tags(caption_html)

    # Without downloads the remote featured URL is used unchanged.
    downloaded_media: list[str] = []
    featured_image = featured_url
    if download_media:
        content_html, downloaded_media, local_featured = rewrite_html_media(
            content_html,
            str(post.get("slug") or "mening"),
            media_dir,
            featured_url,
        )
        featured_image = local_featured or featured_url

    # Facility links are detected on the (possibly rewritten) content.
    facility_slugs = detect_facility_slugs(content_html)
    primary_facility = facility_slugs[0] if facility_slugs else None

    featured_block: dict[str, Any] | None = None
    if featured_url or featured_image:
        featured_block = {
            "url": featured_image,
            "originalUrl": featured_url,
            "alt": featured_alt,
            "caption": featured_caption,
        }

    author_block = {
        "id": author_entry.get("id"),
        "name": author_entry.get("name"),
        "slug": author_entry.get("slug"),
        "link": author_entry.get("link"),
    }

    return {
        "id": post.get("id"),
        "slug": post.get("slug"),
        "status": status,
        "type": post.get("type"),
        "link": post.get("link"),
        "title": strip_tags(title_html),
        "titleHtml": title_html,
        "excerpt": strip_tags(excerpt_html),
        "excerptHtml": excerpt_html,
        "contentHtml": content_html,
        "publishedAt": post.get("date"),
        "updatedAt": post.get("modified"),
        "author": author_block,
        "featuredImage": featured_block,
        "inlineMedia": downloaded_media,
        "categories": categories,
        "categorySlugs": category_slugs,
        "facilitySlugs": facility_slugs,
        "primaryFacilitySlug": primary_facility,
    }
def main() -> None:
    """Run the import: fetch posts, normalize them and write the JSON file.

    Posts whose status is not ``publish`` are skipped unless ``--draft`` is
    given, and posts rejected by the category filter are dropped. ``--limit``
    caps the number of articles that are written, counted after filtering.
    """
    args = parse_args()
    output_path = Path(args.output)
    media_dir = Path(args.media_dir)

    print("🚀 Starter import av Meninger fra WordPress...")
    # --limit is documented as applying after filtering. Capping the fetch
    # itself would let the status/category filters shrink the result below
    # the requested number (or to nothing), so the cap is enforced in the
    # loop below instead.
    posts = fetch_all_posts(args.per_page)
    print(f"📦 Hentet {len(posts)} artikler fra {WP_API_BASE}")

    normalized_posts: list[dict[str, Any]] = []
    for post in posts:
        # Checked before normalizing so that no media is downloaded for
        # posts that would not be written anyway.
        if args.limit is not None and len(normalized_posts) >= args.limit:
            break
        if not args.draft and str(post.get("status") or "") != "publish":
            continue
        normalized = normalize_post(
            post,
            category_filter=args.category,
            download_media=args.download_media,
            media_dir=media_dir,
        )
        if normalized is None:
            continue
        normalized_posts.append(normalized)

    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(
        json.dumps(normalized_posts, ensure_ascii=False, indent=2) + "\n",
        encoding="utf-8",
    )

    print(f"✅ Skrev {len(normalized_posts)} artikler til {output_path}")
    if args.download_media:
        print(f"🖼️ Bilder ble lagret under {media_dir}")
    if args.category:
        print(f"🏷️ Kategorifilter brukt: {args.category}")
    linked_count = sum(1 for post in normalized_posts if post.get("primaryFacilitySlug"))
    print(f"{linked_count} artikler fikk koblet minst én golfbane-slug fra internlenker.")


if __name__ == "__main__":
    main()