# Nye-TeeOff/backend/import_meninger.py
#
# 366 lines
# 12 KiB
# Python

import argparse
import json
import re
from html import unescape
from html.parser import HTMLParser
from pathlib import Path
from typing import Any
from urllib.parse import urlparse
import requests
# WordPress REST route the articles are fetched from.
WP_API_BASE = "https://teeoff.no/wp-json/wp/v2/meninger"
# User-Agent header sent with every outgoing HTTP request.
DEFAULT_USER_AGENT = "TeeOff-Meninger-Importer/1.0"
# Default destinations for the generated JSON file and the downloaded images.
DEFAULT_OUTPUT = Path("/opt/teeoff/frontend/src/content/importedMeninger.json")
DEFAULT_MEDIA_DIR = Path("/opt/teeoff/frontend/public/media/meninger")

# Captures the course slug from links of the form teeoff.no/golfbaner/<slug>.
# The slug class excludes quotes, angle brackets and whitespace so a match
# cannot run past the end of an href attribute (or swallow following markup)
# when the URL has no trailing slash.
INTERNAL_GOLF_COURSE_PATTERN = re.compile(
    r"https?://teeoff\.no/golfbaner/([^/?#\"'<>\s]+)/?",
    re.IGNORECASE,
)
# Captures the path of any teeoff.no link, stopping at the query string,
# fragment, attribute quote, tag boundary or whitespace.
INTERNAL_TEEOFF_LINK_PATTERN = re.compile(
    r"https?://teeoff\.no/([^\"'#?<>\s]+)",
    re.IGNORECASE,
)
# Captures the quoted src attribute value of <img> tags.
IMG_SRC_PATTERN = re.compile(
    r"<img\b[^>]*\bsrc=['\"]([^'\"]+)['\"]",
    re.IGNORECASE,
)

# Leading path segments of internal links that are never treated as
# golf-facility pages.
DISALLOWED_INTERNAL_SEGMENTS = {
    "wp-content",
    "wp-json",
    "meninger",
    "category",
    "author",
    "tag",
    "feed",
}
class TextExtractor(HTMLParser):
    """HTML parser that records text nodes and ignores all markup."""

    def __init__(self) -> None:
        super().__init__()
        # Raw text chunks in document order, exactly as the parser reports them.
        self.parts: list[str] = []

    def handle_data(self, data: str) -> None:
        """Store one non-empty chunk of text reported by the parser."""
        if not data:
            return
        self.parts.append(data)

    def get_text(self) -> str:
        """Return the collected chunks, trimmed and joined by single spaces."""
        trimmed = (chunk.strip() for chunk in self.parts)
        return " ".join(chunk for chunk in trimmed if chunk)
def parse_args() -> argparse.Namespace:
    """Build the command-line interface and parse the process arguments.

    Returns:
        The parsed namespace with ``output``, ``media_dir``, ``per_page``,
        ``category``, ``download_media``, ``draft`` and ``limit``.
    """
    cli = argparse.ArgumentParser(
        description="Importer Meninger-artikler fra gammel TeeOff WordPress-instans."
    )
    # One (flag, add_argument keyword arguments) entry per option, in the
    # order they should appear in --help.
    option_table: tuple[tuple[str, dict[str, Any]], ...] = (
        (
            "--output",
            {
                "default": str(DEFAULT_OUTPUT),
                "help": f"Sti til JSON-filen som skal skrives. Standard: {DEFAULT_OUTPUT}",
            },
        ),
        (
            "--media-dir",
            {
                "default": str(DEFAULT_MEDIA_DIR),
                "help": f"Mappe for nedlastede bilder. Standard: {DEFAULT_MEDIA_DIR}",
            },
        ),
        (
            "--per-page",
            {
                "type": int,
                "default": 100,
                "help": "Antall artikler per API-kall. Standard: 100",
            },
        ),
        (
            "--category",
            {
                "default": None,
                "help": "Filtrer på kategorislug, f.eks. 'banebesok'.",
            },
        ),
        (
            "--download-media",
            {
                "action": "store_true",
                "help": "Last ned featured media og inline-bilder lokalt og skriv om URL-er i HTML.",
            },
        ),
        (
            "--draft",
            {
                "action": "store_true",
                "help": "Ta med artikler som ikke er publisert dersom API-et returnerer dem.",
            },
        ),
        (
            "--limit",
            {
                "type": int,
                "default": None,
                "help": "Maks antall artikler som skal skrives etter filtrering.",
            },
        ),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
def fetch_json(url: str, params: dict[str, Any] | None = None) -> Any:
    """GET ``url`` and return the decoded JSON body.

    Args:
        url: Address to request.
        params: Optional query-string parameters.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    request_headers = {"User-Agent": DEFAULT_USER_AGENT}
    reply = requests.get(url, params=params, headers=request_headers, timeout=30)
    reply.raise_for_status()
    return reply.json()
def fetch_all_posts(per_page: int, limit: int | None = None) -> list[dict[str, Any]]:
    """Fetch posts from ``WP_API_BASE`` page by page, with embedded resources.

    Args:
        per_page: Number of posts requested per API call.
        limit: Optional cap; fetching stops once this many posts are held.

    Returns:
        The fetched posts, truncated to ``limit`` when one is given.

    Raises:
        requests.HTTPError: For any HTTP error except a 400 on a page after
            the first, which is treated as the end of the listing.
    """
    collected: list[dict[str, Any]] = []
    page_number = 0
    while True:
        page_number += 1
        query = {"per_page": per_page, "page": page_number, "_embed": "1"}
        try:
            batch = fetch_json(WP_API_BASE, params=query)
        except requests.HTTPError as error:
            # A 400 on a later page is how running past the last page shows up.
            past_last_page = (
                page_number > 1
                and error.response is not None
                and error.response.status_code == 400
            )
            if not past_last_page:
                raise
            break
        if not batch:
            break
        collected.extend(batch)
        if limit is not None and len(collected) >= limit:
            return collected[:limit]
    return collected
def strip_tags(value: str | None) -> str:
    """Return the visible text of an HTML fragment with the markup removed.

    Args:
        value: HTML fragment, or ``None``/empty.

    Returns:
        The text chunks of the fragment joined by single spaces; an empty
        string for empty input.
    """
    if not value:
        return ""
    parser = TextExtractor()
    # HTMLParser decodes character references itself (convert_charrefs is on
    # by default). Decoding with html.unescape() first would decode twice:
    # escaped markup such as "&lt;b&gt;" would become a real tag and vanish
    # from the text, and "&amp;copy;" would end up as "©".
    parser.feed(value)
    parser.close()
    return parser.get_text()
def ensure_directory(path: Path) -> None:
    """Create ``path`` and any missing parents; do nothing if it already exists."""
    path.mkdir(exist_ok=True, parents=True)
def choose_media_url(media_entry: dict[str, Any]) -> str | None:
media_details = media_entry.get("media_details") or {}
sizes = media_details.get("sizes") or {}
for key in ("full", "1536x1536", "large", "medium"):
candidate = sizes.get(key, {}).get("source_url")
if candidate:
return candidate
return media_entry.get("source_url")
def _public_url_for(target_path: Path) -> str:
    """Map a downloaded file to the site-relative URL written into the output."""
    try:
        relative = target_path.relative_to(Path("/opt/teeoff/frontend/public"))
    except ValueError:
        # The media directory is not under the default public root (for
        # example a custom --media-dir). Assumption to confirm: the nearest
        # ancestor named "public" is the web root. Without one, fall back to
        # the file path itself instead of aborting the import.
        parts = target_path.parts
        if "public" not in parts:
            return target_path.as_posix()
        web_root_index = len(parts) - 1 - parts[::-1].index("public")
        remainder = parts[web_root_index + 1:]
        if not remainder:
            return target_path.as_posix()
        relative = Path(*remainder)
    return "/" + relative.as_posix()


def download_file(url: str, target_dir: Path, basename: str) -> str | None:
    """Download ``url`` into ``target_dir`` unless the file is already there.

    Args:
        url: Remote address of the image.
        target_dir: Directory the file is stored in (created when missing).
        basename: File name without extension; the extension comes from the
            URL path and falls back to ``.jpg`` for unknown ones.

    Returns:
        The site-relative URL of the stored file, or ``None`` when the
        download failed. Callers treat a falsy result as "keep the original
        URL", so one broken image does not abort the whole import.
    """
    ensure_directory(target_dir)
    suffix = Path(urlparse(url).path).suffix.lower()
    if suffix not in {".jpg", ".jpeg", ".png", ".webp", ".gif", ".avif"}:
        suffix = ".jpg"
    target_path = target_dir / f"{basename}{suffix}"
    if not target_path.exists():
        try:
            response = requests.get(
                url,
                timeout=60,
                headers={"User-Agent": DEFAULT_USER_AGENT},
            )
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"⚠️ Kunne ikke laste ned {url}: {exc}")
            return None
        target_path.write_bytes(response.content)
    return _public_url_for(target_path)
def extract_categories(post: dict[str, Any]) -> list[dict[str, str]]:
    """Collect the ``category`` terms embedded in a post.

    Returns:
        One ``{"id", "name", "slug"}`` dict per category term, all values as
        strings, in the order the terms appear under ``_embedded["wp:term"]``.
    """
    term_groups = (post.get("_embedded") or {}).get("wp:term") or []
    # Flatten the term groups, silently skipping anything that is not a list.
    all_terms = [
        term
        for group in term_groups
        if isinstance(group, list)
        for term in group
    ]
    return [
        {field: str(term.get(field, "")) for field in ("id", "name", "slug")}
        for term in all_terms
        if term.get("taxonomy") == "category"
    ]
def detect_facility_slugs(html: str) -> list[str]:
    """Find golf-facility slugs referenced by internal links in ``html``.

    Two sources are combined, in this order:

    1. slugs captured by ``INTERNAL_GOLF_COURSE_PATTERN``
       (``/golfbaner/<slug>`` links);
    2. the last path segment of any other internal link whose first segment
       is not in ``DISALLOWED_INTERNAL_SEGMENTS`` and whose last segment
       contains ``"golf"``.

    Returns:
        The slugs without duplicates, in order of first appearance.
    """
    if not html:
        return []
    found = INTERNAL_GOLF_COURSE_PATTERN.findall(html)
    for raw_path in INTERNAL_TEEOFF_LINK_PATTERN.findall(html):
        # The link pattern already stops before "?" and "#", so only the
        # slashes need handling here.
        segments = [segment for segment in raw_path.split("/") if segment]
        if not segments or segments[0] in DISALLOWED_INTERNAL_SEGMENTS:
            continue
        candidate = segments[-1]
        # "golfbaner" is the course listing itself (/golfbaner/<slug> is a
        # course), so a link to the bare index must not count as a facility.
        if candidate == "golfbaner" or "golf" not in candidate:
            continue
        found.append(candidate)
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(found))
def collect_inline_image_urls(html: str) -> list[str]:
    """Return the distinct ``<img>`` sources in ``html`` that start with ``http``.

    Order of first appearance is preserved.
    """
    absolute_sources = (
        source
        for source in IMG_SRC_PATTERN.findall(html or "")
        if source.startswith("http")
    )
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(absolute_sources))
def rewrite_html_media(
    html: str,
    post_slug: str,
    target_dir: Path,
    featured_url: str | None = None,
) -> tuple[str, list[str], str | None]:
    """Download a post's images and point its HTML at the local copies.

    Args:
        html: Rendered post body.
        post_slug: Prefix for the stored file names.
        target_dir: Directory the images are downloaded into.
        featured_url: Optional featured-image URL, downloaded first.

    Returns:
        ``(rewritten_html, local_paths, local_featured)``: the HTML with
        every downloaded URL replaced by its local path, the local paths in
        download order, and the local featured-image path (``None`` when
        there is no featured image).
    """
    saved_paths: list[str] = []
    replacements: dict[str, str] = {}

    local_featured = None
    if featured_url:
        local_featured = download_file(featured_url, target_dir, f"{post_slug}-featured")
        if local_featured:
            saved_paths.append(local_featured)
            replacements[featured_url] = local_featured

    # Inline images are numbered by position, counting failed downloads too.
    for position, remote_url in enumerate(collect_inline_image_urls(html), start=1):
        stored = download_file(remote_url, target_dir, f"{post_slug}-inline-{position:02d}")
        if stored:
            replacements[remote_url] = stored
            saved_paths.append(stored)

    result = html or ""
    for remote_url, stored in replacements.items():
        result = result.replace(remote_url, stored)
    return result, saved_paths, local_featured
def normalize_post(
    post: dict[str, Any],
    category_filter: str | None,
    download_media: bool,
    media_dir: Path,
) -> dict[str, Any] | None:
    """Convert one WordPress post into the record written to the output file.

    Args:
        post: Post object as returned by the API, including ``_embedded``.
        category_filter: If set, posts without this category slug are dropped.
        download_media: Whether images are downloaded and the HTML rewritten.
        media_dir: Download directory used when ``download_media`` is true.

    Returns:
        The normalized record, or ``None`` when the category filter rejects
        the post.
    """
    categories = extract_categories(post)
    category_slugs = [item["slug"] for item in categories if item.get("slug")]
    if category_filter and category_filter not in category_slugs:
        return None

    def rendered(container: dict[str, Any], field: str) -> str:
        # Read container[field]["rendered"] as a string, "" when missing.
        return str((container.get(field) or {}).get("rendered") or "")

    title_html = rendered(post, "title")
    excerpt_html = rendered(post, "excerpt")
    body_html = rendered(post, "content")

    embedded = post.get("_embedded") or {}
    author = (embedded.get("author") or [None])[0] or {}
    media = (embedded.get("wp:featuredmedia") or [None])[0] or {}

    if media:
        remote_featured = choose_media_url(media)
        alt_text = str(media.get("alt_text") or "")
    else:
        remote_featured = None
        alt_text = ""
    caption_text = strip_tags(rendered(media, "caption"))

    stored_media: list[str] = []
    shown_featured = remote_featured
    if download_media:
        body_html, stored_media, local_copy = rewrite_html_media(
            body_html,
            str(post.get("slug") or "mening"),
            media_dir,
            remote_featured,
        )
        # Prefer the local copy; keep the remote URL if there is none.
        shown_featured = local_copy or remote_featured

    facility_slugs = detect_facility_slugs(body_html)

    featured_block = None
    if remote_featured or shown_featured:
        featured_block = {
            "url": shown_featured,
            "originalUrl": remote_featured,
            "alt": alt_text,
            "caption": caption_text,
        }

    return {
        "id": post.get("id"),
        "slug": post.get("slug"),
        "status": str(post.get("status") or ""),
        "type": post.get("type"),
        "link": post.get("link"),
        "title": strip_tags(title_html),
        "titleHtml": title_html,
        "excerpt": strip_tags(excerpt_html),
        "excerptHtml": excerpt_html,
        "contentHtml": body_html,
        "publishedAt": post.get("date"),
        "updatedAt": post.get("modified"),
        "author": {
            "id": author.get("id"),
            "name": author.get("name"),
            "slug": author.get("slug"),
            "link": author.get("link"),
        },
        "featuredImage": featured_block,
        "inlineMedia": stored_media,
        "categories": categories,
        "categorySlugs": category_slugs,
        "facilitySlugs": facility_slugs,
        "primaryFacilitySlug": facility_slugs[0] if facility_slugs else None,
    }
def main() -> None:
    """Run the import: fetch posts, filter and normalize them, write the JSON.

    ``--limit`` caps the number of articles written, so it is applied after
    the status and category filters rather than to the raw API result.
    """
    args = parse_args()
    output_path = Path(args.output)
    media_dir = Path(args.media_dir)
    print("🚀 Starter import av Meninger fra WordPress...")

    # Cutting the fetch short is only safe when no post can be filtered out
    # afterwards: --draft disables the status filter, and without --category
    # normalize_post never rejects a post.
    nothing_filtered = args.draft and not args.category
    fetch_limit = args.limit if nothing_filtered else None
    posts = fetch_all_posts(args.per_page, fetch_limit)
    print(f"📦 Hentet {len(posts)} artikler fra {WP_API_BASE}")

    normalized_posts: list[dict[str, Any]] = []
    for post in posts:
        # Check before normalizing so no media is downloaded for posts that
        # would be dropped by the limit anyway.
        if args.limit is not None and len(normalized_posts) >= args.limit:
            break
        if not args.draft and str(post.get("status") or "") != "publish":
            continue
        normalized = normalize_post(
            post,
            category_filter=args.category,
            download_media=args.download_media,
            media_dir=media_dir,
        )
        if normalized is None:
            continue
        normalized_posts.append(normalized)

    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(
        json.dumps(normalized_posts, ensure_ascii=False, indent=2) + "\n",
        encoding="utf-8",
    )
    print(f"✅ Skrev {len(normalized_posts)} artikler til {output_path}")
    if args.download_media:
        print(f"🖼️ Bilder ble lagret under {media_dir}")
    if args.category:
        print(f"🏷️ Kategorifilter brukt: {args.category}")
    linked_count = sum(1 for post in normalized_posts if post.get("primaryFacilitySlug"))
    print(f"{linked_count} artikler fikk koblet minst én golfbane-slug fra internlenker.")
# Run the import only when executed as a script, not when imported.
if __name__ == "__main__":
    main()