Nye-TeeOff/backend/scrape_golfpakker.py

377 lines
14 KiB
Python
Raw Normal View History

2026-04-15 08:15:53 +02:00
"""
TEE OFF - GOLFPAKKE-SKRAPER MED GEMINI AI
---------------------------------------------------------------------------
Starter på klubbens nettside, følger relevante interne lenker om golfpakker/
opphold/hotell, og lagrer AI-forslag som utkast.
---------------------------------------------------------------------------
"""
import argparse
import asyncio
import json
import os
from urllib.parse import urljoin, urlparse
import asyncpg
import google.generativeai as genai
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from env_config import get_database_url
2026-04-15 08:15:53 +02:00
from playwright.async_api import async_playwright
2026-05-04 15:30:29 +02:00
from scrape_utils import (
ProgressCallback,
emit_progress,
exclude_discontinued_facilities_clause,
make_progress_event,
parse_llm_json,
)
2026-04-15 08:15:53 +02:00
# Load variables from the .env file before any configuration is read.
load_dotenv()
DB_URL = get_database_url()

# The Gemini client is configured once at import time; a missing key is a
# hard error so the scraper never starts half-configured.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise ValueError("🚨 GEMINI_API_KEY mangler i .env filen!")

genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel("gemini-2.5-flash")

# Lower-case substrings that mark a link as a likely golf-package or
# accommodation page (Norwegian and English spellings).
PACKAGE_LINK_HINTS = (
    # Norwegian: packages
    "golfpakke",
    "golfpakker",
    "pakke",
    "pakker",
    # Norwegian: stay / lodging
    "opphold",
    "overnatting",
    # Hotel / resort, both languages
    "hotel",
    "hotell",
    "resort",
    # English: lodging
    "accommodation",
    "stay",
)
def _extract_text(html_content: str) -> str:
    """Return the text of an HTML document as one space-separated string.

    ``script``, ``style``, ``nav``, ``footer`` and ``header`` elements are
    removed from the parsed tree before the remaining text is collected.
    """
    document = BeautifulSoup(html_content, "html.parser")
    unwanted_tags = ["script", "style", "nav", "footer", "header"]
    for element in document.find_all(unwanted_tags):
        element.extract()
    return document.get_text(separator=" ", strip=True)
def _extract_candidate_links(html_content: str, base_url: str, max_links: int = 6) -> list[str]:
    """Collect same-host links that look like golf-package or lodging pages.

    Args:
        html_content: Raw HTML of the page the links are taken from.
        base_url: URL of that page; used to resolve relative links and to
            restrict results to the same host.
        max_links: Upper bound on the number of links returned (default 6).

    Returns:
        Absolute URLs in document order, without trailing slash and without
        duplicates. A link qualifies when its path/query/fragment or its
        anchor text contains one of ``PACKAGE_LINK_HINTS``.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    base_host = urlparse(base_url).netloc.lower()
    limit = max(max_links, 0)
    candidates: list[str] = []
    seen: set[str] = set()
    for anchor in soup.find_all("a", href=True):
        if len(candidates) >= limit:
            break
        href = str(anchor.get("href") or "").strip()
        if not href or href.startswith(("#", "mailto:", "tel:", "javascript:")):
            continue
        absolute_url = urljoin(base_url, href)
        parsed = urlparse(absolute_url)
        if parsed.scheme not in {"http", "https"}:
            continue
        if parsed.netloc.lower() != base_host:
            continue
        # Match hints against everything except scheme and host. Every
        # accepted link shares the host, so a host such as "xyzresort.no"
        # would otherwise make every internal link look relevant.
        link_text = anchor.get_text(" ", strip=True)
        haystack = (
            f"{parsed.path} {parsed.params} {parsed.query} {parsed.fragment} {link_text}"
        ).lower()
        if not any(hint in haystack for hint in PACKAGE_LINK_HINTS):
            continue
        # An in-page anchor (page#section) points at the same document as
        # the bare page, so drop it to avoid fetching the page twice.
        # Fragments that look like client-side routes (#/..., #!...) are kept.
        if not parsed.fragment.startswith(("/", "!")):
            absolute_url = absolute_url.split("#", 1)[0]
        normalized_url = absolute_url.rstrip("/")
        if normalized_url in seen:
            continue
        seen.add(normalized_url)
        candidates.append(normalized_url)
    return candidates
async def fetch_page_data(url: str, browser) -> tuple[str, str]:
    """Load ``url`` in a new browser page and return ``(text, html)``.

    Args:
        url: Address to load; surrounding whitespace is ignored.
        browser: Object providing an awaitable ``new_page()`` (a Playwright
            browser in this module).

    Returns:
        The extracted page text and the raw HTML. Both are empty strings
        when ``url`` does not start with ``"http"`` or when loading or text
        extraction fails; the failure is printed rather than raised.
    """
    target = url.strip()
    if not target.startswith("http"):
        return "", ""

    print(f" 🌐 Laster inn: {target}")
    result = ("", "")
    tab = await browser.new_page()
    try:
        await tab.goto(target, wait_until="domcontentloaded", timeout=20000)
        markup = await tab.content()
        result = (_extract_text(markup), markup)
    except Exception as exc:
        print(f" ❌ Feil ved lasting av {target}: {exc}")
    finally:
        # The page is always closed, whether or not the load succeeded.
        await tab.close()
    return result
async def collect_package_source_text(urls: list[str], browser) -> str:
    """Fetch each source URL plus its relevant internal links and join the text.

    Args:
        urls: Source page URLs to start from.
        browser: Browser object handed through to ``fetch_page_data``.

    Returns:
        The text of every page that yielded text, each section prefixed with
        a ``--- TEKST FRA SIDE (<url>) ---`` marker line, with sections
        separated by blank lines. Empty string when nothing was collected.
    """
    combined_sections: list[str] = []
    visited_urls: set[str] = set()
    for source_url in urls:
        # Register the source before fetching so that a duplicate entry, or a
        # source already crawled as a candidate of an earlier source, is not
        # fetched (and its text not included) a second time.
        normalized_source = source_url.strip().rstrip("/")
        if normalized_source in visited_urls:
            continue
        visited_urls.add(normalized_source)

        page_text, html_content = await fetch_page_data(source_url, browser)
        if page_text:
            combined_sections.append(f"--- TEKST FRA SIDE ({source_url}) ---\n{page_text}")
        if not html_content:
            continue
        for candidate_url in _extract_candidate_links(html_content, source_url):
            normalized_candidate = candidate_url.rstrip("/")
            if normalized_candidate in visited_urls:
                continue
            visited_urls.add(normalized_candidate)
            candidate_text, _ = await fetch_page_data(candidate_url, browser)
            if candidate_text:
                combined_sections.append(f"--- TEKST FRA SIDE ({candidate_url}) ---\n{candidate_text}")
    return "\n\n".join(combined_sections)
def analyze_golfpakker_with_gemini(text: str, club_name: str) -> dict | None:
    """Ask the Gemini model to extract golf packages from scraped page text.

    Args:
        text: Combined page text that is embedded verbatim in the prompt.
        club_name: Club name that is embedded verbatim in the prompt.

    Returns:
        The value produced by ``parse_llm_json`` when it is a ``dict``;
        ``None`` when it is anything else or when any exception is raised
        while calling the model or parsing its reply (the error is printed).
    """
    print(f" 🧠 Sender {len(text)} tegn til Gemini for golfpakke-analyse...")
    # The prompt is runtime text sent to the model. It asks for a JSON object
    # with the keys "foreslatt_golfpakker" (list of packages with
    # navn/pris/beskrivelse/lenke) and "ai_begrunnelse".
    # Doubled braces ({{ }}) are literal braces inside the f-string.
    prompt = f"""
Du er en ekspert norske golfklubber og golfpakker.
Din oppgave er å lese tekster hentet fra nettsidene til "{club_name}" og identifisere eventuelle golfpakker, oppholdspakker eller overnattingstilbud som er relevante for greenfeespillere.
REGLER:
- Trekk bare ut faktiske golfpakker/oppholdspakker. Ikke vanlige greenfeepriser, medlemskap eller bedriftspakker.
- For hver pakke skal du hente ut:
1. navn
2. pris hvis den er eksplisitt oppgitt
3. en kort oppsummering 1-3 setninger om hva pakken går ut
4. lenke til siden der pakken presenteres
- Hvis flere pakker beskrives samme side, kan de bruke samme lenke.
- Hvis pris ikke finnes eksplisitt, sett den til null.
- Hvis du ikke finner noen golfpakker, returner en tom liste.
- Bruk URL-ene som står i markørene `--- TEKST FRA SIDE (...) ---` når du fyller inn lenke.
TEKST FRA NETTSIDENE:
{text}
Returner KUN gyldig JSON med denne strukturen:
{{
"foreslatt_golfpakker": [
{{
"navn": "Golfpakke med hotell",
"pris": 2490,
"beskrivelse": "Én natt på hotell, frokost og greenfee for to personer. Pakken gjelder i utvalgte perioder gjennom sesongen.",
"lenke": "https://eksempel.no/golfpakke"
}}
],
"ai_begrunnelse": "Kort forklaring på hvilke sider og signaler du brukte."
}}
"""
    try:
        # Synchronous call on the module-level Gemini model.
        response = model.generate_content(prompt)
        # parse_llm_json comes from scrape_utils; presumably it tolerates
        # code fences around the JSON — TODO confirm against that helper.
        parsed = parse_llm_json(response.text)
        # Anything other than a JSON object is treated as "no result".
        return parsed if isinstance(parsed, dict) else None
    except Exception as exc:
        # Best effort: a failed analysis is reported and signalled with None.
        print(f" ❌ AI-analyse feilet: {exc}")
        return None
async def _emit_facility_result(
    progress_callback: ProgressCallback | None,
    *,
    fac_id,
    name,
    outcome: str,
    message: str,
    index: int,
    total: int,
    saved: int,
    failed: int,
    skipped: int,
) -> None:
    """Emit the closing progress event for one facility with the running totals."""
    await emit_progress(
        progress_callback,
        progress_completed=index,
        progress_ok=saved,
        progress_failed=failed,
        progress_skipped=skipped,
        current_facility_id=fac_id,
        current_facility_name=name,
        event=make_progress_event(
            facility_id=fac_id,
            facility_name=name,
            outcome=outcome,
            message=message,
            processed=index,
            total=total,
        ),
    )


async def run_golfpakker_scraper(facility_ids=None, progress_callback: ProgressCallback | None = None):
    """Scrape golf-package pages per facility and store the AI draft in the database.

    For each selected facility the source URL(s) are crawled, the combined
    text (capped at 30000 characters) is analysed by Gemini, and the result
    is written as JSON to ``facilities.golfpakker_draft``. Progress is
    reported through ``emit_progress`` after every facility.

    Args:
        facility_ids: Optional iterable of integer facility IDs; when empty
            or ``None`` every facility with a usable URL is processed.
        progress_callback: Optional callback handed to ``emit_progress``.

    Returns:
        A dict with the keys ``processed_facilities``, ``analyzed_facilities``,
        ``saved_drafts``, ``skipped_facilities`` and ``failed_facilities``.

    Raises:
        ValueError: If an entry of ``facility_ids`` cannot be converted to int.
    """
    print("🚀 Starter golfpakke-skraperen...")
    conn = await asyncpg.connect(DB_URL)
    facilities = []
    analyzed_count = 0
    saved_count = 0
    skipped_count = 0
    failed_count = 0
    try:
        query = """
            SELECT
                id,
                name,
                website_url,
                golfpakker_url,
                COALESCE(NULLIF(TRIM(golfpakker_url), ''), NULLIF(TRIM(website_url), '')) AS source_url
            FROM facilities
            WHERE COALESCE(NULLIF(TRIM(golfpakker_url), ''), NULLIF(TRIM(website_url), '')) IS NOT NULL
        """
        query += exclude_discontinued_facilities_clause("facilities")
        query_args: list = []
        if facility_ids:
            # Bind the IDs as one array parameter instead of formatting them
            # into the SQL text.
            query += " AND id = ANY($1::bigint[])"
            query_args.append([int(facility_id) for facility_id in facility_ids])
        facilities = await conn.fetch(query, *query_args)
        total_facilities = len(facilities)
        print(f"📋 Fant {total_facilities} anlegg å skrape.")
        await emit_progress(
            progress_callback,
            progress_total=total_facilities,
            progress_completed=0,
            progress_ok=0,
            progress_failed=0,
            progress_skipped=0,
            event=make_progress_event(
                facility_id=None,
                facility_name="Golfpakker",
                outcome="info",
                message=f"Starter golfpakkeskraping for {total_facilities} anlegg.",
                processed=0,
                total=total_facilities,
            ),
        )
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                for index, facility in enumerate(facilities, start=1):
                    fac_id = facility["id"]
                    name = facility["name"]
                    urls_raw = facility["source_url"]
                    # Fields shared by every closing event of this facility.
                    common = {
                        "fac_id": fac_id,
                        "name": name,
                        "index": index,
                        "total": total_facilities,
                    }
                    print(f"\n▶️ Behandler golfpakker for: {name} (ID: {fac_id})")
                    await emit_progress(
                        progress_callback,
                        current_facility_id=fac_id,
                        current_facility_name=name,
                        event=make_progress_event(
                            facility_id=fac_id,
                            facility_name=name,
                            outcome="info",
                            message="Starter henting av golfpakke-kilde med fallback til nettside.",
                            processed=index - 1,
                            total=total_facilities,
                        ),
                    )
                    # The source column may hold several comma-separated URLs.
                    urls = [url.strip() for url in str(urls_raw or "").split(",") if url.strip()]
                    try:
                        combined_text = await collect_package_source_text(urls, browser)
                        if len(combined_text) < 50:
                            print(" ⚠️ Fant for lite tekst, hopper over.")
                            skipped_count += 1
                            await _emit_facility_result(
                                progress_callback,
                                outcome="warning",
                                message="Hoppet over fordi det ble funnet for lite relevant tekst.",
                                saved=saved_count,
                                failed=failed_count,
                                skipped=skipped_count,
                                **common,
                            )
                            continue
                        # The Gemini client call is synchronous; run it in a
                        # worker thread so the event loop stays responsive.
                        draft_data = await asyncio.to_thread(
                            analyze_golfpakker_with_gemini, combined_text[:30000], name
                        )
                        if not draft_data:
                            failed_count += 1
                            await _emit_facility_result(
                                progress_callback,
                                outcome="error",
                                message="AI-analysen ga ikke et gyldig golfpakkeutkast.",
                                saved=saved_count,
                                failed=failed_count,
                                skipped=skipped_count,
                                **common,
                            )
                            continue
                        analyzed_count += 1
                        # The model may return null or a non-list here; count
                        # that as zero packages instead of raising.
                        packages = draft_data.get("foreslatt_golfpakker")
                        found_packages = len(packages) if isinstance(packages, (list, tuple)) else 0
                        print(f" ✅ AI fant {found_packages} golfpakker.")
                        await conn.execute(
                            """
                            UPDATE facilities
                            SET golfpakker_draft = $1::jsonb
                            WHERE id = $2
                            """,
                            json.dumps(draft_data),
                            fac_id,
                        )
                        print(" 💾 Golfpakke-utkast lagret i databasen!")
                        saved_count += 1
                        await _emit_facility_result(
                            progress_callback,
                            outcome="success",
                            message=f"Utkast lagret med {found_packages} golfpakker.",
                            saved=saved_count,
                            failed=failed_count,
                            skipped=skipped_count,
                            **common,
                        )
                    except Exception as exc:
                        # One failing facility must not stop the whole run.
                        failed_count += 1
                        print(f" ❌ Uventet feil for {name}: {exc}")
                        # Some exceptions have an empty message; fall back to
                        # the type name so building the event cannot raise.
                        detail_lines = str(exc).splitlines()
                        first_line = detail_lines[0] if detail_lines else type(exc).__name__
                        await _emit_facility_result(
                            progress_callback,
                            outcome="error",
                            message=f"Feilet under behandling: {first_line}",
                            saved=saved_count,
                            failed=failed_count,
                            skipped=skipped_count,
                            **common,
                        )
            finally:
                # Close the browser even when the loop is left by an exception.
                await browser.close()
    finally:
        await conn.close()
        print("\n🏁 Golfpakkeskraping fullført.")
    return {
        "processed_facilities": len(facilities),
        "analyzed_facilities": analyzed_count,
        "saved_drafts": saved_count,
        "skipped_facilities": skipped_count,
        "failed_facilities": failed_count,
    }
if __name__ == "__main__":
    # Command-line entry point: optionally restrict the run to given IDs.
    cli = argparse.ArgumentParser(description="Skrap golfpakker via AI.")
    cli.add_argument("--ids", type=str, help="Kommaseparert liste med anleggs-IDer", default=None)
    options = cli.parse_args()
    # No --ids (or an empty value) means "all facilities" and is passed as None.
    selected_ids = (
        [int(token.strip()) for token in options.ids.split(",") if token.strip()]
        if options.ids
        else None
    )
    asyncio.run(run_golfpakker_scraper(selected_ids))