""" TEE OFF - GOLFPAKKE-SKRAPER MED GEMINI AI --------------------------------------------------------------------------- Starter på klubbens nettside, følger relevante interne lenker om golfpakker/ opphold/hotell, og lagrer AI-forslag som utkast. --------------------------------------------------------------------------- """ import argparse import asyncio import json import os from urllib.parse import urljoin, urlparse import asyncpg import google.generativeai as genai from bs4 import BeautifulSoup from dotenv import load_dotenv from env_config import get_database_url from playwright.async_api import async_playwright from scrape_utils import ( ProgressCallback, emit_progress, exclude_discontinued_facilities_clause, make_progress_event, parse_llm_json, ) load_dotenv() DB_URL = get_database_url() GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") if not GEMINI_API_KEY: raise ValueError("🚨 GEMINI_API_KEY mangler i .env filen!") genai.configure(api_key=GEMINI_API_KEY) model = genai.GenerativeModel("gemini-2.5-flash") PACKAGE_LINK_HINTS = ( "golfpakke", "golfpakker", "pakke", "pakker", "opphold", "overnatting", "hotel", "hotell", "resort", "accommodation", "stay", ) def _extract_text(html_content: str) -> str: soup = BeautifulSoup(html_content, "html.parser") for tag in soup(["script", "style", "nav", "footer", "header"]): tag.extract() return soup.get_text(separator=" ", strip=True) def _extract_candidate_links(html_content: str, base_url: str) -> list[str]: soup = BeautifulSoup(html_content, "html.parser") base_host = urlparse(base_url).netloc.lower() candidates: list[str] = [] seen: set[str] = set() for anchor in soup.find_all("a", href=True): href = str(anchor.get("href") or "").strip() if not href or href.startswith("#") or href.startswith("mailto:") or href.startswith("tel:") or href.startswith("javascript:"): continue absolute_url = urljoin(base_url, href) parsed = urlparse(absolute_url) if parsed.scheme not in {"http", "https"}: continue if parsed.netloc.lower() != base_host: continue haystack = f"{absolute_url} {anchor.get_text(' ', strip=True)}".lower() if not any(hint in haystack for hint in PACKAGE_LINK_HINTS): continue normalized_url = absolute_url.rstrip("/") if normalized_url in seen: continue seen.add(normalized_url) candidates.append(normalized_url) return candidates[:6] async def fetch_page_data(url: str, browser) -> tuple[str, str]: url = url.strip() if not url.startswith("http"): return "", "" print(f" 🌐 Laster inn: {url}") page = await browser.new_page() try: await page.goto(url, wait_until="domcontentloaded", timeout=20000) html_content = await page.content() return _extract_text(html_content), html_content except Exception as exc: print(f" ❌ Feil ved lasting av {url}: {exc}") return "", "" finally: await page.close() async def collect_package_source_text(urls: list[str], browser) -> str: combined_sections: list[str] = [] visited_urls: set[str] = set() for source_url in urls: page_text, html_content = await fetch_page_data(source_url, browser) if page_text: combined_sections.append(f"--- TEKST FRA SIDE ({source_url}) ---\n{page_text}") visited_urls.add(source_url.rstrip("/")) if not html_content: continue for candidate_url in _extract_candidate_links(html_content, source_url): normalized_candidate = candidate_url.rstrip("/") if normalized_candidate in visited_urls: continue visited_urls.add(normalized_candidate) candidate_text, _ = await fetch_page_data(candidate_url, browser) if candidate_text: combined_sections.append(f"--- TEKST FRA SIDE ({candidate_url}) ---\n{candidate_text}") return "\n\n".join(combined_sections) def analyze_golfpakker_with_gemini(text: str, club_name: str) -> dict | None: print(f" 🧠 Sender {len(text)} tegn til Gemini for golfpakke-analyse...") prompt = f""" Du er en ekspert på norske golfklubber og golfpakker. Din oppgave er å lese tekster hentet fra nettsidene til "{club_name}" og identifisere eventuelle golfpakker, oppholdspakker eller overnattingstilbud som er relevante for greenfeespillere. REGLER: - Trekk bare ut faktiske golfpakker/oppholdspakker. Ikke vanlige greenfeepriser, medlemskap eller bedriftspakker. - For hver pakke skal du hente ut: 1. navn 2. pris hvis den er eksplisitt oppgitt 3. en kort oppsummering på 1-3 setninger om hva pakken går ut på 4. lenke til siden der pakken presenteres - Hvis flere pakker beskrives på samme side, kan de bruke samme lenke. - Hvis pris ikke finnes eksplisitt, sett den til null. - Hvis du ikke finner noen golfpakker, returner en tom liste. - Bruk URL-ene som står i markørene `--- TEKST FRA SIDE (...) ---` når du fyller inn lenke. TEKST FRA NETTSIDENE: {text} Returner KUN gyldig JSON med denne strukturen: {{ "foreslatt_golfpakker": [ {{ "navn": "Golfpakke med hotell", "pris": 2490, "beskrivelse": "Én natt på hotell, frokost og greenfee for to personer. Pakken gjelder i utvalgte perioder gjennom sesongen.", "lenke": "https://eksempel.no/golfpakke" }} ], "ai_begrunnelse": "Kort forklaring på hvilke sider og signaler du brukte." }} """ try: response = model.generate_content(prompt) parsed = parse_llm_json(response.text) return parsed if isinstance(parsed, dict) else None except Exception as exc: print(f" ❌ AI-analyse feilet: {exc}") return None async def run_golfpakker_scraper(facility_ids=None, progress_callback: ProgressCallback | None = None): print("🚀 Starter golfpakke-skraperen...") conn = await asyncpg.connect(DB_URL) facilities = [] analyzed_count = 0 saved_count = 0 skipped_count = 0 failed_count = 0 try: query = """ SELECT id, name, website_url, golfpakker_url, COALESCE(NULLIF(TRIM(golfpakker_url), ''), NULLIF(TRIM(website_url), '')) AS source_url FROM facilities WHERE COALESCE(NULLIF(TRIM(golfpakker_url), ''), NULLIF(TRIM(website_url), '')) IS NOT NULL """ query += exclude_discontinued_facilities_clause("facilities") if facility_ids: query += f" AND id IN ({','.join(map(str, facility_ids))})" facilities = await conn.fetch(query) total_facilities = len(facilities) print(f"📋 Fant {total_facilities} anlegg å skrape.") await emit_progress( progress_callback, progress_total=total_facilities, progress_completed=0, progress_ok=0, progress_failed=0, progress_skipped=0, event=make_progress_event( facility_id=None, facility_name="Golfpakker", outcome="info", message=f"Starter golfpakkeskraping for {total_facilities} anlegg.", processed=0, total=total_facilities, ), ) async with async_playwright() as p: browser = await p.chromium.launch(headless=True) for index, facility in enumerate(facilities, start=1): fac_id = facility["id"] name = facility["name"] urls_raw = facility["source_url"] print(f"\n▶️ Behandler golfpakker for: {name} (ID: {fac_id})") await emit_progress( progress_callback, current_facility_id=fac_id, current_facility_name=name, event=make_progress_event( facility_id=fac_id, facility_name=name, outcome="info", message="Starter henting av golfpakke-kilde med fallback til nettside.", processed=index - 1, total=total_facilities, ), ) urls = [url.strip() for url in str(urls_raw or "").split(",") if url.strip()] try: combined_text = await collect_package_source_text(urls, browser) if len(combined_text) < 50: print(" ⚠️ Fant for lite tekst, hopper over.") skipped_count += 1 await emit_progress( progress_callback, progress_completed=index, progress_ok=saved_count, progress_failed=failed_count, progress_skipped=skipped_count, current_facility_id=fac_id, current_facility_name=name, event=make_progress_event( facility_id=fac_id, facility_name=name, outcome="warning", message="Hoppet over fordi det ble funnet for lite relevant tekst.", processed=index, total=total_facilities, ), ) continue draft_data = analyze_golfpakker_with_gemini(combined_text[:30000], name) if not draft_data: failed_count += 1 await emit_progress( progress_callback, progress_completed=index, progress_ok=saved_count, progress_failed=failed_count, progress_skipped=skipped_count, current_facility_id=fac_id, current_facility_name=name, event=make_progress_event( facility_id=fac_id, facility_name=name, outcome="error", message="AI-analysen ga ikke et gyldig golfpakkeutkast.", processed=index, total=total_facilities, ), ) continue analyzed_count += 1 found_packages = len(draft_data.get("foreslatt_golfpakker", [])) print(f" ✅ AI fant {found_packages} golfpakker.") await conn.execute( """ UPDATE facilities SET golfpakker_draft = $1::jsonb WHERE id = $2 """, json.dumps(draft_data), fac_id, ) print(" 💾 Golfpakke-utkast lagret i databasen!") saved_count += 1 await emit_progress( progress_callback, progress_completed=index, progress_ok=saved_count, progress_failed=failed_count, progress_skipped=skipped_count, current_facility_id=fac_id, current_facility_name=name, event=make_progress_event( facility_id=fac_id, facility_name=name, outcome="success", message=f"Utkast lagret med {found_packages} golfpakker.", processed=index, total=total_facilities, ), ) except Exception as exc: failed_count += 1 print(f" ❌ Uventet feil for {name}: {exc}") await emit_progress( progress_callback, progress_completed=index, progress_ok=saved_count, progress_failed=failed_count, progress_skipped=skipped_count, current_facility_id=fac_id, current_facility_name=name, event=make_progress_event( facility_id=fac_id, facility_name=name, outcome="error", message=f"Feilet under behandling: {str(exc).splitlines()[0]}", processed=index, total=total_facilities, ), ) await browser.close() finally: await conn.close() print("\n🏁 Golfpakkeskraping fullført.") return { "processed_facilities": len(facilities), "analyzed_facilities": analyzed_count, "saved_drafts": saved_count, "skipped_facilities": skipped_count, "failed_facilities": failed_count, } if __name__ == "__main__": parser = argparse.ArgumentParser(description="Skrap golfpakker via AI.") parser.add_argument("--ids", type=str, help="Kommaseparert liste med anleggs-IDer", default=None) args = parser.parse_args() facility_ids_list = None if args.ids: facility_ids_list = [int(id_str.strip()) for id_str in args.ids.split(",") if id_str.strip()] asyncio.run(run_golfpakker_scraper(facility_ids_list))