369 lines
14 KiB
Python
369 lines
14 KiB
Python
"""
|
|
TEE OFF - GOLFPAKKE-SKRAPER MED GEMINI AI
|
|
---------------------------------------------------------------------------
|
|
Starter på klubbens nettside, følger relevante interne lenker om golfpakker/
|
|
opphold/hotell, og lagrer AI-forslag som utkast.
|
|
---------------------------------------------------------------------------
|
|
"""
|
|
|
|
import argparse
|
|
import asyncio
|
|
import json
|
|
import os
|
|
from urllib.parse import urljoin, urlparse
|
|
|
|
import asyncpg
|
|
import google.generativeai as genai
|
|
from bs4 import BeautifulSoup
|
|
from dotenv import load_dotenv
|
|
from env_config import get_database_url
|
|
from playwright.async_api import async_playwright
|
|
|
|
from scrape_utils import ProgressCallback, emit_progress, make_progress_event, parse_llm_json
|
|
|
|
# Read a local .env file into the process environment before any
# configuration value below is looked up.
load_dotenv()

# Connection string passed to asyncpg.connect() by run_golfpakker_scraper().
# NOTE(review): get_database_url() presumably reads the environment too, which
# is why it runs after load_dotenv() - confirm in env_config.
DB_URL = get_database_url()
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

# Fail at import time: without a key no analysis can be performed, so the
# module refuses to load instead of failing later for every facility.
if not GEMINI_API_KEY:
    raise ValueError("🚨 GEMINI_API_KEY mangler i .env filen!")

genai.configure(api_key=GEMINI_API_KEY)
# Module-wide model handle used by analyze_golfpakker_with_gemini().
model = genai.GenerativeModel("gemini-2.5-flash")
|
|
|
|
# Lower-case keywords (Norwegian and English) that mark an internal link as
# possibly leading to golf package / stay / hotel content.
# _extract_candidate_links() tests each hint as a plain substring of the
# lower-cased link URL plus anchor text, so the shorter entries already cover
# the longer ones (e.g. "pakke" also matches "golfpakker").
PACKAGE_LINK_HINTS = (
    "golfpakke",
    "golfpakker",
    "pakke",
    "pakker",
    "opphold",
    "overnatting",
    "hotel",
    "hotell",
    "resort",
    "accommodation",
    "stay",
)
|
|
|
|
|
|
def _extract_text(html_content: str) -> str:
    """Return the text content of an HTML document as one string.

    ``script``, ``style``, ``nav``, ``footer`` and ``header`` elements are
    removed first so that page chrome and code do not end up in the text.
    The remaining text fragments are stripped and joined with single spaces.
    """
    document = BeautifulSoup(html_content, "html.parser")
    boilerplate_tags = ["script", "style", "nav", "footer", "header"]
    for element in document.find_all(boilerplate_tags):
        element.extract()
    return document.get_text(separator=" ", strip=True)
|
|
|
|
|
|
def _extract_candidate_links(html_content: str, base_url: str, max_links: int = 6) -> list[str]:
    """Find same-host links on a page that look like golf package pages.

    A link qualifies when it resolves to an http(s) URL on the same host as
    ``base_url`` and its URL or anchor text contains one of
    ``PACKAGE_LINK_HINTS`` (case-insensitive substring match).

    Args:
        html_content: Raw HTML of the page the links are taken from.
        base_url: URL of that page; used to resolve relative links and to
            decide which host counts as internal.
        max_links: Maximum number of links to return.

    Returns:
        Unique URLs in document order, without fragment and without trailing
        slash. The page itself is never returned.
    """
    if max_links <= 0:
        return []

    soup = BeautifulSoup(html_content, "html.parser")
    base_host = urlparse(base_url).netloc.lower()
    base_page = base_url.strip().split("#", 1)[0].rstrip("/")
    # Links that can never point at another page worth fetching.
    skipped_prefixes = ("#", "mailto:", "tel:", "javascript:")
    candidates: list[str] = []
    seen: set[str] = set()

    for anchor in soup.find_all("a", href=True):
        href = str(anchor.get("href") or "").strip()
        if not href or href.startswith(skipped_prefixes):
            continue

        absolute_url = urljoin(base_url, href)
        parsed = urlparse(absolute_url)
        if parsed.scheme not in {"http", "https"}:
            continue
        if parsed.netloc.lower() != base_host:
            continue

        haystack = f"{absolute_url} {anchor.get_text(' ', strip=True)}".lower()
        if not any(hint in haystack for hint in PACKAGE_LINK_HINTS):
            continue

        # The fragment only selects a position inside a page. Dropping it
        # keeps "/pakker#sommer" and "/pakker#host" from being treated (and
        # later fetched) as two different pages.
        normalized_url = absolute_url.split("#", 1)[0].rstrip("/")
        if normalized_url == base_page or normalized_url in seen:
            continue
        seen.add(normalized_url)
        candidates.append(normalized_url)

        if len(candidates) >= max_links:
            break

    return candidates
|
|
|
|
|
|
async def fetch_page_data(url: str, browser) -> tuple[str, str]:
    """Load a page in a new browser tab and return ``(text, html)``.

    Args:
        url: Address to load; surrounding whitespace is ignored.
        browser: Object providing ``new_page()`` (a Playwright browser in
            this module).

    Returns:
        The extracted text and the raw HTML of the page. Both are empty
        strings when the URL does not start with ``http`` or when loading or
        text extraction fails. The tab is always closed again.
    """
    target = url.strip()
    if not target.startswith("http"):
        return "", ""

    print(f" 🌐 Laster inn: {target}")
    tab = await browser.new_page()
    try:
        # Only wait for the DOM, not for every sub-resource, capped at 20 s.
        await tab.goto(target, wait_until="domcontentloaded", timeout=20000)
        raw_html = await tab.content()
        visible_text = _extract_text(raw_html)
    except Exception as exc:
        print(f" ❌ Feil ved lasting av {target}: {exc}")
        return "", ""
    else:
        return visible_text, raw_html
    finally:
        await tab.close()
|
|
|
|
|
|
async def collect_package_source_text(urls: list[str], browser) -> str:
    """Collect the text of the given pages and of their package-related links.

    Every source URL is fetched, then the candidate links found on it (see
    ``_extract_candidate_links``) are fetched one level deep. Each page's text
    is included at most once, even if it is listed twice in ``urls`` or is
    reachable from several source pages.

    Args:
        urls: Start pages to read.
        browser: Browser object handed through to ``fetch_page_data``.

    Returns:
        All collected page texts, each preceded by a
        ``--- TEKST FRA SIDE (<url>) ---`` marker line, separated by blank
        lines. Empty string when nothing could be read.
    """
    combined_sections: list[str] = []
    # Pages whose text has been collected (or attempted), without trailing slash.
    visited_urls: set[str] = set()
    # Source pages whose links have already been expanded.
    expanded_sources: set[str] = set()

    for source_url in urls:
        normalized_source = source_url.strip().rstrip("/")
        if normalized_source in expanded_sources:
            # Exact repeat of an earlier source: nothing new to learn.
            continue
        expanded_sources.add(normalized_source)

        page_text, html_content = await fetch_page_data(source_url, browser)
        # A source may already have been collected as a candidate of an
        # earlier source; its HTML is still needed for link discovery, but
        # its text must not be sent to the model twice.
        if page_text and normalized_source not in visited_urls:
            combined_sections.append(f"--- TEKST FRA SIDE ({source_url}) ---\n{page_text}")
        visited_urls.add(normalized_source)

        if not html_content:
            continue

        for candidate_url in _extract_candidate_links(html_content, source_url):
            normalized_candidate = candidate_url.rstrip("/")
            if normalized_candidate in visited_urls:
                continue
            visited_urls.add(normalized_candidate)

            candidate_text, _ = await fetch_page_data(candidate_url, browser)
            if candidate_text:
                combined_sections.append(f"--- TEKST FRA SIDE ({candidate_url}) ---\n{candidate_text}")

    return "\n\n".join(combined_sections)
|
|
|
|
|
|
def analyze_golfpakker_with_gemini(text: str, club_name: str) -> dict | None:
    """Ask the Gemini model to extract golf packages from scraped page text.

    The (Norwegian) prompt embeds ``club_name`` and ``text`` and asks for a
    JSON object with the keys ``foreslatt_golfpakker`` (list of packages with
    ``navn``, ``pris``, ``beskrivelse``, ``lenke``) and ``ai_begrunnelse``.

    This is a plain synchronous function: the call to the model blocks until
    the response is available.

    Args:
        text: Combined page text, including the
            ``--- TEKST FRA SIDE (...) ---`` markers the prompt refers to.
        club_name: Name of the club, inserted into the prompt.

    Returns:
        The parsed response when ``parse_llm_json`` yields a dict, otherwise
        ``None``. Any exception from the model call or from parsing is
        printed and also results in ``None``.
    """
    print(f" 🧠 Sender {len(text)} tegn til Gemini for golfpakke-analyse...")

    # Doubled braces ({{ }}) are literal braces in the f-string; only
    # {club_name} and {text} are substituted.
    # NOTE(review): whitespace inside the prompt is reproduced as visible in
    # the reviewed source - confirm against the repository copy.
    prompt = f"""
Du er en ekspert på norske golfklubber og golfpakker.
Din oppgave er å lese tekster hentet fra nettsidene til "{club_name}" og identifisere eventuelle golfpakker, oppholdspakker eller overnattingstilbud som er relevante for greenfeespillere.

REGLER:
- Trekk bare ut faktiske golfpakker/oppholdspakker. Ikke vanlige greenfeepriser, medlemskap eller bedriftspakker.
- For hver pakke skal du hente ut:
1. navn
2. pris hvis den er eksplisitt oppgitt
3. en kort oppsummering på 1-3 setninger om hva pakken går ut på
4. lenke til siden der pakken presenteres
- Hvis flere pakker beskrives på samme side, kan de bruke samme lenke.
- Hvis pris ikke finnes eksplisitt, sett den til null.
- Hvis du ikke finner noen golfpakker, returner en tom liste.
- Bruk URL-ene som står i markørene `--- TEKST FRA SIDE (...) ---` når du fyller inn lenke.

TEKST FRA NETTSIDENE:
{text}

Returner KUN gyldig JSON med denne strukturen:
{{
"foreslatt_golfpakker": [
{{
"navn": "Golfpakke med hotell",
"pris": 2490,
"beskrivelse": "Én natt på hotell, frokost og greenfee for to personer. Pakken gjelder i utvalgte perioder gjennom sesongen.",
"lenke": "https://eksempel.no/golfpakke"
}}
],
"ai_begrunnelse": "Kort forklaring på hvilke sider og signaler du brukte."
}}
"""

    try:
        response = model.generate_content(prompt)
        parsed = parse_llm_json(response.text)
        # Anything other than a JSON object (e.g. a bare list) is treated as
        # "no usable draft".
        return parsed if isinstance(parsed, dict) else None
    except Exception as exc:
        print(f" ❌ AI-analyse feilet: {exc}")
        return None
|
|
|
|
|
|
async def run_golfpakker_scraper(facility_ids=None, progress_callback: ProgressCallback | None = None):
    """Scrape golf package information for facilities and store AI drafts.

    For every facility that has a ``golfpakker_url`` or, as fallback, a
    ``website_url``, the source pages are read, analysed by Gemini and the
    resulting draft is written to ``facilities.golfpakker_draft``. Progress
    is reported through ``emit_progress`` after each facility.

    Args:
        facility_ids: Optional iterable of integer facility ids. When given
            and non-empty, only those facilities are processed.
        progress_callback: Optional callback handed to ``emit_progress``.

    Returns:
        A dict with the counters ``processed_facilities``,
        ``analyzed_facilities``, ``saved_drafts``, ``skipped_facilities`` and
        ``failed_facilities``.

    Raises:
        ValueError: If ``facility_ids`` contains a value that is not an
            integer.
    """
    print("🚀 Starter golfpakke-skraperen...")
    conn = await asyncpg.connect(DB_URL)
    facilities = []
    analyzed_count = 0
    saved_count = 0
    skipped_count = 0
    failed_count = 0
    total_facilities = 0

    async def _report_result(index: int, fac_id, name, outcome: str, message: str) -> None:
        """Emit the end-of-facility progress update with the current counters."""
        await emit_progress(
            progress_callback,
            progress_completed=index,
            progress_ok=saved_count,
            progress_failed=failed_count,
            progress_skipped=skipped_count,
            current_facility_id=fac_id,
            current_facility_name=name,
            event=make_progress_event(
                facility_id=fac_id,
                facility_name=name,
                outcome=outcome,
                message=message,
                processed=index,
                total=total_facilities,
            ),
        )

    try:
        # source_url prefers the dedicated package URL and falls back to the
        # general website; blank strings count as missing.
        query = """
            SELECT
                id,
                name,
                website_url,
                golfpakker_url,
                COALESCE(NULLIF(TRIM(golfpakker_url), ''), NULLIF(TRIM(website_url), '')) AS source_url
            FROM facilities
            WHERE COALESCE(NULLIF(TRIM(golfpakker_url), ''), NULLIF(TRIM(website_url), '')) IS NOT NULL
        """
        query_args = []
        if facility_ids:
            # Bind the ids as a parameter instead of formatting them into the
            # SQL text, so caller-supplied values can never alter the query.
            query += " AND id = ANY($1::bigint[])"
            query_args.append([int(facility_id) for facility_id in facility_ids])

        facilities = await conn.fetch(query, *query_args)
        total_facilities = len(facilities)
        print(f"📋 Fant {total_facilities} anlegg å skrape.")
        await emit_progress(
            progress_callback,
            progress_total=total_facilities,
            progress_completed=0,
            progress_ok=0,
            progress_failed=0,
            progress_skipped=0,
            event=make_progress_event(
                facility_id=None,
                facility_name="Golfpakker",
                outcome="info",
                message=f"Starter golfpakkeskraping for {total_facilities} anlegg.",
                processed=0,
                total=total_facilities,
            ),
        )

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                for index, facility in enumerate(facilities, start=1):
                    fac_id = facility["id"]
                    name = facility["name"]
                    urls_raw = facility["source_url"]

                    print(f"\n▶️ Behandler golfpakker for: {name} (ID: {fac_id})")
                    await emit_progress(
                        progress_callback,
                        current_facility_id=fac_id,
                        current_facility_name=name,
                        event=make_progress_event(
                            facility_id=fac_id,
                            facility_name=name,
                            outcome="info",
                            message="Starter henting av golfpakke-kilde med fallback til nettside.",
                            processed=index - 1,
                            total=total_facilities,
                        ),
                    )

                    # The column may hold several comma-separated URLs.
                    urls = [url.strip() for url in str(urls_raw or "").split(",") if url.strip()]
                    try:
                        combined_text = await collect_package_source_text(urls, browser)

                        if len(combined_text) < 50:
                            print(" ⚠️ Fant for lite tekst, hopper over.")
                            skipped_count += 1
                            await _report_result(
                                index,
                                fac_id,
                                name,
                                "warning",
                                "Hoppet over fordi det ble funnet for lite relevant tekst.",
                            )
                            continue

                        # The Gemini client call is synchronous; run it in a
                        # worker thread so the event loop (and anything else
                        # scheduled on it) is not blocked while waiting.
                        draft_data = await asyncio.to_thread(
                            analyze_golfpakker_with_gemini,
                            combined_text[:30000],
                            name,
                        )
                        if not draft_data:
                            failed_count += 1
                            await _report_result(
                                index,
                                fac_id,
                                name,
                                "error",
                                "AI-analysen ga ikke et gyldig golfpakkeutkast.",
                            )
                            continue

                        analyzed_count += 1
                        # "or []" also covers an explicit JSON null for the key.
                        found_packages = len(draft_data.get("foreslatt_golfpakker") or [])
                        print(f" ✅ AI fant {found_packages} golfpakker.")

                        await conn.execute(
                            """
                            UPDATE facilities
                            SET golfpakker_draft = $1::jsonb
                            WHERE id = $2
                            """,
                            json.dumps(draft_data),
                            fac_id,
                        )

                        print(" 💾 Golfpakke-utkast lagret i databasen!")
                        saved_count += 1
                        await _report_result(
                            index,
                            fac_id,
                            name,
                            "success",
                            f"Utkast lagret med {found_packages} golfpakker.",
                        )
                    except Exception as exc:
                        # One failing facility must not stop the whole run.
                        failed_count += 1
                        print(f" ❌ Uventet feil for {name}: {exc}")
                        # Some exceptions have an empty message; fall back to
                        # the type name instead of indexing an empty list.
                        error_lines = str(exc).splitlines()
                        summary = error_lines[0] if error_lines else type(exc).__name__
                        await _report_result(
                            index,
                            fac_id,
                            name,
                            "error",
                            f"Feilet under behandling: {summary}",
                        )
            finally:
                # Close the browser even when an error escapes the loop.
                await browser.close()

    finally:
        await conn.close()
        print("\n🏁 Golfpakkeskraping fullført.")

    return {
        "processed_facilities": len(facilities),
        "analyzed_facilities": analyzed_count,
        "saved_drafts": saved_count,
        "skipped_facilities": skipped_count,
        "failed_facilities": failed_count,
    }
|
|
|
|
|
|
if __name__ == "__main__":
|
|
parser = argparse.ArgumentParser(description="Skrap golfpakker via AI.")
|
|
parser.add_argument("--ids", type=str, help="Kommaseparert liste med anleggs-IDer", default=None)
|
|
args = parser.parse_args()
|
|
|
|
facility_ids_list = None
|
|
if args.ids:
|
|
facility_ids_list = [int(id_str.strip()) for id_str in args.ids.split(",") if id_str.strip()]
|
|
|
|
asyncio.run(run_golfpakker_scraper(facility_ids_list))
|