# Nye-TeeOff/backend/scrape_status.py
# (Repository viewer header: 591 lines, 26 KiB, Python)

import asyncio
import os
import asyncpg
import smtplib
import re
import argparse
from datetime import datetime
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from playwright.async_api import async_playwright
# Bind `apply_stealth` to whichever stealth entry point the installed
# playwright_stealth package exposes: `stealth_async` when available,
# otherwise the package's `stealth` export.
# NOTE(review): run_daily_scraping awaits apply_stealth(page) and ignores any
# error it raises — confirm the fallback export is actually awaitable,
# otherwise stealth is silently skipped.
try:
    from playwright_stealth import stealth_async as apply_stealth
except ImportError:
    from playwright_stealth import stealth as apply_stealth
from google import genai
from dotenv import load_dotenv
from env_config import get_database_url
from scrape_utils import ProgressCallback, emit_progress, make_progress_event
# Load the .env file first; the calls below presumably read their settings
# from the environment — confirm against env_config and the genai SDK.
load_dotenv()
DB_URL = get_database_url()
# ==========================================
# CONFIGURING GEMINI AI (NEW SDK)
# ==========================================
# Module-wide Gemini client, used by ask_llm_status.
client = genai.Client()
async def ask_llm_status(text, course_name, is_single_course, ai_instruction=None):
    """Ask Gemini which status keyword describes a course, based on page text.

    Args:
        text: Text scraped from the club's website; only the first 15000
            characters are included in the prompt.
        course_name: Name of the course to look for (used in the prompt when
            the facility has several courses, and in the log output).
        is_single_course: When true, the model is told to report the general
            facility status and ignore specific course names.
        ai_instruction: Optional extra admin instruction that is injected into
            the prompt and declared to override the other rules.

    Returns:
        The first known status keyword found in the model's answer, or
        "ukjent" when none matches or the request fails.
    """
    course_instruction = (
        "Finn den generelle banestatusen for dette golfanlegget. Se bort fra spesifikke banenavn, da anlegget kun har én bane."
        if is_single_course
        else f'Finn banestatusen SPESIFIKT for banen som heter/omtales som: "{course_name}".'
    )
    admin_block = ""
    if ai_instruction:
        admin_block = f"\n!!! VIKTIG EKSTRA-INSTRUKS FRA ADMIN (DENNE OVERSTYRER ALLE ANDRE REGLER) !!!:\n{ai_instruction}\n"
    page_excerpt = text[:15000]
    prompt = f"""
Du er en ekspert på å lese norske golfklubbers nettsider for å finne banestatus.
{course_instruction}
{admin_block}
Svar KUN med nøyaktig ETT av disse ordene:
- aapen (hvis banen er åpen/sommergreener)
- stengt (hvis banen er lukket/stengt/frost/snø)
- aapen_med_vintergreener (hvis det spilles på vintergreener)
- aapner_snart (hvis den åpner om kort tid)
- stenger_snart (hvis den stenger for sesongen om kort tid)
- under_utvikling (hvis den er under utvikling)
- nedlagt (hvis den er nedlagt)
- ukjent (hvis du ikke finner noe info om banen i teksten)
Tekst fra nettsiden:
{page_excerpt}
"""

    # Log what is about to be sent, with a whitespace-normalised text preview.
    separator = "=" * 60
    print("\n" + separator)
    print(f"🤖 SENDER PROMPT TIL GEMINI FOR: '{course_name}'")
    print(f"👉 STANDARD-INSTRUKS: {course_instruction}")
    if ai_instruction:
        print(f"👉 ADMIN-HVISKER: {ai_instruction}")
    preview = " ".join(text.split())[:250]
    print(f"👉 TEKST FRA NETTSIDEN (utdrag): '{preview}...'")
    print(separator + "\n")

    # Ordered so that longer keywords win over the shorter ones they contain
    # ("aapen_med_vintergreener" must be tested before "aapen").
    known_statuses = (
        "aapen_med_vintergreener",
        "aapner_snart",
        "stenger_snart",
        "under_utvikling",
        "nedlagt",
        "stengt",
        "aapen",
        "ukjent",
    )
    try:
        response = await client.aio.models.generate_content(
            model='gemini-2.5-flash',
            contents=prompt,
        )
        answer = response.text.strip().lower()
        print(f" 🧠 GEMINI RÅ-SVAR: '{answer}'")
        # Safety filter: accept only a known keyword found in the raw answer.
        return next(
            (status for status in known_statuses if status in answer),
            "ukjent",
        )
    except Exception as exc:
        # Any failure (API error, empty response) degrades to "unknown".
        print(f"❌ Gemini Feil: {exc}")
        return "ukjent"
# ==========================================
# EKSISTERENDE LOGIKK FOR MANUELL SCRAPING
# ==========================================
def clean_text(text):
    """Return *text* lowercased, keeping only ASCII letters, digits and æ/ø/å.

    Whitespace, punctuation and every other character are removed, which lets
    callers compare keywords regardless of spacing and punctuation.
    """
    only_alphanumerics = re.sub(r'[^a-zA-Z0-9æøåÆØÅ]', '', text)
    return only_alphanumerics.lower()
def interpret_status(text, keyword=None):
    """Map scraped status text to one of the internal status keywords.

    Args:
        text: Raw text scraped from the status element (None is treated as
            empty text).
        keyword: Optional course keyword. When given, it must occur in the
            text (compared via clean_text, i.e. ignoring case, spacing and
            punctuation); only the text following the keyword is interpreted.

    Returns:
        "NOT_FOUND" when a keyword was given but does not occur in the text,
        otherwise one of "stengt", "aapen_med_vintergreener", "aapner_snart",
        "aapen" or "ukjent". Checks run in that order, so a "closed" word wins
        over an "open" word in the same text.
    """
    t_raw = (text or "").lower()
    if keyword:
        if clean_text(keyword) not in clean_text(t_raw):
            return "NOT_FOUND"
        parts = re.split(re.escape(keyword), t_raw, flags=re.IGNORECASE)
        if len(parts) > 1:
            # Interpret only the 150 characters right after the keyword.
            t_raw = parts[1][:150]
        else:
            # The keyword matched only after cleaning (e.g. different spacing),
            # so there is nothing to split on; fall back to the end of the text.
            # NOTE(review): the original indentation was lost; this `else` is
            # read as belonging to the split check above — confirm.
            t_raw = t_raw[-200:]

    closed_words = ("stengt", "lukket", "frost", "snø", "closed", "stenger")
    # "is" (ice) must match as a whole word: as a plain substring it also hits
    # unrelated words such as "priser" or "gratis" and would wrongly report
    # the course as closed. (An English "is" still matches; the sites are
    # expected to be Norwegian.)
    if any(word in t_raw for word in closed_words) or re.search(r"\bis\b", t_raw):
        return "stengt"
    if any(word in t_raw for word in ("vintergreen", "vintergrønn", "vinter")):
        return "aapen_med_vintergreener"
    if any(word in t_raw for word in ("snart", "åpner kl")):
        return "aapner_snart"
    if any(word in t_raw for word in ("åpen", "åpent", "aapen", "open")):
        return "aapen"
    return "ukjent"
def send_report(changes, warnings, successes):
    """E-mail a plain-text summary of a scraping run.

    Nothing is sent when all three lists are empty. SMTP settings are read
    from the environment variables SMTP_SERVER, SMTP_PORT, SMTP_USER,
    SMTP_PASS and EMAIL_TO. Failures are printed, never raised, so a broken
    mail setup cannot abort the caller.

    Args:
        changes: Report lines for courses whose status was updated.
        warnings: Report lines for problems and unresolved statuses.
        successes: Report lines for checks that confirmed the current status.
    """
    if not (changes or warnings or successes):
        return

    subject = f"TeeOff Banestatus Rapport - {datetime.now().strftime('%d.%m.%Y')}"
    sections = ["BANESTATUS RAPPORT\n" + "=" * 30 + "\n\n"]
    if changes:
        sections.append("✅ OPPDATERINGER:\n" + "\n".join(changes) + "\n\n")
    if warnings:
        sections.append("⚠️ MERKNADER / ADVARSLER:\n" + "\n".join(warnings) + "\n\n")
    if successes:
        sections.append("🆗 VELLYKKEDE SJEKKER (INGEN ENDRING):\n" + "\n".join(successes) + "\n")
    body = "".join(sections)

    # Validate the configuration up front so a missing variable is reported
    # by name instead of surfacing as an obscure int(None) TypeError.
    settings = {
        name: os.getenv(name)
        for name in ("SMTP_SERVER", "SMTP_PORT", "SMTP_USER", "SMTP_PASS", "EMAIL_TO")
    }
    missing = [name for name, value in settings.items() if not value]
    if missing:
        print(f"❌ E-post feil: mangler miljøvariabler: {', '.join(missing)}")
        return

    msg = MIMEMultipart()
    msg['From'] = settings["SMTP_USER"]
    msg['To'] = settings["EMAIL_TO"]
    msg['Subject'] = subject
    msg.attach(MIMEText(body, 'plain'))

    try:
        # The timeout keeps an unreachable mail server from hanging the job.
        with smtplib.SMTP_SSL(
            settings["SMTP_SERVER"], int(settings["SMTP_PORT"]), timeout=30
        ) as server:
            server.login(settings["SMTP_USER"], settings["SMTP_PASS"])
            server.send_message(msg)
        print("✅ Rapport sendt på e-post.")
    except Exception as e:
        # Best effort: reporting must never crash the scraping run.
        print(f"❌ E-post feil: {e}")
# ==========================================
# HOVEDMOTOR
# ==========================================
# Texts of links/buttons that typically reveal a hidden course-status panel.
_STATUS_BUTTON_PATTERN = re.compile(
    r"banestatus|dagens status|se status|se dagens status|baneinfo|\bstatus\b",
    re.IGNORECASE,
)


class _ScrapeFailure(Exception):
    """An expected per-facility failure carrying its ready-made report texts."""

    def __init__(self, warning, message):
        super().__init__(message)
        self.warning = warning  # line added to the e-mail report's warnings
        self.message = message  # text sent with the "error" progress event


async def _emit_facility_done(progress_callback, counts, index, total, facility, outcome, message):
    """Emit the progress event that marks one facility as finished."""
    await emit_progress(
        progress_callback,
        progress_completed=index,
        progress_ok=counts["ok"],
        progress_failed=counts["failed"],
        progress_skipped=counts["skipped"],
        current_facility_id=facility['id'],
        current_facility_name=facility['name'],
        event=make_progress_event(
            facility_id=facility['id'],
            facility_name=facility['name'],
            outcome=outcome,
            message=message,
            processed=index,
            total=total,
        ),
    )


async def _read_full_page_text(page, facility_name):
    """Click status-like buttons, then return visible plus hidden body text."""
    print(" 🖱️ Leter etter knapper å klikke på for å avdekke skjult tekst...")
    buttons = await page.get_by_text(_STATUS_BUTTON_PATTERN).all()
    clicked = 0
    for button in buttons:
        try:
            if await button.is_visible():
                await button.click(timeout=2000, force=True)
                clicked += 1
                await page.wait_for_timeout(2000)
        except Exception:
            # Best effort: a button that cannot be clicked is simply skipped.
            pass
    if clicked > 0:
        print(f" 🎯 Tvangsklikket på {clicked} status-knapp(er)! Venter ekstra på at innholdet laster...")
        await page.wait_for_timeout(2000)
    else:
        print(" ⚠️ Fant ingen knapper å klikke på.")

    body = page.locator("body").first
    if await body.count() == 0:
        raise _ScrapeFailure(
            f"{facility_name}: Klarte ikke å lese siden for AI-tolkning",
            "Klarte ikke å lese siden for AI-tolkning.",
        )
    visible_text = await body.inner_text() or ""
    hidden_text = await body.text_content() or ""
    # Combine both so the model also sees menus that are hidden with CSS,
    # then collapse all whitespace runs into single spaces.
    return " ".join((visible_text + " " + hidden_text).split())


async def _read_status_text(page, facility, method):
    """Extract the status text from the loaded page using *method*.

    Raises:
        _ScrapeFailure: when a configured element is missing, the selector is
            malformed, or the scrape method is unknown.
    """
    name = facility['name']
    selector = facility['scrape_status_selector']

    if method == 'css_selector':
        element = page.locator(selector).first
        if await element.count() == 0:
            raise _ScrapeFailure(
                f"{name}: Fant ikke CSS-elementet '{selector}'",
                f"Fant ikke CSS-elementet {selector}.",
            )
        return await element.inner_text()

    if method == 'iframe_golfbox':
        frame = page.frame_locator('iframe[src*="golfbox"]')
        element = frame.locator(selector).first
        if await element.count() == 0:
            raise _ScrapeFailure(
                f"{name}: Fant ikke elementet '{selector}' i iframen",
                f"Fant ikke elementet {selector} i Golfbox-iframe.",
            )
        return await element.inner_text()

    if method == 'click_then_css':
        # The selector holds "<button selector>||<text selector>".
        parts = selector.split('||')
        if len(parts) != 2:
            raise _ScrapeFailure(
                f"{name}: Ugyldig selector for click_then_css (mangler ||)",
                "Ugyldig click_then_css-selector i konfigurasjonen.",
            )
        btn_selector, text_selector = parts
        btn = page.locator(btn_selector).first
        if await btn.count() == 0:
            raise _ScrapeFailure(
                f"{name}: Fant ikke knappen å klikke på: '{btn_selector}'",
                f"Fant ikke knappen {btn_selector} som skulle klikkes.",
            )
        await btn.click(force=True)
        await page.wait_for_timeout(2000)
        element = page.locator(text_selector).first
        if await element.count() == 0:
            raise _ScrapeFailure(
                f"{name}: Fant ikke tekstboksen '{text_selector}' etter klikk",
                f"Fant ikke tekstboksen {text_selector} etter klikk.",
            )
        return await element.inner_text()

    if method == 'llm_parse':
        return await _read_full_page_text(page, name)

    raise _ScrapeFailure(
        f"⚠️ {name}: Ukjent skrapemetode i databasen: '{method}'",
        f"Ukjent skrapemetode: {method}.",
    )


async def _update_course_statuses(conn, facility, method, full_text, changes, warnings, successes):
    """Interpret *full_text* for every course of *facility* and store changes.

    Report lines are appended to *changes*, *warnings* and *successes*.

    Returns:
        A (changed, confirmed, unresolved) tuple of course counts.
    """
    courses = await conn.fetch(
        "SELECT id, name, status, scrape_keyword FROM courses WHERE facility_id = $1",
        facility['id'],
    )
    is_single_course = len(courses) == 1
    changed = confirmed = unresolved = 0
    for course in courses:
        label = f"{facility['name']} ({course['name']})"
        old_status = course['status'] or "ukjent"
        if method == 'llm_parse':
            print(f" 🤖 Spør Gemini om status for '{course['name']}' (Singelbane: {is_single_course})...")
            new_status = await ask_llm_status(
                full_text, course['name'], is_single_course, facility.get('ai_instruction')
            )
            print(" ⏳ Tar 5 sekunders pause for å spare Gemini-kvoten...")
            await asyncio.sleep(5)
        else:
            new_status = interpret_status(full_text, course['scrape_keyword'])
            if new_status == "NOT_FOUND":
                warnings.append(f"{label}: Fant ikke søkeordet '{course['scrape_keyword']}' i teksten.")
                unresolved += 1
                continue

        if new_status == "ukjent":
            # Safety net: an unknown result never overwrites the stored status.
            warnings.append(f"⚠️ {label}: Fant ikke status. Beholder '{old_status.upper()}'.")
            print(f" 🟡 KONKLUSJON: Fant ikke status i teksten (Sikkerhetsnett). Beholder gammel status ({old_status.upper()}).")
            unresolved += 1
        elif new_status != old_status:
            await conn.execute("UPDATE courses SET status = $1 WHERE id = $2", new_status, course['id'])
            # The arrow separates old and new status; without it the two
            # words ran together in the report.
            changes.append(f"🔹 {label}: {old_status.upper()} → {new_status.upper()}")
            print(f" 🟢 KONKLUSJON: Status endret fra {old_status.upper()} til {new_status.upper()}")
            changed += 1
        else:
            successes.append(f"{label}: {new_status.upper()}")
            print(f" ⚪ KONKLUSJON: Ingen endring. Banen er fortsatt {old_status.upper()}")
            confirmed += 1
    return changed, confirmed, unresolved


def _summarize_facility(changed, confirmed, unresolved):
    """Return the (outcome, message) pair for a successfully scraped facility."""
    if changed > 0:
        kept = f", {unresolved} beholdt som før" if unresolved > 0 else ""
        return "success", f"{changed} baner oppdatert, {confirmed} bekreftet{kept}."
    if unresolved > 0:
        return (
            "warning",
            f"Ingen statusendring. {confirmed} baner bekreftet og {unresolved} beholdt som før.",
        )
    return "success", f"Ingen endring. {confirmed} baner bekreftet."


async def run_daily_scraping(facility_ids=None, progress_callback: ProgressCallback | None = None):
    """Scrape course status for facilities and update the database.

    For each facility with a scrape_status_url the page is opened in headless
    Chromium, the status text is extracted according to the facility's
    scrape_method, each course's status is interpreted (rule-based or via
    Gemini for 'llm_parse') and changed statuses are written back. Progress
    events are sent through emit_progress and a summary e-mail is sent at the
    end.

    Args:
        facility_ids: Optional list of facility IDs to restrict the run to.
            None or an empty list means all facilities.
        progress_callback: Optional callback passed on to emit_progress.

    Returns:
        A dict with the counters "processed_facilities", "updated_courses",
        "warnings", "successes", "failed_facilities" and "skipped_facilities".
    """
    print(f"🚀 Starter sjekk {datetime.now().strftime('%H:%M:%S')}...")
    changes, warnings, successes = [], [], []
    counts = {"ok": 0, "failed": 0, "skipped": 0}

    conn = await asyncpg.connect(DB_URL)
    try:
        if facility_ids:
            print(f"📌 Kjører skraping KUN for anlegg-ID(er): {facility_ids}")
            facilities = await conn.fetch(
                "SELECT id, name, scrape_status_url, scrape_status_selector, scrape_method, ai_instruction FROM facilities WHERE scrape_status_url IS NOT NULL AND id = ANY($1::int[])",
                facility_ids,
            )
        else:
            print("🌍 Kjører skraping for ALLE anlegg med scrape_status_url...")
            facilities = await conn.fetch(
                "SELECT id, name, scrape_status_url, scrape_status_selector, scrape_method, ai_instruction FROM facilities WHERE scrape_status_url IS NOT NULL"
            )

        if not facilities:
            print("⚠️ Fant ingen anlegg å skrape.")
            return {
                "processed_facilities": 0,
                "updated_courses": 0,
                "warnings": 0,
                "successes": 0,
                "failed_facilities": 0,
                "skipped_facilities": 0,
            }

        total_facilities = len(facilities)
        await emit_progress(
            progress_callback,
            progress_total=total_facilities,
            progress_completed=0,
            progress_ok=0,
            progress_failed=0,
            progress_skipped=0,
            event=make_progress_event(
                facility_id=None,
                facility_name="Banestatus",
                outcome="info",
                message=f"Starter banestatusskraping for {total_facilities} anlegg.",
                processed=0,
                total=total_facilities,
            ),
        )

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                context = await browser.new_context()
                for index, f in enumerate(facilities, start=1):
                    method = f.get('scrape_method') or 'css_selector'
                    await emit_progress(
                        progress_callback,
                        current_facility_id=f['id'],
                        current_facility_name=f['name'],
                        event=make_progress_event(
                            facility_id=f['id'],
                            facility_name=f['name'],
                            outcome="info",
                            message=f"Starter sjekk med metode {method}.",
                            processed=index - 1,
                            total=total_facilities,
                        ),
                    )

                    if method == 'manual':
                        # Manually managed facilities are never scraped.
                        successes.append(f"⏸️ {f['name']}: Hoppet over (Manuell overstyring)")
                        print(f" ⏸️ Hopper over skraping av {f['name']} (Satt til Manuell)")
                        counts["skipped"] += 1
                        await _emit_facility_done(
                            progress_callback, counts, index, total_facilities, f,
                            "warning",
                            "Hoppet over fordi anlegget er satt til manuell overstyring.",
                        )
                        continue

                    page = await context.new_page()
                    try:
                        await apply_stealth(page)
                    except Exception:
                        # Stealth is best effort; `Exception` (not a bare
                        # except) lets task cancellation propagate.
                        pass

                    try:
                        print(f"🔍 Besøker {f['name']} (Metode: {method})...")
                        await page.goto(f['scrape_status_url'], timeout=60000, wait_until="domcontentloaded")
                        await page.wait_for_timeout(3000)
                        full_text = await _read_status_text(page, f, method)

                        await conn.execute(
                            "UPDATE facilities SET status_updated_at = CURRENT_DATE WHERE id = $1",
                            f['id'],
                        )
                        changed, confirmed, unresolved = await _update_course_statuses(
                            conn, f, method, full_text, changes, warnings, successes
                        )
                        counts["ok"] += 1
                        outcome, message = _summarize_facility(changed, confirmed, unresolved)
                        await _emit_facility_done(
                            progress_callback, counts, index, total_facilities, f,
                            outcome, message,
                        )
                    except _ScrapeFailure as failure:
                        warnings.append(failure.warning)
                        counts["failed"] += 1
                        await _emit_facility_done(
                            progress_callback, counts, index, total_facilities, f,
                            "error", failure.message,
                        )
                    except Exception as e:
                        # Keep only the first line; Playwright errors are long.
                        err_msg = str(e).split('\n')[0]
                        warnings.append(f"🔥 {f['name']}: Feil under skraping: {err_msg}")
                        counts["failed"] += 1
                        await _emit_facility_done(
                            progress_callback, counts, index, total_facilities, f,
                            "error", f"Feil under skraping: {err_msg}",
                        )
                    finally:
                        await page.close()
            finally:
                await browser.close()
    finally:
        # Always release the connection, also when the run aborts early.
        await conn.close()

    # send_report does blocking SMTP I/O; run it in a worker thread so the
    # event loop stays responsive.
    await asyncio.to_thread(send_report, changes, warnings, successes)
    print("🏁 Ferdig.")
    return {
        "processed_facilities": len(facilities),
        "updated_courses": len(changes),
        "warnings": len(warnings),
        "successes": len(successes),
        "failed_facilities": counts["failed"],
        "skipped_facilities": counts["skipped"],
    }
def parse_facility_ids(raw):
    """Parse the ``--ids`` command-line value into a list of facility IDs.

    Args:
        raw: Comma-separated IDs such as "1,4,12", or None/"" when the option
            was not given.

    Returns:
        None when *raw* is empty (meaning: all facilities), otherwise the
        list of parsed integer IDs. Blank entries are ignored.

    Raises:
        ValueError: if an entry is not an integer, or if the value contains
            no IDs at all (e.g. ","), which would otherwise silently select
            every facility.
    """
    if not raw:
        return None
    ids = [int(part) for part in (piece.strip() for piece in raw.split(",")) if part]
    if not ids:
        raise ValueError("no facility IDs given")
    return ids


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="TeeOff Status Scraper")
    parser.add_argument("--ids", type=str, help="Kommaseparert liste med anleggs-IDer", default=None)
    args = parser.parse_args()
    try:
        facility_ids_list = parse_facility_ids(args.ids)
    except ValueError:
        print("❌ Feil format på --ids. Må være kommaseparerte tall, f.eks: 1,4,12")
        # SystemExit instead of exit(): the latter is only injected by `site`.
        raise SystemExit(1) from None
    asyncio.run(run_daily_scraping(facility_ids_list))