2026-03-12 13:39:10 +01:00
"""
TEE OFF - GREENFEE - SKRAPER MED GEMINI AI
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Henter alle greenfee - varianter fra en ( eller flere ) URL - er og strukturerer
dem i en JSON - liste . Finner også avtaleklubber / vennskapsklubber .
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
"""
import asyncio
import asyncpg
import os
import json
import argparse
from bs4 import BeautifulSoup
from playwright . async_api import async_playwright
import google . generativeai as genai
from dotenv import load_dotenv
2026-04-16 09:58:08 +02:00
from env_config import get_database_url
2026-05-04 15:30:29 +02:00
from scrape_utils import (
ProgressCallback ,
emit_progress ,
exclude_discontinued_facilities_clause ,
make_progress_event ,
parse_llm_json ,
)
2026-03-12 13:39:10 +01:00
# Pull settings from a local .env file into the process environment before
# anything below reads them.
load_dotenv()

# Database connection string, resolved by env_config.get_database_url().
DB_URL = get_database_url()

# The Gemini client cannot be configured without a key, so importing this
# module fails immediately when the variable is absent or empty.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if GEMINI_API_KEY is None or GEMINI_API_KEY == "":
    raise ValueError("🚨 GEMINI_API_KEY mangler i .env filen!")

genai.configure(api_key=GEMINI_API_KEY)

# Module-level model handle used for the greenfee analysis requests.
model = genai.GenerativeModel("gemini-2.5-flash")
async def fetch_page_text(url: str, browser) -> str:
    """Load *url* in a new browser page and return its visible text.

    Args:
        url: Address to fetch. Surrounding whitespace is ignored; anything
            that does not start with ``http`` is rejected without a request.
        browser: Object exposing an awaitable ``new_page()`` (a Playwright
            browser when called from this file).

    Returns:
        The page text with ``script``, ``style``, ``nav``, ``footer`` and
        ``header`` elements removed and fragments joined by single spaces,
        or ``""`` when the URL is rejected or anything fails while loading.
    """
    url = url.strip()
    if not url.startswith("http"):
        return ""

    print(f"🌐 Laster inn: {url}")
    try:
        page = await browser.new_page()
        try:
            await page.goto(url, wait_until="domcontentloaded", timeout=15000)
            html_content = await page.content()
        finally:
            # Close the page even when navigation times out or fails;
            # otherwise every failed URL leaves a page open in the browser.
            await page.close()

        soup = BeautifulSoup(html_content, 'html.parser')
        # Drop markup that carries no price information before extracting text.
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.extract()
        return soup.get_text(separator=' ', strip=True)
    except Exception as e:
        # Best effort: one unreachable page must not abort the whole run.
        print(f"❌ Feil ved lasting av {url}: {e}")
        return ""
2026-04-12 10:11:23 +02:00
def analyze_greenfee_with_gemini(text: str, club_name: str) -> dict | None:
    """Ask Gemini to extract greenfee prices and partner clubs from page text.

    Args:
        text: Text scraped from the club's greenfee page(s); it is embedded
            verbatim in the prompt.
        club_name: Club name, quoted in the prompt and offered to the model
            as the fallback course name.

    Returns:
        The parsed model response when it is a dict, otherwise ``None``.
        ``None`` is also returned when the model call or the JSON parsing
        raises; the error is printed rather than propagated.
    """
    print(f"🧠 Sender {len(text)} tegn til Gemini for greenfee-analyse...")

    # The prompt is written in Norwegian and shows the model the exact JSON
    # shape to return; doubled braces are literal braces in the f-string.
    instruction = f"""
    Du er en ekspert på norske golfklubber og prissetting.
    Din oppgave er å lese teksten hentet fra nettsidene til "{club_name}" og hente ut TO ting:
    1. ALLE varianter av greenfee-priser.
    2. Navn på eventuelle vennskapsklubber/avtaleklubber (hvis nevnt).
    REGLER FOR GREENFEE:
    - Trekk ut absolutt alle priskategorier du finner (f.eks. "Hverdag høysesong", "Helg før kl 14", "Gjest av medlem", "9 hull kveld", osv.).
    - Finn både voksenpris og juniorpris for hver kategori.
    - HVIS juniorpris er oppgitt som en regel (f.eks. "Juniorer betaler halv pris" eller "50% rabatt for junior"), MÅ du selv regne ut prisen og skrive inn heltallet.
    - "banenavn": Bruk navnet på banen hvis det er spesifisert (f.eks. "18-hullsbanen", "Korthullsbanen"). Hvis ikke spesifisert, bruk "{club_name}".
    - Priser SKAL være tall (integer). Sett pris til null (null) hvis den ikke finnes.
    REGLER FOR AVTALEKLUBBER:
    - Let etter overskrifter som "Vennskapsklubber", "Avtaleklubber", "Gjestespill", "Samarbeidsklubber".
    - Trekk ut kun navnene på klubbene i en liste (f.eks. ["Haga GK", "Oslo GK"]). La listen være tom hvis du ikke finner noen.
    TEKST FRA NETTSIDEN:
    {text}
    OPPGAVE:
    Returner KUN et gyldig JSON-objekt med nøyaktig følgende struktur:
    {{
        "foreslatt_greenfee": [
            {{
                "banenavn": "Navn på banen",
                "priskategori": "F.eks: Hverdag Gjest av Medlem",
                "pris_voksne": 600,
                "pris_junior": 300
            }}
        ],
        "foreslatt_avtaleklubber": [
            "Klubb 1 GK",
            "Klubb 2 GK"
        ],
        "ai_begrunnelse": "Kort forklaring, f.eks: 'Fant et komplekst prissystem for høy/lavsesong. Regnet ut juniorpriser til 50% som angitt i teksten. Fant 3 samarbeidsklubber nederst.'"
    }}
    """

    try:
        reply = model.generate_content(instruction)
        payload = parse_llm_json(reply.text)
    except Exception as err:
        print(f"❌ AI-analyse feilet: {err}")
        return None

    # Anything other than a JSON object (a list, a string, ...) is unusable
    # as a draft.
    if isinstance(payload, dict):
        return payload
    return None
2026-04-12 10:11:23 +02:00
async def run_greenfee_scraper(facility_ids=None, progress_callback: ProgressCallback | None = None):
    """Scrape greenfee pages for facilities and store an AI-generated draft.

    For every selected facility the comma-separated ``greenfee_url`` value is
    split, each page is fetched, the combined text (first 25000 characters)
    is sent to Gemini, and the resulting dict is written to
    ``facilities.greenfee_draft`` as JSON.

    Args:
        facility_ids: Optional iterable of facility ids to restrict the run
            to. Values are coerced with ``int()`` and passed to the database
            as a bound parameter. A falsy value selects every facility that
            has a greenfee URL.
        progress_callback: Optional callback forwarded to ``emit_progress``
            for every progress update.

    Returns:
        A dict with the keys ``processed_facilities``, ``analyzed_facilities``,
        ``saved_drafts``, ``skipped_facilities`` and ``failed_facilities``.

    Raises:
        ValueError: If an entry in ``facility_ids`` is not integer-like.
    """
    print("🚀 Starter Greenfee-skraperen...")
    conn = await asyncpg.connect(DB_URL)

    facilities = []
    analyzed_count = 0
    saved_count = 0
    skipped_count = 0
    failed_count = 0
    total_facilities = 0

    async def report_result(index, fac_id, name, outcome, message):
        # Emit the "facility finished" update; counters are read at call time
        # so they reflect the increments made just before the call.
        await emit_progress(
            progress_callback,
            progress_completed=index,
            progress_ok=saved_count,
            progress_failed=failed_count,
            progress_skipped=skipped_count,
            current_facility_id=fac_id,
            current_facility_name=name,
            event=make_progress_event(
                facility_id=fac_id,
                facility_name=name,
                outcome=outcome,
                message=message,
                processed=index,
                total=total_facilities,
            ),
        )

    try:
        query = (
            "SELECT id, name, greenfee_url FROM facilities "
            "WHERE greenfee_url IS NOT NULL AND greenfee_url != '' "
            f"{exclude_discontinued_facilities_clause('facilities')}"
        )
        query_args = []
        if facility_ids:
            # Bind the ids instead of formatting them into the SQL text, and
            # coerce to int so non-numeric input is rejected up front.
            query += " AND id = ANY($1::bigint[])"
            query_args.append([int(fid) for fid in facility_ids])
        facilities = await conn.fetch(query, *query_args)

        total_facilities = len(facilities)
        print(f"📋 Fant {total_facilities} anlegg å skrape.")
        await emit_progress(
            progress_callback,
            progress_total=total_facilities,
            progress_completed=0,
            progress_ok=0,
            progress_failed=0,
            progress_skipped=0,
            event=make_progress_event(
                facility_id=None,
                facility_name="Greenfee",
                outcome="info",
                message=f"Starter greenfeeskraping for {total_facilities} anlegg.",
                processed=0,
                total=total_facilities,
            ),
        )

        async with async_playwright() as playwright:
            browser = await playwright.chromium.launch(headless=True)
            try:
                for index, facility in enumerate(facilities, start=1):
                    fac_id = facility['id']
                    name = facility['name']
                    urls_raw = facility['greenfee_url']

                    print(f"\n▶️ Behandler Greenfee for: {name} (ID: {fac_id})")
                    await emit_progress(
                        progress_callback,
                        current_facility_id=fac_id,
                        current_facility_name=name,
                        event=make_progress_event(
                            facility_id=fac_id,
                            facility_name=name,
                            outcome="info",
                            message="Starter henting av greenfeesider.",
                            processed=index - 1,
                            total=total_facilities,
                        ),
                    )

                    urls = [u.strip() for u in urls_raw.split(',')]

                    try:
                        sections = []
                        for idx, url in enumerate(urls, 1):
                            page_text = await fetch_page_text(url, browser)
                            if page_text:
                                sections.append(
                                    f"\n\n--- TEKST FRA SIDE {idx} ({url}) ---\n{page_text}"
                                )
                        combined_text = "".join(sections)

                        if len(combined_text) < 50:
                            print("⚠️ Fant for lite tekst, hopper over.")
                            skipped_count += 1
                            await report_result(
                                index, fac_id, name, "warning",
                                "Hoppet over fordi det ble funnet for lite tekst på greenfeesidene.",
                            )
                            continue

                        # The Gemini client call is synchronous; run it in a
                        # worker thread so the event loop is not blocked.
                        draft_data = await asyncio.to_thread(
                            analyze_greenfee_with_gemini, combined_text[:25000], name
                        )
                        if not draft_data:
                            failed_count += 1
                            await report_result(
                                index, fac_id, name, "error",
                                "AI-analysen ga ikke et gyldig greenfeeutkast.",
                            )
                            continue

                        analyzed_count += 1
                        # `or []` covers a model response that sets a key to null.
                        funnet_priser = len(draft_data.get('foreslatt_greenfee') or [])
                        funnet_klubber = len(draft_data.get('foreslatt_avtaleklubber') or [])
                        print(f"✅ AI fant {funnet_priser} greenfee-varianter og {funnet_klubber} avtaleklubber.")

                        await conn.execute(
                            """
                            UPDATE facilities
                            SET greenfee_draft = $1::jsonb
                            WHERE id = $2
                            """,
                            json.dumps(draft_data),
                            fac_id,
                        )
                        print("💾 Greenfee-utkast lagret i databasen!")
                        saved_count += 1
                        await report_result(
                            index, fac_id, name, "success",
                            f"Utkast lagret med {funnet_priser} prisvarianter og {funnet_klubber} avtaleklubber.",
                        )
                    except Exception as e:
                        failed_count += 1
                        print(f"❌ Uventet feil for {name}: {e}")
                        # Exceptions with an empty message have no first line;
                        # fall back to the exception type name.
                        reason_lines = str(e).splitlines()
                        reason = reason_lines[0] if reason_lines else type(e).__name__
                        await report_result(
                            index, fac_id, name, "error",
                            f"Feilet under behandling: {reason}",
                        )
            finally:
                await browser.close()
    finally:
        await conn.close()
        print("\n🏁 Skraping fullført.")

    return {
        "processed_facilities": len(facilities),
        "analyzed_facilities": analyzed_count,
        "saved_drafts": saved_count,
        "skipped_facilities": skipped_count,
        "failed_facilities": failed_count,
    }
2026-03-12 13:39:10 +01:00
def parse_id_list(raw: str) -> list[int]:
    """Convert a comma-separated string such as ``"1,5,12"`` to a list of ints.

    Blank entries (for example from a trailing comma) are skipped.

    Raises:
        ValueError: If a non-blank entry is not an integer.
    """
    tokens = (part.strip() for part in raw.split(","))
    return [int(token) for token in tokens if token]


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Skrap greenfeepriser via AI.")
    parser.add_argument("--ids", type=str, help="Kommaseparert liste med facility IDs (eks: 1,5,12)")
    args = parser.parse_args()

    ids_to_scrape = None
    if args.ids:
        try:
            ids_to_scrape = parse_id_list(args.ids)
        except ValueError:
            parser.error("--ids må være en kommaseparert liste med heltall (eks: 1,5,12)")
        if not ids_to_scrape:
            # An empty list would otherwise be treated as "no filter" and
            # scrape every facility, which is not what was asked for.
            parser.error("--ids inneholder ingen gyldige facility IDs")

    asyncio.run(run_greenfee_scraper(ids_to_scrape))