100 lines
4.3 KiB
Python
100 lines
4.3 KiB
Python
import asyncio
import json
import os
import re

import asyncpg
import httpx
from bs4 import BeautifulSoup
|
|
|
|
# PostgreSQL connection string used by the scraper. It can be overridden with
# the TEEOFF_DB_URL environment variable so that credentials do not have to
# live in source control; the literal below is kept as the backward-compatible
# default (an unset or empty variable falls back to it).
DB_URL = (
    os.environ.get("TEEOFF_DB_URL")
    or "postgresql://teeoff_admin:teeoff_secret_password@db:5432/teeoff"
)
|
|
|
|
def clean_name(text):
    """Reduce a club name to a bare lowercase ``a``-``z`` key for matching.

    The name is lower-cased, the substrings ``"golfklubb"``, ``"gk"`` and
    ``" & "`` are removed wherever they occur, and every remaining character
    outside ``a``-``z`` is dropped. Note that this also drops non-ASCII
    letters such as ``æ``, ``ø`` and ``å``.

    Args:
        text: The raw name; any falsy value (``None``, ``""``) is accepted.

    Returns:
        The normalised key, or ``""`` when ``text`` is falsy.
    """
    if not text:
        return ""

    # Normalise the name for matching (keeps nothing but the letters a-z).
    lowered = text.lower()
    for noise in ("golfklubb", "gk", " & "):
        lowered = lowered.replace(noise, "")
    lowered = lowered.strip()

    return "".join(ch for ch in lowered if "a" <= ch <= "z")
|
|
|
|
async def get_golfamore_links(client):
    """Fetch all Norwegian club links from Golfamore's course sitemap.

    Args:
        client: An object with an awaitable ``get(url)`` whose result exposes
            ``status_code`` and ``text`` (an ``httpx.AsyncClient`` in this
            file).

    Returns:
        A sorted list of unique club URLs. The list is empty when the request
        raises or when the server answers with anything other than HTTP 200;
        in both cases a message is printed.
    """
    print("🕵️ Henter komplett liste fra Golfamore...")

    try:
        # Golfamore publishes a separate sitemap for each country.
        resp = await client.get("https://www.golfamore.com/sitemaps/courses-no.xml")
    except Exception as e:
        # Best effort: any failure to fetch is reported and treated as "no links".
        print(f"❌ Kunne ikke hente sitemap: {e}")
        return []

    if resp.status_code != 200:
        print(f"❌ Kunne ikke hente sitemap: HTTP {resp.status_code}")
        return []

    # Dots are escaped and the slug part may not contain '<' or whitespace, so
    # a match can never run from one <loc> element into the next.
    links = re.findall(
        r'<loc>(https://www\.golfamore\.com/no/golfklubb/[^<\s]*/)</loc>',
        resp.text,
    )

    # Sort after de-duplicating so callers see the same order on every run
    # (plain set iteration order of strings varies between processes).
    return sorted(set(links))
|
|
|
|
# Names (facility or slug) must be longer than this to take part in
# substring matching; shorter strings match far too many unrelated names.
_MIN_PARTIAL_LEN = 4

# Validity text stored when a club page has no recognisable rules section.
_DEFAULT_VALIDITY = "Gjelder alle dager"


def _build_link_map(ga_links):
    """Map the cleaned slug of every Golfamore club URL to the URL itself."""
    link_map = {}
    # Sorted so that insertion order (and therefore which partial match is
    # found first) is the same on every run.
    for link in sorted(set(ga_links)):
        parts = link.split('/')
        if len(parts) < 2:
            continue
        slug = clean_name(parts[-2].replace('-', ' '))
        # An empty slug is a substring of every name, so it must never be a key.
        if slug:
            link_map[slug] = link
    return link_map


def _find_match(fac_clean, link_map):
    """Return the URL whose slug matches ``fac_clean`` exactly or partially, else None."""
    if not fac_clean:
        return None

    exact = link_map.get(fac_clean)
    if exact:
        return exact

    # Try a partial match when there is no exact one
    # (e.g. "Arendal" inside "Arendal og Omegn").
    if len(fac_clean) <= _MIN_PARTIAL_LEN:
        return None
    for slug, url in link_map.items():
        if fac_clean in slug:
            return url
        # Apply the same length guard to the slug before testing the
        # opposite direction, otherwise a one- or two-letter slug would
        # match almost any facility.
        if len(slug) > _MIN_PARTIAL_LEN and slug in fac_clean:
            return url
    return None


def _extract_validity(html):
    """Pull the text describing when the card is valid out of a club page."""
    soup = BeautifulSoup(html, 'html.parser')

    # Look for the rules/conditions container first, then fall back to any
    # text node mentioning "Golfamore gjelder".
    rules_section = (
        soup.find('div', {'class': 'course-rules'})
        or soup.find('div', {'class': 'course-info__rules'})
        or soup.find(string=re.compile(r'Golfamore gjelder', re.I))
    )
    if not rules_section:
        return _DEFAULT_VALIDITY

    # Collapse all runs of whitespace (including newlines) to single spaces.
    validity = rules_section.get_text(separator=' ')
    return re.sub(r'\s+', ' ', validity).strip()


async def scrape_golfamore():
    """Synchronise the ``facilities`` table with the clubs listed on Golfamore.

    Reads ``id`` and ``name`` of every facility, matches each name against the
    club URLs returned by ``get_golfamore_links`` and then, per facility:

    * match and club page readable: sets ``golfamore = true`` and stores a
      JSON document with ``validity`` and ``source_url`` in ``golfamore_data``;
    * match but the page could not be fetched or parsed: sets only
      ``golfamore = true``;
    * no match: sets ``golfamore = false`` and ``golfamore_data = '{}'``.

    If no club links could be obtained at all, nothing is written, so a
    failed sitemap request cannot reset every facility to ``false``.

    The database connection is always closed, also when an error propagates.
    Database errors are not swallowed.
    """
    print("\n******************************************")
    print("🚀 STARTER GOLFAMORE-SYNKRONISERING v1.0")
    print("******************************************\n")

    conn = await asyncpg.connect(DB_URL)
    matches_found = 0
    try:
        facilities = await conn.fetch("SELECT id, name FROM facilities")

        async with httpx.AsyncClient(timeout=20.0, headers={'User-Agent': 'Mozilla/5.0'}) as client:
            ga_links = await get_golfamore_links(client)

            # Map cleaned names derived from the URL to the URL itself.
            link_map = _build_link_map(ga_links)
            if not link_map:
                print("❌ Ingen Golfamore-lenker funnet – avbryter uten å endre databasen.")
                return

            for fac in facilities:
                fac_id = fac['id']
                fac_name = fac['name']

                match_url = _find_match(clean_name(fac_name), link_map)

                if not match_url:
                    # Not present on Golfamore: set to false.
                    await conn.execute(
                        "UPDATE facilities SET golfamore = false, golfamore_data = '{}' WHERE id = $1",
                        fac_id,
                    )
                    continue

                try:
                    # Visit the club page to find the conditions.
                    f_resp = await client.get(match_url)
                    # An error page must not be parsed as if it were the club page.
                    f_resp.raise_for_status()
                    validity = _extract_validity(f_resp.text)
                except Exception as e:
                    # If the details cannot be read, at least mark the
                    # facility as active.
                    print(f"⚠️ Kunne ikke lese detaljer for {fac_name}: {e}")
                    await conn.execute(
                        "UPDATE facilities SET golfamore = true WHERE id = $1",
                        fac_id,
                    )
                    continue

                ga_data = {
                    "validity": validity,
                    "source_url": match_url,
                }

                # Update the database.
                await conn.execute("""
                    UPDATE facilities
                    SET golfamore = true, golfamore_data = $1
                    WHERE id = $2
                """, json.dumps(ga_data), fac_id)

                print(f"✅ MATCH: {fac_name} ({validity[:50]}...)")
                matches_found += 1
    finally:
        await conn.close()

    print(f"\n🎉 Ferdig! {matches_found} baner er nå bekreftet hos Golfamore.")
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run one full synchronisation pass on a new event loop.
    asyncio.run(scrape_golfamore())
|