# Nye-TeeOff/backend/scrape_nsg_3.py
#
# 97 lines
# 4 KiB
# Python

import asyncio
import asyncpg
import httpx
from bs4 import BeautifulSoup
import re
import json
from env_config import get_database_url
# Value returned by env_config.get_database_url(), resolved once at import
# time; scrape_nsg() passes it to asyncpg.connect().
DB_URL = get_database_url()
# Folds the Norwegian letters to ASCII so that a name such as "Oppegård" and a
# URL slug such as "oppegard" normalise to the same key instead of the letter
# being dropped entirely by the a-z filter below.
# NOTE(review): assumes the site's slugs use the ae/o/a transliteration
# (the WordPress default) -- confirm against real slugs.
_NORWEGIAN_TO_ASCII = str.maketrans({"æ": "ae", "ø": "o", "å": "a"})


def clean_name(text):
    """Normalise a club name or URL slug into a lowercase a-z matching key.

    The text is lowercased, Norwegian letters are folded to ASCII, the generic
    tokens "golfklubb", "gk", "par3golf" and " & " are removed, and every
    remaining character outside a-z is stripped.

    Args:
        text: Club name or slug text; may be None or empty.

    Returns:
        The normalised key, or "" when *text* is falsy.
    """
    if not text:
        return ""
    lowered = text.lower().translate(_NORWEGIAN_TO_ASCII)
    # Removal order matters: "golfklubb" must go before the shorter "gk".
    for token in ("golfklubb", "gk", "par3golf", " & "):
        lowered = lowered.replace(token, "")
    return re.sub(r"[^a-z]", "", lowered.strip())
def clean_nsg_content(text):
    """Cut scraped text before the site menu starts and collapse whitespace.

    The text is truncated at the first occurrence of each known menu phrase
    (applied in order, case-sensitively), carriage returns are removed,
    newlines become spaces, and runs of whitespace collapse to one space.

    Args:
        text: Raw text extracted from a page section; may be None or empty.

    Returns:
        The cleaned single-line text, or "" when *text* is falsy or nothing
        remains before the first menu phrase.
    """
    if not text:
        return ""
    # Phrases that look like the NSG bottom menu; everything from the first
    # hit onward is discarded.
    garbage_phrases = (
        "Klubbens hjemmeside", "Resultatlister i Golfbox", "Livescoring",
        "Scoreinntasting", "Lagserie", "Turneringer", "Innmelding",
    )
    for phrase in garbage_phrases:
        text = text.partition(phrase)[0]
    # Drop '\r' outright (so "a\rb" joins) before normalising the rest.
    text = text.replace('\r', '').replace('\n', ' ')
    return re.sub(r'\s+', ' ', text).strip()
async def get_nsg_links(client):
    """Collect the URLs of the NSG "lojalitetskort" pages.

    Two sources are tried in order: the sitemap XML, then the "fordelskortet"
    HTML page. If the sitemap yields any links they are returned immediately;
    otherwise links are gathered from matching anchors on the HTML page.
    A source that cannot be fetched or does not answer 200 is skipped.

    Args:
        client: Object whose awaitable ``get(url)`` returns a response with
            ``status_code`` and ``text`` (an ``httpx.AsyncClient`` in this file).

    Returns:
        A de-duplicated list of absolute URLs, in no particular order.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    sources = [
        "https://seniorgolf.no/lojalitetskort-sitemap.xml",
        "https://seniorgolf.no/fordelskortet/",
    ]
    links = set()
    for url in sources:
        try:
            resp = await client.get(url)
        except httpx.HTTPError as exc:
            # Best effort: report the failure and fall through to the next source.
            print(f"⚠️ Kunne ikke hente {url}: {exc}")
            continue
        if resp.status_code != 200:
            continue
        if ".xml" in url:
            # [^<] keeps a match inside a single <loc> element.
            found = re.findall(
                r'<loc>(https://seniorgolf\.no/lojalitetskort/[^<]*?/)</loc>',
                resp.text,
            )
            if found:
                return list(set(found))
        else:
            soup = BeautifulSoup(resp.text, 'html.parser')
            # Resolve relative hrefs against the page URL so every returned
            # link can be fetched directly.
            links.update(
                urljoin(url, anchor['href'])
                for anchor in soup.select('a[href*="/lojalitetskort/"]')
            )
    return list(links)
# Each pattern captures the text after its heading up to the next known
# heading (or end of text). Compiled once instead of once per facility page.
_STARTTIDER_RE = re.compile(
    r"Starttider:?\s*(.*?)(?=Greenfee|Booking|Adresse|Kontakt|$)", re.S | re.I)
_GREENFEE_RE = re.compile(
    r"Greenfee:?\s*(.*?)(?=Booking|Adresse|Kontakt|$)", re.S | re.I)
_BOOKING_RE = re.compile(
    r"Booking:?\s*(.*?)(?=Adresse|Kontakt|$)", re.S | re.I)
# Stored when a section is missing or empty after cleaning.
_SECTION_FALLBACK = "Se nettside"


def _nsg_section(pattern, text):
    """Return the cleaned section captured by *pattern*, or the fallback text."""
    found = pattern.search(text)
    if not found:
        return _SECTION_FALLBACK
    return clean_nsg_content(found.group(1)) or _SECTION_FALLBACK


def _find_nsg_url(name_key, link_map):
    """Return the link for *name_key*: exact key first, then substring match."""
    # An empty key is a substring of every slug and would match the first
    # link regardless of the facility, so it is never matched.
    if not name_key:
        return None
    exact = link_map.get(name_key)
    if exact:
        return exact
    for slug, url in link_map.items():
        if name_key in slug or slug in name_key:
            return url
    return None


async def scrape_nsg():
    """Match facilities to NSG pages and store the scraped details as JSON.

    Reads ``id`` and ``name`` of every row in ``facilities``, fetches the NSG
    link list, and pairs each facility with a link by normalised name. For
    every pair the page is downloaded, the "Starttider", "Greenfee" and
    "Booking" sections are extracted, and the result is written to the
    facility's ``nsg_data`` column as a JSON string. A facility whose page
    fetch or database update fails is reported and skipped.

    The database connection is closed even when the run aborts with an error.
    """
    print("🚀 Starter NSG VASKEMASKIN v3.8...")
    matches_found = 0
    conn = await asyncpg.connect(DB_URL)
    try:
        facilities = await conn.fetch("SELECT id, name FROM facilities")
        async with httpx.AsyncClient(timeout=20.0, headers={'User-Agent': 'Mozilla/5.0'}) as client:
            link_map = {}
            for link in await get_nsg_links(client):
                # Last path segment is the slug, with or without a trailing slash.
                slug_text = link.rstrip('/').rsplit('/', 1)[-1].replace('-', ' ')
                key = clean_name(slug_text)
                if key:
                    link_map[key] = link

            for fac in facilities:
                match_url = _find_nsg_url(clean_name(fac['name']), link_map)
                if not match_url:
                    continue
                try:
                    f_resp = await client.get(match_url)
                    # Do not parse and store an error or redirect page.
                    f_resp.raise_for_status()
                    f_soup = BeautifulSoup(f_resp.text, 'html.parser')
                    # Prefer the main content container over the whole page
                    # so that menus are left out.
                    main_content = f_soup.find('div', {'class': 'entry-content'}) or f_soup
                    text = main_content.get_text()
                    nsg_data = {
                        "url": match_url,
                        "starttider": _nsg_section(_STARTTIDER_RE, text),
                        "greenfee": _nsg_section(_GREENFEE_RE, text),
                        "booking": _nsg_section(_BOOKING_RE, text),
                    }
                    await conn.execute(
                        "UPDATE facilities SET nsg_data = $1 WHERE id = $2",
                        json.dumps(nsg_data), fac['id'],
                    )
                except (httpx.HTTPError, asyncpg.PostgresError) as exc:
                    # Best effort per facility: report the failure and move on.
                    print(f"⚠️ Hoppet over {fac['name']}: {exc}")
                    continue
                print(f"✅ Vasket & Lagret: {fac['name']}")
                matches_found += 1
    finally:
        await conn.close()
    print(f"\n🎉 Vask ferdig! {matches_found} baner er nå 100% klare.")
if __name__ == "__main__":
    # Run the scraper only when executed as a script, not on import.
    asyncio.run(scrape_nsg())