# Paste metadata: 96 lines, 4 KiB, plain text.
import asyncio
import json
import os
import re

import asyncpg
import httpx
from bs4 import BeautifulSoup
# PostgreSQL connection string used by scrape_nsg(). It can be overridden with
# the DATABASE_URL environment variable so that real credentials do not have to
# live in the source; the fallback is the previously hard-coded value.
DB_URL = os.environ.get("DATABASE_URL") or (
    "postgresql://teeoff_admin:teeoff_secret_password@db:5432/teeoff"
)
def clean_name(text):
    """Normalize a club name or URL slug into a comparison key.

    The text is lower-cased, the substrings "golfklubb", "gk", "par3golf"
    and " & " are removed one after another, and finally every character
    outside ASCII a-z is dropped. The last step also drops digits, spaces
    and the Norwegian letters ae/oe/aa (æ, ø, å).

    Args:
        text: Name to normalize; may be None or empty.

    Returns:
        The normalized key, or "" when ``text`` is falsy.
    """
    if not text:
        return ""
    key = text.lower()
    # Removals are plain substring replacements, so they also hit matches
    # inside longer words (e.g. a "gk" in the middle of a name).
    for noise in ("golfklubb", "gk", "par3golf", " & "):
        key = key.replace(noise, "")
    return re.sub(r"[^a-z]", "", key.strip())
def clean_nsg_content(text):
    """Cut scraped NSG text before the site menu and collapse whitespace.

    Everything from the first occurrence of any known menu phrase onwards is
    discarded (the match is case-sensitive), carriage returns are removed,
    and every run of whitespace, newlines included, becomes a single space.

    Args:
        text: Raw text extracted from a page; may be None or empty.

    Returns:
        The cleaned single-line text, or "" when ``text`` is falsy.
    """
    if not text:
        return ""

    # Phrases that look like the bottom menu of the NSG pages.
    menu_phrases = (
        "Klubbens hjemmeside", "Resultatlister i Golfbox", "Livescoring",
        "Scoreinntasting", "Lagserie", "Turneringer", "Innmelding",
    )
    for phrase in menu_phrases:
        # Keep only what precedes the phrase (the whole text if it is absent).
        text = text.partition(phrase)[0]

    # Tidy up line breaks and repeated spaces.
    text = text.replace("\r", "").replace("\n", " ")
    return re.sub(r"\s+", " ", text).strip()
async def get_nsg_links(client):
    """Collect the URLs of the NSG loyalty-card ("lojalitetskort") pages.

    The XML sitemap is requested first; if it contains any matching
    ``<loc>`` entries those are returned immediately. Otherwise the HTML
    overview page is parsed for anchors whose ``href`` contains
    "/lojalitetskort/".

    Args:
        client: Object with an awaitable ``get(url)`` whose result exposes
            ``status_code`` and ``text`` (scrape_nsg passes an
            ``httpx.AsyncClient``).

    Returns:
        A de-duplicated list of URLs in arbitrary order; empty when neither
        source produced a link.
    """
    links = []
    urls = ["https://seniorgolf.no/lojalitetskort-sitemap.xml", "https://seniorgolf.no/fordelskortet/"]
    loc_pattern = re.compile(r'<loc>(https://seniorgolf\.no/lojalitetskort/.*?/)</loc>')

    for url in urls:
        try:
            resp = await client.get(url)
            if resp.status_code != 200:
                continue
            if ".xml" in url:
                found = loc_pattern.findall(resp.text)
                if found:
                    return list(set(found))
            else:
                soup = BeautifulSoup(resp.text, 'html.parser')
                links.extend(
                    anchor['href']
                    for anchor in soup.select('a[href*="/lojalitetskort/"]')
                )
        except Exception as exc:
            # Best effort: a failing source is reported and skipped. Only
            # Exception is caught so that task cancellation and Ctrl-C are
            # not swallowed the way a bare ``except:`` would.
            print(f"ADVARSEL: kunne ikke hente {url}: {exc!r}")

    return list(set(links))
# One pattern per stored field. Each lazily captures the text that follows its
# heading up to the next listed heading or the end of the text; matching is
# case-insensitive and spans newlines.
_NSG_FIELD_PATTERNS = {
    "starttider": re.compile(
        r"Starttider:?\s*(.*?)(?=Greenfee|Booking|Adresse|Kontakt|$)", re.S | re.I
    ),
    "greenfee": re.compile(
        r"Greenfee:?\s*(.*?)(?=Booking|Adresse|Kontakt|$)", re.S | re.I
    ),
    "booking": re.compile(
        r"Booking:?\s*(.*?)(?=Adresse|Kontakt|$)", re.S | re.I
    ),
}


def _find_nsg_url(name_key, link_map):
    """Return the URL whose cleaned slug matches ``name_key``, or None.

    An exact key hit wins; otherwise the first entry where one key is a
    substring of the other is used.
    """
    if not name_key:
        # "" is a substring of every slug, so an empty key would otherwise
        # be "matched" to the first link in the map.
        return None
    exact = link_map.get(name_key)
    if exact:
        return exact
    for slug, url in link_map.items():
        # An empty slug would match every facility for the same reason.
        if slug and (name_key in slug or slug in name_key):
            return url
    return None


def _extract_nsg_data(page_text, url):
    """Build the dict stored per facility from the text of an NSG page."""
    nsg_data = {"url": url}
    for field, pattern in _NSG_FIELD_PATTERNS.items():
        found = pattern.search(page_text)
        # A heading that is missing altogether falls back to "Se nettside".
        nsg_data[field] = clean_nsg_content(found.group(1)) if found else "Se nettside"
    return nsg_data


async def scrape_nsg():
    """Scrape NSG loyalty-card pages and store the result per facility.

    Reads ``id`` and ``name`` of every row in ``facilities``, pairs each
    facility with an NSG link by comparing cleaned names against cleaned URL
    slugs, downloads the matched page, extracts the "Starttider", "Greenfee"
    and "Booking" sections and writes them as a JSON string to the
    facility's ``nsg_data`` column. A facility whose page cannot be
    processed is reported and skipped.

    Raises:
        Whatever ``asyncpg.connect`` or the initial ``fetch`` raise; errors
        for an individual facility are caught and printed instead.
    """
    print("🚀 Starter NSG VASKEMASKIN v3.8...")
    conn = await asyncpg.connect(DB_URL)
    matches_found = 0

    try:
        facilities = await conn.fetch("SELECT id, name FROM facilities")

        async with httpx.AsyncClient(timeout=20.0, headers={'User-Agent': 'Mozilla/5.0'}) as client:
            all_nsg_links = await get_nsg_links(client)
            # Key: cleaned form of the second-to-last path segment (the slug
            # when the URL ends with "/"); value: the full link.
            link_map = {
                clean_name(link.split('/')[-2].replace('-', ' ')): link
                for link in all_nsg_links
            }

            for fac in facilities:
                match_url = _find_nsg_url(clean_name(fac['name']), link_map)
                if not match_url:
                    continue

                try:
                    f_resp = await client.get(match_url)
                    f_soup = BeautifulSoup(f_resp.text, 'html.parser')

                    # Prefer the main content container over the whole page
                    # to keep menus out of the extracted text.
                    main_content = f_soup.find('div', {'class': 'entry-content'}) or f_soup
                    nsg_data = _extract_nsg_data(main_content.get_text(), match_url)

                    await conn.execute(
                        "UPDATE facilities SET nsg_data = $1 WHERE id = $2",
                        json.dumps(nsg_data),
                        fac['id'],
                    )
                    print(f"✅ Vasket & Lagret: {fac['name']}")
                    matches_found += 1
                except Exception as exc:
                    # Keep going with the next facility, but say why this one
                    # failed instead of hiding it; cancellation and Ctrl-C
                    # are deliberately not caught.
                    print(f"⚠️ Hoppet over {fac['name']}: {exc!r}")
    finally:
        # Release the connection even when fetching or scraping fails.
        await conn.close()

    print(f"\n🎉 Vask ferdig! {matches_found} baner er nå 100% klare.")
if __name__ == "__main__":
    # Run the scraper only when executed as a script, not when imported.
    asyncio.run(scrape_nsg())