Files
crawlernation/crawler.py

139 lines
5.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import os
import re
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
# CONFIGURATION
# Both settings may be overridden through an environment variable of the same
# name; the literals below are only the fallbacks used when it is not set.
NATIONER_URL = os.environ.get(
    "NATIONER_URL",
    "https://www.nationsguiden.se/",  # Example URL, replace with actual
)
# SECURITY(review): this fallback embeds a webhook token that is committed to
# the repository. Anyone who can read the file can post to the channel, so the
# token should be rotated and supplied via the environment instead.
DISCORD_WEBHOOK_URL = os.environ.get(
    "DISCORD_WEBHOOK_URL",
    "https://discord.com/api/webhooks/1437888900747104317/TI2RfDGC5dzoi5JGz6UO2aD23teYNwa6pLQOskhaDnsSVe3cr8_rly0L3K0VyIYARgeR",
)
def _element_text(block, selector):
    """Return the stripped text of the first match for *selector*, or ""."""
    try:
        return block.find_element("css selector", selector).text.strip()
    except WebDriverException:
        # Missing (or stale) element: treat the field as empty.
        return ""


def _element_attribute(block, selector, attribute):
    """Return *attribute* of the first match for *selector*, or "" on failure."""
    try:
        return block.find_element("css selector", selector).get_attribute(attribute)
    except WebDriverException:
        return ""


def fetch_nationer_open_times(
    url,
    *,
    wait_seconds=5,
    debug_path="/mnt/serverdata/html/crawlernation/events_debug.json",
):
    """Scrape event data from *url* using a headless Chrome session.

    The page is loaded in headless Chrome, given ``wait_seconds`` to let its
    JavaScript render, and every ``div.flex.flex-col.justify-evenly`` element
    is inspected. Blocks without a title (``h4 a``) are skipped.

    Args:
        url: Page to load.
        wait_seconds: Fixed delay after loading, before the DOM is read.
        debug_path: File that receives a JSON dump of the extracted events.
            Failing to write it is reported but does not abort the crawl.

    Returns:
        A list of dicts with the keys ``event``, ``nation``, ``open_time``
        and ``permalink``.
    """
    print(f"DEBUG: Current working directory is {os.getcwd()}")

    # Use headless browser to get rendered DOM and extract event data
    chrome_options = Options()
    for flag in ("--headless", "--disable-gpu", "--no-sandbox"):
        chrome_options.add_argument(flag)

    nationer_data = []
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        time.sleep(wait_seconds)  # Wait for JS to load events

        # Find all event blocks (adjust selector as needed)
        event_blocks = driver.find_elements(
            "css selector", "div.flex.flex-col.justify-evenly"
        )
        print(f"DEBUG: Found {len(event_blocks)} event blocks in DOM.")

        for block in event_blocks:
            event_title = _element_text(block, "h4 a")
            if not event_title:
                # Untitled blocks are never recorded, so skip the other lookups.
                continue
            organiser = _element_text(block, "a.text-primary, p.text-primary")
            open_time = _element_text(block, "time")
            permalink = _element_attribute(block, "h4 a", "href")
            print(
                f"DEBUG: Event: {event_title}, Organiser: {organiser}, Time: {open_time}"
            )
            nationer_data.append(
                {
                    "event": event_title,
                    "nation": organiser,
                    "open_time": open_time,
                    "permalink": permalink,
                }
            )
    finally:
        # Always shut the browser down, even if loading or scraping failed,
        # so no Chrome process is left behind.
        driver.quit()

    # Write debug file for extracted events
    debug_path = os.path.abspath(debug_path)
    try:
        with open(debug_path, "w", encoding="utf-8") as f:
            json.dump(nationer_data, f, ensure_ascii=False, indent=2)
    except (OSError, ValueError) as e:
        print(f"Error writing debug file: {e}")
    else:
        print(f"Wrote debug event data to {debug_path}")
    return nationer_data
# Title substrings (lower-case) that mark an event as a pub event.
_PUB_KEYWORDS = ("pub", "wermlandskällarn", "orvars krog")
# First HH:MM occurrence in the open-time text.
_CLOCK_TIME_RE = re.compile(r"(\d{2}):(\d{2})")
# Events whose first listed hour is at or after this count as pub events.
_PUB_START_HOUR = 18
# Discord message limit is 2000 characters; leave room for the suffix.
_MAX_CONTENT_CHARS = 1900
# Seconds to wait for Discord before giving up instead of hanging forever.
_WEBHOOK_TIMEOUT = 10


def _is_pub_event(event):
    """Return True if *event* matches a pub keyword or starts at 18:00 or later."""
    title = (event.get("event") or "").lower()
    if any(keyword in title for keyword in _PUB_KEYWORDS):
        return True
    found = _CLOCK_TIME_RE.search(event.get("open_time") or "")
    return found is not None and int(found.group(1)) >= _PUB_START_HOUR


def _format_batch(events, title):
    """Render *events* as one plain-text message, truncated to fit Discord."""
    parts = [f"**{title}**\n"]
    parts.extend(
        f"**{item['nation']}**\nEvent: {item['event']}\nTid: {item['open_time']}\n---\n"
        for item in events
    )
    content = "".join(parts)
    if len(content) > _MAX_CONTENT_CHARS:
        content = content[:_MAX_CONTENT_CHARS] + "... (truncated)"
    return content


def _post_to_discord(webhook_url, payload):
    """POST *payload* to the webhook; return the status code, or None on failure."""
    try:
        response = requests.post(webhook_url, json=payload, timeout=_WEBHOOK_TIMEOUT)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Covers HTTP error statuses as well as connection errors and timeouts.
        print(f"Discord webhook error: {e}\nPayload: {payload}")
        return None
    return response.status_code


def send_to_discord_webhook(data, webhook_url):
    """Post the pub events found in *data* to a Discord webhook.

    When *data* is empty a short "nothing found" notice is posted instead.
    Events that are not pub events are never sent; if *data* contains no pub
    events, nothing is posted at all.

    Args:
        data: List of event dicts with the keys ``event``, ``nation`` and
            ``open_time``.
        webhook_url: Discord webhook URL to post to.

    Returns:
        The HTTP status code of the successful post, or ``None`` when nothing
        was sent or the request failed (the failure is printed).

    Raises:
        ValueError: If *webhook_url* is empty.
    """
    if not webhook_url:
        raise ValueError("DISCORD_WEBHOOK_URL not set")

    if not data:
        return _post_to_discord(webhook_url, {"content": "Inga öppettider hittades."})

    # Do not send non-pub events to Discord
    pub_events = [event for event in data if _is_pub_event(event)]
    if not pub_events:
        return None

    status = _post_to_discord(
        webhook_url, {"content": _format_batch(pub_events, "Pub-aktiviteter")}
    )
    if status is not None:
        print("Sent pub batch to Discord.")
    return status
def main():
    """Crawl NATIONER_URL once and forward the scraped events to Discord."""
    scraped_events = fetch_nationer_open_times(NATIONER_URL)
    send_to_discord_webhook(scraped_events, DISCORD_WEBHOOK_URL)
if __name__ == "__main__":
    import datetime
    import time
    import traceback

    def _run_once():
        """Run one crawl, logging any failure instead of ending the schedule."""
        try:
            main()
        except Exception:
            # Top-level boundary: a single failed crawl (browser crash, network
            # error, ...) must not stop future runs. KeyboardInterrupt and
            # SystemExit are not caught, so the script can still be stopped.
            traceback.print_exc()

    # Run once immediately
    _run_once()
    print("First run complete. Scheduling next runs at 00:05 daily.")
    while True:
        now = datetime.datetime.now()
        # Calculate next 00:05
        next_run = now.replace(hour=0, minute=5, second=0, microsecond=0)
        if now >= next_run:
            # Today's 00:05 has already passed; aim for tomorrow's.
            next_run += datetime.timedelta(days=1)
        sleep_seconds = (next_run - now).total_seconds()
        print(f"Sleeping until next run at {next_run.strftime('%Y-%m-%d %H:%M:%S')} ({int(sleep_seconds)} seconds)")
        time.sleep(sleep_seconds)
        _run_once()