# crawlernation/crawler.py

import requests
from bs4 import BeautifulSoup
import json
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# CONFIGURATION
# Both settings can be overridden with an environment variable of the same
# name; the literals below are only the fallbacks used when it is unset.
NATIONER_URL = os.environ.get("NATIONER_URL", "https://www.nationsguiden.se/")  # Example URL, replace with actual
# SECURITY: a webhook URL is a credential -- anyone who has it can post to the
# channel. Prefer supplying it via the environment; the fallback is kept only
# so existing deployments keep working and should be rotated and removed.
DISCORD_WEBHOOK_URL = os.environ.get(
    "DISCORD_WEBHOOK_URL",
    "https://discord.com/api/webhooks/1437888900747104317/TI2RfDGC5dzoi5JGz6UO2aD23teYNwa6pLQOskhaDnsSVe3cr8_rly0L3K0VyIYARgeR",
)


def fetch_nationer_open_times(url):
    """Scrape event listings from a JavaScript-rendered page with headless Chrome.

    Loads *url*, waits a fixed five seconds for client-side rendering, then
    treats every ``div.flex.flex-col.justify-evenly`` element as one event.
    Blocks without a title are skipped. The extracted list is also written to
    a hard-coded debug JSON file; a failure to write it is reported but does
    not affect the return value.

    Args:
        url: Address of the page to load.

    Returns:
        A list of dicts with the keys ``event``, ``nation``, ``open_time`` and
        ``permalink``. Fields that could not be read are empty strings.
    """
    import time

    from selenium.common.exceptions import WebDriverException

    def text_of(block, selector):
        """Return the stripped text of the first match, or "" if unavailable."""
        try:
            return block.find_element("css selector", selector).text.strip()
        except WebDriverException:
            return ""

    def href_of(block, selector):
        """Return the href of the first match, or "" if unavailable."""
        try:
            return block.find_element("css selector", selector).get_attribute("href")
        except WebDriverException:
            return ""

    print(f"DEBUG: Current working directory is {os.getcwd()}")
    # Use headless browser to get rendered DOM and extract event data
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--no-sandbox')
    driver = webdriver.Chrome(options=chrome_options)

    nationer_data = []
    # The finally clause guarantees the browser process is shut down even
    # when loading or scraping the page raises.
    try:
        driver.get(url)
        time.sleep(5)  # Wait for JS to load events

        # Find all event blocks (adjust selector as needed)
        event_blocks = driver.find_elements("css selector", "div.flex.flex-col.justify-evenly")
        print(f"DEBUG: Found {len(event_blocks)} event blocks in DOM.")
        for block in event_blocks:
            event_title = text_of(block, "h4 a")
            organiser = text_of(block, "a.text-primary, p.text-primary")
            open_time = text_of(block, "time")
            permalink = href_of(block, "h4 a")
            if not event_title:
                continue
            print(f"DEBUG: Event: {event_title}, Organiser: {organiser}, Time: {open_time}")
            nationer_data.append({
                "event": event_title,
                "nation": organiser,
                "open_time": open_time,
                "permalink": permalink
            })
    finally:
        driver.quit()

    # Write debug file for extracted events (best effort)
    debug_path = os.path.abspath("/mnt/serverdata/html/crawlernation/events_debug.json")
    try:
        with open(debug_path, "w", encoding="utf-8") as f:
            json.dump(nationer_data, f, ensure_ascii=False, indent=2)
        print(f"Wrote debug event data to {debug_path}")
    except (OSError, TypeError, ValueError) as e:
        print(f"Error writing debug file: {e}")
    return nationer_data


def send_to_discord_webhook(data, webhook_url):
    """Post the pub-related events in *data* to a Discord webhook.

    An event counts as a pub event when its title contains one of the pub
    keywords, or when the first ``HH:MM`` found in its ``open_time`` has an
    hour of 18 or later. Other events are not sent. When *data* is empty a
    single "nothing found" message is posted instead.

    Messages are split on line boundaries so that no single post exceeds
    2000 characters; a line that is itself too long is cut into pieces.
    Request failures (HTTP errors, connection errors, timeouts) are printed
    and do not raise.

    Args:
        data: List of event dicts with the keys ``event``, ``nation`` and
            ``open_time``.
        webhook_url: Discord webhook URL to post to.

    Returns:
        The HTTP status code of the last message posted, or ``None`` if that
        post failed or nothing was posted.

    Raises:
        ValueError: If *webhook_url* is empty.
    """
    import datetime
    import re

    if not webhook_url:
        raise ValueError("DISCORD_WEBHOOK_URL not set")

    max_len = 2000  # Maximum characters per posted message.
    timeout_seconds = 10  # Without a timeout a stalled connection blocks forever.
    pub_keywords = ('pub', 'wermlandskälla', 'orvars krog')
    start_time_pattern = re.compile(r'(\d{2}):\d{2}')

    def post_message(content):
        """POST one message; return its status code, or None on failure."""
        payload = {"content": content}
        try:
            response = requests.post(webhook_url, json=payload, timeout=timeout_seconds)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Discord webhook error: {e}\nPayload: {payload}")
            return None
        return response.status_code

    def is_pub_event(e):
        """Return True for a keyword match or a start hour of 18 or later."""
        title = e['event'].lower()
        if any(k in title for k in pub_keywords):
            return True
        # Time match: look for start time after 18:00
        time_match = start_time_pattern.search(e.get('open_time') or '')
        return bool(time_match) and int(time_match.group(1)) >= 18

    def format_batch(events, title):
        """Render the events as one quoted block under a dated heading."""
        today = datetime.datetime.now().strftime('%Y-%m-%d')
        header = f"## {today}\n**{title}**\n"
        return header + "".join(
            f"> **{item.get('nation', '')}**\n"
            f"> **Event:** {item['event']}\n"
            f"> **Tid:** {item.get('open_time', '')}\n"
            f"> ————————————————\n"
            for item in events
        )

    def split_into_batches(content):
        """Split content on line boundaries into chunks of at most max_len."""
        batches = []
        batch = ''
        for line in content.split('\n'):
            # A single line that cannot fit in any message is cut into pieces.
            while len(line) + 1 > max_len:
                if batch.strip():
                    batches.append(batch)
                batch = ''
                batches.append(line[:max_len])
                line = line[max_len:]
            if len(batch) + len(line) + 1 > max_len:
                # Never emit an empty or whitespace-only message.
                if batch.strip():
                    batches.append(batch)
                batch = ''
            batch += line + '\n'
        if batch.strip():
            batches.append(batch)
        return batches

    if not data:
        return post_message("Inga öppettider hittades.")

    # Do not send non-pub events to Discord
    pub_events = [e for e in data if is_pub_event(e)]
    if not pub_events:
        return None

    status = None
    for batch in split_into_batches(format_batch(pub_events, "Pub-aktiviteter")):
        status = post_message(batch)
    return status


def main():
    """Run one crawl cycle: scrape NATIONER_URL and forward the result to Discord."""
    send_to_discord_webhook(
        fetch_nationer_open_times(NATIONER_URL),
        DISCORD_WEBHOOK_URL,
    )


if __name__ == "__main__":
    import datetime
    import time
    import traceback

    # Run once immediately. A failure here is left to propagate so that a
    # broken setup is noticed at start-up rather than hidden.
    main()
    print("First run complete. Scheduling next runs at 00:05 daily.")
    while True:
        now = datetime.datetime.now()
        # Calculate next 00:05
        next_run = now.replace(hour=0, minute=5, second=0, microsecond=0)
        if now >= next_run:
            next_run += datetime.timedelta(days=1)
        sleep_seconds = (next_run - now).total_seconds()
        print(f"Sleeping until next run at {next_run.strftime('%Y-%m-%d %H:%M:%S')} ({int(sleep_seconds)} seconds)")
        # Re-check the wall clock after every sleep: if it wakes before
        # next_run (e.g. after a clock adjustment), running now would make the
        # next iteration target the same 00:05 and run a second time.
        remaining = sleep_seconds
        while remaining > 0:
            time.sleep(remaining)
            remaining = (next_run - datetime.datetime.now()).total_seconds()
        # One failed run must not stop the scheduler; report it and carry on.
        try:
            main()
        except Exception:
            traceback.print_exc()
            print("Scheduled run failed; will retry at the next scheduled time.")