jroshell/scripts/ags-rss-fetch.py

#!/usr/bin/env python3
"""
Fetch RSS/Atom feeds and write a merged cache file for AGS widgets.

Dependencies: python-feedparser (Arch: pacman -S python-feedparser)

Output: $XDG_CACHE_HOME/ags/rss-feeds.json, defaulting to ~/.cache/ags/rss-feeds.json
        (flat JSON array, sorted by date descending, at most 30 items)

Supported sources:
  - Any RSS / Atom feed (feedparser handles both transparently)
  - YouTube channels (Atom) via ?channel_id=UC...
  - Reddit (.rss suffix on any subreddit or user URL)
  - Telegram / other services via RSSHub
"""

import calendar
import json
import os
import re
import socket
import sys
import time
from html import unescape
from pathlib import Path

import feedparser

# ── Configuration ────────────────────────────────────────────────────────────

# Per the XDG Base Directory spec, an unset, empty, or relative
# $XDG_CACHE_HOME must be ignored in favour of the default ~/.cache.
# A plain os.environ.get(..., default) would accept "" and make the cache
# path relative to the current working directory.
_xdg_cache_home = os.environ.get("XDG_CACHE_HOME", "")
if not os.path.isabs(_xdg_cache_home):
    _xdg_cache_home = os.path.expanduser("~/.cache")

# Directory and file the merged feed cache is written to.
CACHE_DIR = os.path.join(_xdg_cache_home, "ags")
CACHE_FILE = os.path.join(CACHE_DIR, "rss-feeds.json")
# Maximum number of items kept in the cache file.
ITEM_LIMIT = 30
# Value handed to socket.setdefaulttimeout() below (seconds).
REQUEST_TIMEOUT = 15

# Apply a socket-level timeout for feedparser HTTP requests. This is a
# process-wide default and affects every socket created after this point.
socket.setdefaulttimeout(REQUEST_TIMEOUT)

# Feeds to fetch. Each entry is a dict: {"url": "...", "tag": "..."}
#   url — address of the RSS/Atom feed
#   tag — copied onto every item from that feed; used by the GJS widget to
#         style / filter items per source type
# The commented-out entries are templates: uncomment one and fill in the
# placeholder to enable that source.
FEEDS = [
    # Arch Linux news
    {"url": "https://archlinux.org/feeds/news/", "tag": "arch"},
    # YouTube — replace CHANNEL_ID with the actual channel ID
    # {"url": "https://www.youtube.com/feeds/videos.xml?channel_id=CHANNEL_ID", "tag": "youtube"},
    # Reddit — append .rss to any subreddit or user URL
    # {"url": "https://www.reddit.com/r/archlinux/.rss", "tag": "reddit"},
    # RSSHub Telegram channel bridge
    # {"url": "https://your-rsshub/telegram/channel/ChannelName", "tag": "telegram"},
]


# ── Helpers ──────────────────────────────────────────────────────────────────

_TAG_RE = re.compile(r"<[^>]+>")


def strip_html(raw: str | None) -> str:
    """Strip HTML tags and decode named/numeric entities."""
    if not raw:
        return ""
    text = _TAG_RE.sub("", raw)
    text = unescape(text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


def parse_date(entry) -> str:
    """Normalize a feedparser entry date to an ISO-8601 timestamp in UTC.

    Args:
        entry: A mapping with an optional ``published_parsed`` or
            ``updated_parsed`` time tuple; ``published_parsed`` wins when
            both are present.

    Returns:
        ``YYYY-MM-DDTHH:MM:SS`` (UTC, no zone designator), or ``""`` when the
        entry has no usable date.
    """
    parsed = entry.get("published_parsed") or entry.get("updated_parsed")
    if not parsed:
        return ""
    try:
        # feedparser's *_parsed tuples are already expressed in UTC.
        # time.mktime() would interpret them as local time and shift the
        # result by the machine's UTC offset; calendar.timegm() does not.
        ts = calendar.timegm(parsed)
        return time.strftime("%Y-%m-%dT%H:%M:%S", time.gmtime(ts))
    except (OverflowError, ValueError, OSError):
        # Out-of-range or malformed date: treat the entry as undated.
        return ""


# ── Main ─────────────────────────────────────────────────────────────────────


def _fetch_feed(url: str, tag: str) -> list[dict] | None:
    """Fetch one feed and convert its entries to cache items.

    Returns the list of items (possibly empty), or ``None`` when the feed
    could not be fetched; a warning is printed to stderr in that case.
    """
    try:
        parsed = feedparser.parse(url, agent="ags-rss-fetch/1.0")
    except Exception as exc:  # boundary: one bad feed must not abort the run
        print(f"Warning: failed to fetch {url}: {exc}", file=sys.stderr)
        return None

    # feedparser sets the status attribute when it received an HTTP response
    status = getattr(parsed, "status", None)
    if status is not None and status >= 400:
        print(f"Warning: HTTP {status} for {url}", file=sys.stderr)
        return None

    # feedparser reports many failures (DNS errors, timeouts, unparseable
    # documents) through the bozo flag instead of raising. A bozo feed that
    # still yielded entries is kept; one with no entries counts as failed.
    if parsed.get("bozo") and not parsed.entries:
        reason = parsed.get("bozo_exception", "unknown error")
        print(f"Warning: failed to fetch {url}: {reason}", file=sys.stderr)
        return None

    feed_title = parsed.feed.get("title") or url

    return [
        {
            # "or" also covers a title that is present but empty
            "title": strip_html(entry.get("title")) or "(untitled)",
            "link": entry.get("link", ""),
            "date": parse_date(entry),
            "summary": strip_html(entry.get("summary") or entry.get("description", "")),
            "author": strip_html(entry.get("author", "")),
            "feed_title": feed_title,
            "tag": tag,
        }
        for entry in parsed.entries
    ]


def _write_cache(cache_path: Path, items: list[dict]) -> None:
    """Serialize *items* to *cache_path* as UTF-8 JSON, replacing it atomically."""
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(items, indent=2, ensure_ascii=False) + "\n"
    # Write to a sibling file and rename it into place so a reader never
    # sees a half-written cache. The explicit encoding is needed because
    # ensure_ascii=False emits raw non-ASCII characters.
    tmp_path = cache_path.with_name(cache_path.name + ".tmp")
    tmp_path.write_text(payload, encoding="utf-8")
    tmp_path.replace(cache_path)


def main() -> None:
    """Fetch every feed in FEEDS and write the merged, truncated cache file.

    Feeds that fail are skipped with a warning on stderr. If every configured
    feed fails and a cache file already exists, the existing cache is left
    untouched so a temporary outage does not blank the widget.
    """
    all_items: list[dict] = []
    failed = 0

    for feed_conf in FEEDS:
        items = _fetch_feed(feed_conf["url"], feed_conf["tag"])
        if items is None:
            failed += 1
            continue
        all_items.extend(items)

    cache_path = Path(CACHE_FILE)
    if FEEDS and failed == len(FEEDS) and cache_path.exists():
        print(
            f"Warning: all {failed} feeds failed; keeping existing {CACHE_FILE}",
            file=sys.stderr,
        )
        return

    # Sort by date descending; items without dates ("" sorts lowest) land at
    # the bottom. ISO-8601 strings in one zone sort chronologically.
    all_items.sort(key=lambda item: item["date"] or "", reverse=True)

    # Truncate
    all_items = all_items[:ITEM_LIMIT]

    _write_cache(cache_path, all_items)

    print(f"Wrote {len(all_items)} items to {CACHE_FILE}")


if __name__ == "__main__":
    main()