jroshell/scripts/ags-rss-fetch.py
2026-05-29 17:09:52 +02:00

132 lines
4.4 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Fetch RSS/Atom feeds and write a merged cache file for AGS widgets.
Dependencies: python-feedparser (Arch: pacman -S python-feedparser)
Output: $XDG_CACHE_HOME/ags/rss-feeds.json, defaulting to ~/.cache/ags/rss-feeds.json (flat array, sorted by date desc, max 30 items)
Supported sources:
- Any RSS / Atom feed (feedparser handles both transparently)
- YouTube channels (Atom) via ?channel_id=UC...
- Reddit (.rss suffix on any subreddit or user URL)
- Telegram / other services via RSSHub
"""
import json
import os
import re
import socket
import sys
import time
from html import unescape
from pathlib import Path
import feedparser
# ── Configuration ────────────────────────────────────────────────────────────
# Cache location: $XDG_CACHE_HOME/ags, or ~/.cache/ags when the variable is unset.
_DEFAULT_CACHE_ROOT = os.path.expanduser("~/.cache")
_cache_root = os.environ.get("XDG_CACHE_HOME", _DEFAULT_CACHE_ROOT)
CACHE_DIR = os.path.join(_cache_root, "ags")
CACHE_FILE = os.path.join(CACHE_DIR, "rss-feeds.json")

# Maximum number of merged items kept in the cache file.
ITEM_LIMIT = 30

# Network timeout in seconds. It is installed as the process-wide socket
# default so that the HTTP requests made by feedparser pick it up.
REQUEST_TIMEOUT = 15
socket.setdefaulttimeout(REQUEST_TIMEOUT)

# Feed list. Every entry has the shape {"url": "...", "tag": "..."}; the tag
# is used by the GJS widget to style / filter items per source type.
FEEDS = [
    # Arch Linux news
    {
        "url": "https://archlinux.org/feeds/news/",
        "tag": "arch",
    },
    # YouTube — replace CHANNEL_ID with the actual channel ID
    # {"url": "https://www.youtube.com/feeds/videos.xml?channel_id=CHANNEL_ID", "tag": "youtube"},
    # Reddit — append .rss to any subreddit or user URL
    # {"url": "https://www.reddit.com/r/archlinux/.rss", "tag": "reddit"},
    # RSSHub Telegram channel bridge
    # {"url": "https://your-rsshub/telegram/channel/ChannelName", "tag": "telegram"},
]
# ── Helpers ──────────────────────────────────────────────────────────────────
_TAG_RE = re.compile(r"<[^>]+>")
def strip_html(raw: str | None) -> str:
"""Strip HTML tags and decode named/numeric entities."""
if not raw:
return ""
text = _TAG_RE.sub("", raw)
text = unescape(text)
text = re.sub(r"\s+", " ", text).strip()
return text
def parse_date(entry) -> str:
    """Normalize a feedparser entry date to ISO-8601 (UTC).

    Args:
        entry: A feedparser entry (any mapping with ``.get``). The
            ``published_parsed`` value is preferred, with ``updated_parsed``
            as the fallback.

    Returns:
        The timestamp formatted as ``YYYY-MM-DDTHH:MM:SS`` (UTC wall-clock
        time, no zone suffix), or ``""`` when the entry carries no parsed
        date or the value cannot be converted.
    """
    # Function-scope import keeps this block self-contained.
    import calendar

    parsed = entry.get("published_parsed") or entry.get("updated_parsed")
    if not parsed:
        return ""
    try:
        # feedparser normalizes parsed dates to UTC, so the tuple has to be
        # converted with timegm(). time.mktime() would treat it as local
        # time and shift the result by the host's UTC offset.
        ts = calendar.timegm(parsed)
        return time.strftime("%Y-%m-%dT%H:%M:%S", time.gmtime(ts))
    except (OverflowError, ValueError, TypeError, OSError):
        # Malformed or out-of-range date tuple: treat the entry as undated.
        return ""
# ── Main ─────────────────────────────────────────────────────────────────────
def _entry_to_item(entry, feed_title: str, tag: str) -> dict:
    """Convert one feedparser entry into the flat dict stored in the cache."""
    return {
        # Use the placeholder when the title is missing *or* empty.
        "title": strip_html(entry.get("title")) or "(untitled)",
        "link": entry.get("link", ""),
        "date": parse_date(entry),
        "summary": strip_html(entry.get("summary") or entry.get("description", "")),
        "author": strip_html(entry.get("author", "")),
        "feed_title": feed_title,
        "tag": tag,
    }


def _fetch_feed(url: str, tag: str) -> list[dict]:
    """Fetch and parse one feed; return its items, or [] (with a warning) on failure."""
    try:
        parsed = feedparser.parse(url, agent="ags-rss-fetch/1.0")
    except Exception as exc:  # best-effort: one broken feed must not abort the run
        print(f"Warning: failed to fetch {url}: {exc}", file=sys.stderr)
        return []

    # feedparser sets the status attribute on HTTP responses
    status = getattr(parsed, "status", None)
    if status is not None and status >= 400:
        print(f"Warning: HTTP {status} for {url}", file=sys.stderr)
        return []

    # Network and XML failures are normally reported through the bozo flag
    # instead of an exception; surface them rather than failing silently.
    if getattr(parsed, "bozo", False) and not parsed.entries:
        reason = getattr(parsed, "bozo_exception", "unknown error")
        print(f"Warning: failed to parse {url}: {reason}", file=sys.stderr)
        return []

    feed_title = parsed.feed.get("title") or url
    return [_entry_to_item(entry, feed_title, tag) for entry in parsed.entries]


def _write_cache(items: list[dict]) -> None:
    """Serialize *items* as JSON to CACHE_FILE, replacing it atomically."""
    cache_path = Path(CACHE_FILE)
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(items, indent=2, ensure_ascii=False) + "\n"
    # Write to a sibling temp file and rename it over the target so a reader
    # never sees a partially written cache. The encoding is explicit because
    # ensure_ascii=False emits raw non-ASCII text and the locale default may
    # not be able to encode it.
    tmp_path = cache_path.with_name(cache_path.name + ".tmp")
    tmp_path.write_text(payload, encoding="utf-8")
    os.replace(tmp_path, cache_path)


def main() -> None:
    """Fetch every feed in FEEDS, merge the items, and write the cache file.

    Items are sorted by date descending (undated items last) and truncated
    to ITEM_LIMIT. Feeds that fail are skipped with a warning on stderr.
    """
    all_items: list[dict] = []
    for feed_conf in FEEDS:
        all_items.extend(_fetch_feed(feed_conf["url"], feed_conf["tag"]))

    # Sort by date descending; items without dates land at the bottom
    all_items.sort(key=lambda i: i["date"] or "", reverse=True)
    all_items = all_items[:ITEM_LIMIT]

    _write_cache(all_items)
    print(f"Wrote {len(all_items)} items to {CACHE_FILE}")
# Entry point: run the fetch only when this file is executed as a script.
if __name__ == "__main__":
    main()