Skip to content

Instantly share code, notes, and snippets.

@yunruse
Last active June 14, 2024 04:20
Show Gist options
  • Save yunruse/93dd40719568dccccebcca26d9ed1ccb to your computer and use it in GitHub Desktop.
Save yunruse/93dd40719568dccccebcca26d9ed1ccb to your computer and use it in GitHub Desktop.
very basic YouTube-to-RSS extractor
#!/usr/bin/env python3
# pip install scrapetube
# for basic operation, use `python -m http.server --cgi`
# and access /cgi-bin/yt.py?c={CHANNEL_ID}
# Note that due to scrapetube limits, dates are only available in relative
# form (e.g. "3 days ago"), so the older a video is, the less precise its date.
from email.utils import format_datetime
from datetime import datetime, timedelta
import cgi
import scrapetube
def determine_date(string: str, dt: datetime) -> datetime:
    """Return best guess at date of video release.

    ``string`` is a relative age such as ``"3 days ago"``. Only the last
    three words are interpreted, so a label with a leading word
    (``"Streamed 2 weeks ago"``) is accepted as well.

    Args:
        string: Relative age text ending in ``"<number> <unit> ago"``.
        dt: Moment the age is counted back from. A falsy value (``None``)
            means "now" (``datetime.today()``).

    Returns:
        ``dt`` moved back by the stated amount. A month is counted as
        31 days and a year as 365 days, so the result is approximate.

    Raises:
        ValueError: If the text does not end in ``"<number> <unit> ago"``,
            the number is not an integer, or the unit is unknown.
    """
    UNITS = {
        'second': timedelta(seconds=1),
        'minute': timedelta(seconds=60),
        'hour': timedelta(seconds=3600),
        'day': timedelta(days=1),
        'week': timedelta(days=7),
        'month': timedelta(days=31),
        'year': timedelta(days=365),
    }

    words = string.split()
    # Explicit checks instead of ``assert``: assertions vanish under ``-O``.
    if len(words) < 3 or words[-1] != 'ago':
        raise ValueError(f"unrecognised relative date: {string!r}")

    # Parse from the end so any leading label word is ignored.
    num_text, unit = words[-3], words[-2]
    unit = unit.removesuffix('s')  # "days" -> "day"
    if unit not in UNITS:
        raise ValueError(f"unknown time unit in relative date: {string!r}")
    num = int(num_text)  # raises ValueError itself on non-numeric text

    dt = dt or datetime.today()
    return dt - (num * UNITS[unit])
# Maximum number of videos requested from scrapetube for one channel.
FETCH_LIMIT = 15
# NOTE(review): not referenced anywhere in the visible code; presumably a
# planned cutoff (in days) for old videos -- confirm or remove.
DAY_LIMIT = 100
def obtain(channel_id: str):
    """Yield ``(published, video)`` pairs for a channel's latest videos.

    Args:
        channel_id: YouTube channel ID passed to ``scrapetube.get_channel``.

    Yields:
        Tuples of the estimated publication time (a timezone-aware
        ``datetime``) and the raw video dict returned by scrapetube. At most
        ``FETCH_LIMIT`` videos are requested.
    """
    # One fixed reference point for the whole run: every "N units ago" label
    # is relative to now, not to the previously yielded video. Making it
    # timezone-aware lets ``format_datetime`` emit a real UTC offset.
    now = datetime.now().astimezone()
    for video in scrapetube.get_channel(channel_id, limit=FETCH_LIMIT):
        # NOTE(review): entries without a relative timestamp (presumably live
        # or upcoming videos -- confirm) are dated "now" instead of aborting
        # the whole feed with a KeyError.
        age_text = video.get('publishedTimeText', {}).get('simpleText')
        published = determine_date(age_text, now) if age_text else now
        yield published, video
# Skeleton of the RSS 2.0 document; ``{channel_id}`` and ``{items}`` are
# filled in with ``str.format``.
# NOTE(review): the literal starts with a newline, so the formatted text has a
# blank line ahead of the ``<?xml ...?>`` declaration unless it is stripped.
RSS = """
<?xml version="1.0" encoding="UTF-8" ?>
<rss version="2.0">
<channel>
<title>Automatically-fetched Youtube Channel</title>
<link>https://youtube.com/channel/{channel_id}</link>
{items}
</channel>
</rss>
"""
# Template for one feed entry; expects ``title``, ``video_id``,
# ``description`` and ``date`` fields. Values are inserted verbatim, so they
# must already be XML-safe.
ITEM = """
<item>
<title>{title}</title>
<link>https://youtube.com/watch?v={video_id}</link>
<description>{description}</description>
<pubDate>{date}</pubDate>
</item>"""
if __name__ == '__main__':
    # CGI entry point: reads the channel ID from the ``c`` query parameter and
    # writes an RSS 2.0 feed of the channel's latest videos to stdout.
    # Imported here so the script entry point carries its own dependencies.
    import sys
    from html import escape

    # The feed declares UTF-8, so make stdout agree regardless of the locale.
    sys.stdout.reconfigure(encoding='utf-8')

    data = cgi.FieldStorage()
    channel_id = data.getfirst('c')

    if not channel_id:
        # Without a channel there is nothing to fetch; answer with a clear
        # error instead of handing ``None`` to the scraper.
        print("Status: 400 Bad Request")
        print("Content-Type: text/plain; charset=utf-8")
        print()
        print("Missing required query parameter: c (the channel ID)")
        sys.exit(0)

    # All scraped values are untrusted text and must be escaped before being
    # placed in XML (titles and thumbnail URLs routinely contain ``&``).
    rendered_items = [
        ITEM.format(
            title=escape(v['title']['runs'][0]['text']),
            # Escaped twice on purpose: the URL as an HTML attribute value,
            # then the whole HTML snippet as XML text content.
            description=escape(
                '<img src="{}">'.format(
                    escape(v['thumbnail']['thumbnails'][-1]['url'])
                )
            ),
            video_id=escape(v['videoId']),
            date=format_datetime(dt),
        )
        for dt, v in obtain(channel_id)
    ]
    feed = RSS.format(
        channel_id=escape(channel_id),
        items='\n'.join(rendered_items),
    )

    # Exactly one blank line ends the CGI headers, and the body must begin
    # with the XML declaration, so strip the template's surrounding newlines.
    print("Content-Type: application/rss+xml; charset=utf-8")
    print()
    print(feed.strip())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment