Last active
June 14, 2024 04:20
-
-
Save yunruse/93dd40719568dccccebcca26d9ed1ccb to your computer and use it in GitHub Desktop.
very basic YouTube-to-RSS extractor
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# pip install scrapetube | |
# for basic operation, use `python -m http.server --cgi` | |
# and access /cgi-bin/yt.py?c={CHANNEL_ID} | |
# Note: scrapetube only exposes relative dates (e.g. "3 days ago"),
# so the further in the past a video is, the less precise its date will be.
from email.utils import format_datetime | |
from datetime import datetime, timedelta | |
import cgi | |
import scrapetube | |
def determine_date(string: str, dt: "datetime | None" = None) -> datetime:
    """Return a best-guess release date for a relative timestamp.

    Args:
        string: Relative age text ending in ``"<num> <unit> ago"``, e.g.
            ``"3 days ago"``. Leading words (``"Streamed 3 days ago"``)
            are ignored; only the last three tokens are parsed.
        dt: Reference moment the age is measured from. Defaults to the
            current local time when omitted or ``None``.

    Returns:
        ``dt`` minus ``num`` units. Months and years are approximated as
        31 and 365 days respectively, so results are estimates.

    Raises:
        ValueError: If the text does not end in ``"<num> <unit> ago"``,
            the number is not an integer, or the unit is unknown.
    """
    units = {
        'second': timedelta(seconds=1),
        'minute': timedelta(seconds=60),
        'hour': timedelta(seconds=3600),
        'day': timedelta(days=1),
        'week': timedelta(days=7),
        'month': timedelta(days=31),
        'year': timedelta(days=365),
    }

    # Explicit exceptions instead of `assert`: asserts vanish under `-O`,
    # which would turn malformed input into confusing downstream errors.
    tokens = string.split()
    if len(tokens) < 3 or tokens[-1] != 'ago':
        raise ValueError(f"unrecognised relative date: {string!r}")

    # Only the trailing triple carries the offset; any prefix is ignored.
    num_text, unit_text = tokens[-3], tokens[-2]
    num = int(num_text)  # raises ValueError for non-numeric counts

    # Normalise plural forms ("days" -> "day") before the lookup.
    unit = unit_text.lower().removesuffix('s')
    if unit not in units:
        raise ValueError(f"unknown time unit in relative date: {string!r}")

    if dt is None:
        dt = datetime.today()
    return dt - (num * units[unit])
# Maximum number of videos requested from scrapetube per feed.
FETCH_LIMIT = 15
# NOTE(review): not referenced anywhere in the visible code.
DAY_LIMIT = 100


def obtain(channel_id: str):
    """Yield ``(estimated_date, video)`` pairs for a channel's videos.

    Fetches up to ``FETCH_LIMIT`` entries via ``scrapetube.get_channel``
    and converts each entry's ``publishedTimeText`` into a datetime with
    ``determine_date``.

    Args:
        channel_id: Channel identifier passed through to scrapetube.

    Yields:
        Tuples of the estimated publication datetime and the raw video
        dict returned by scrapetube.
    """
    # One fixed reference time for the whole batch. Each "N units ago" is
    # relative to now, so it must not be subtracted from the previous
    # video's estimate (that would compound the offsets).
    now = datetime.now()
    for video in scrapetube.get_channel(channel_id, limit=FETCH_LIMIT):
        published_text = video['publishedTimeText']['simpleText']
        yield determine_date(published_text, now), video
# Feed skeleton. The XML declaration must be the very first bytes of the
# document, hence the line continuation right after the opening quotes.
# Placeholders: {channel_id} and {items}; both must already be XML-safe.
RSS = """\
<?xml version="1.0" encoding="UTF-8" ?>
<rss version="2.0">
  <channel>
    <title>Automatically-fetched Youtube Channel</title>
    <link>https://youtube.com/channel/{channel_id}</link>
    <description>Recent uploads from YouTube channel {channel_id}</description>
{items}
  </channel>
</rss>
"""

# One feed entry. Every placeholder must be XML-escaped by the caller.
ITEM = """
    <item>
      <title>{title}</title>
      <link>https://youtube.com/watch?v={video_id}</link>
      <description>{description}</description>
      <pubDate>{date}</pubDate>
    </item>"""


def render_feed(channel_id, videos):
    """Render an RSS 2.0 document for a channel.

    Args:
        channel_id: Channel identifier inserted into the channel link.
        videos: Iterable of ``(datetime, video)`` pairs, where ``video``
            is a dict providing ``title.runs[0].text``,
            ``thumbnail.thumbnails[-1].url`` and ``videoId``.

    Returns:
        The complete feed as a string, starting with the XML declaration.
    """
    from xml.sax.saxutils import escape, quoteattr

    rendered = []
    for published, video in videos:
        thumbnail_url = video['thumbnail']['thumbnails'][-1]['url']
        # Build the HTML snippet first (attribute-quoted), then escape it
        # as XML text so the feed stays well-formed; readers unescape it
        # back into HTML.
        image_html = f"<img src={quoteattr(thumbnail_url)}>"

        # A naive datetime would be emitted with a "-0000" offset; treat
        # it as local time so the pubDate carries a real UTC offset.
        if published.tzinfo is None:
            published = published.astimezone()

        rendered.append(ITEM.format(
            title=escape(video['title']['runs'][0]['text']),
            description=escape(image_html),
            video_id=escape(video['videoId']),
            date=format_datetime(published),
        ))

    return RSS.format(channel_id=escape(channel_id), items='\n'.join(rendered))


if __name__ == '__main__':
    import sys

    # The document declares UTF-8, so make sure that is what gets written
    # regardless of the locale the CGI host runs under.
    sys.stdout.reconfigure(encoding='utf-8')

    data = cgi.FieldStorage()
    channel_id = data.getfirst('c')

    if not channel_id:
        print("Status: 400 Bad Request")
        # print() supplies the second newline that ends the header block.
        print("Content-Type: text/plain; charset=utf-8\n")
        print("Missing required query parameter: c")
    else:
        # Render before emitting headers so a scraping failure does not
        # leave a half-written response behind.
        feed = render_feed(channel_id, obtain(channel_id))
        print("Content-Type: application/rss+xml; charset=utf-8\n")
        print(feed)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment