Created
September 30, 2020 18:38
-
-
Save ansemjo/7f3d47fa31439ab137a2df2f510c4f5d to your computer and use it in GitHub Desktop.
Parse a Humble Bundle book-bundle downloads page and output a more useful JSON file for bulk downloading.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# download multiple books from a humblebundle books page at once
# the "bulk download" button on their site is useless ...
# you *might* need to copy / export the page html from developer tools
"""Parse a Humble Bundle book downloads page (HTML on stdin) and write a
JSON list of ``{"title": ..., "downloads": [...]}`` objects to stdout."""
import json
import sys

from pyquery import PyQuery as pq


def parse_downloads(html):
    """Scrape download metadata from a Humble Bundle downloads page.

    Each ``div.row`` element on the page is one book: its title is the
    text of the ``div.title > a`` link, and its download URLs are the
    ``href`` attributes of the ``div.download a.a`` anchors.

    :param html: page HTML as a string
    :return: list of ``{"title": str, "downloads": [str, ...]}`` dicts
    """
    page = pq(html)
    return [
        {
            "title": row("div.title > a").text(),
            "downloads": [pq(a).attr("href") for a in row("div.download a.a")],
        }
        # each row element must be re-wrapped in pq() to query within it
        for row in (pq(r) for r in page("div.row"))
    ]


def main():
    """Read page HTML from stdin and dump the download list as JSON to stdout."""
    json.dump(parse_downloads(sys.stdin.read()), sys.stdout)


if __name__ == "__main__":
    main()

# now you can export a plaintext list for parsing in
# bash from this json with a little more python:
#
#   for book in downloads:
#       for link in book["downloads"]:
#           print(link, book["title"])
#
# and then download books of each filetype with a simple while-read-do-loop:
#
#   grep '\.epub' downloads.txt | while read link title; do echo "$title"; curl -# -o "$title.epub" "$link"; done
#   grep '\.pdf' downloads.txt | while read link title; do echo "$title"; curl -# -o "$title.pdf" "$link"; done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment