Just a quick way to mirror node's dist/ directory for nvm.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Wrote this very fast to quickly mirror nodejs's dist directory, which nvm
(the node version manager) uses to pull releases of node.

If nodejs.org provided rsync access (or other means), I would have
just mirrored it that way.
See https://twitter.com/nodejs/status/211311765479374848

I declare this code to be in the public domain because it's just a simple
script.

Totally could have made Scrapy do it or combined enough options with wget.
'''
import os
import posixpath

import requests
from lxml.cssselect import CSSSelector
from lxml.html import fromstring


def mirror_time(base_url, req_path):
    '''
    Simply grabs full_url = base_url + "/" + req_path.

    If full_url is text/html (a directory index), it recurses through all
    links and creates a directory for req_path.

    If full_url is anything else, it downloads it to a file.
    '''
    # Ignore these so we can just grab the meaty bits
    for bad_path in ("docs", ".."):
        if bad_path in req_path:
            return

    # posixpath.join keeps forward slashes in the URL even on Windows
    full_url = posixpath.join(base_url, req_path)

    print("Downloading {}".format(full_url))

    # Let requests do the heavy lifting; stream so big tarballs aren't
    # buffered in memory all at once
    try:
        r = requests.get(full_url, stream=True)
    except requests.RequestException as e:
        print(e)
        print("Request failed, carrying on")
        return

    if not r.ok:
        return

    # The header may carry a charset, e.g. "text/html; charset=utf-8"
    content_type = r.headers.get('content-type', '')
    if content_type.startswith('text/html'):
        # Only want directory listings
        if "<title>Index of" not in r.text:
            print("Ignoring {}, possibly not a directory".format(req_path))
            return

        # Assume directory now; recurse into every link on the index page
        html = fromstring(r.content)
        links = CSSSelector("a")
        newlinks = [e.text for e in links(html) if e.text]

        if not os.path.exists(req_path):
            os.mkdir(req_path)
        elif not os.path.isdir(req_path):
            raise Exception("{} exists and is not a directory".format(req_path))

        for newlink in newlinks:
            try:
                mirror_time(base_url, os.path.join(req_path, newlink))
            except Exception as e:
                print("Tried mirroring {}, failed".format(req_path))
                print(e)
    else:
        # Download it to a file
        with open(req_path, 'wb') as ff:
            for chunk in r.iter_content(1024):
                ff.write(chunk)


if __name__ == "__main__":
    mirror_time("http://nodejs.org", "dist")
Hi, would it be doable to update this script to support incremental downloads? So after the initial run, only new and changed files would be downloaded.
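Not the gist author, but here is one minimal sketch, assuming nodejs.org's static file server sends `Last-Modified` and honors `If-Modified-Since` (plain file servers usually do). The idea: stamp each downloaded file with the server's timestamp, send that timestamp back on the next run, and skip anything that answers 304 Not Modified. The `download_if_newer` helper is hypothetical, not part of the gist:

```python
import email.utils
import os

import requests


def download_if_newer(url, local_path):
    '''Fetch url only if the server copy is newer than local_path.

    Sends If-Modified-Since based on the local file's mtime and skips
    the download entirely on a 304 Not Modified response.
    '''
    headers = {}
    if os.path.exists(local_path):
        mtime = os.path.getmtime(local_path)
        headers['If-Modified-Since'] = email.utils.formatdate(mtime, usegmt=True)

    r = requests.get(url, headers=headers, stream=True)
    if r.status_code == 304:
        print("Unchanged, skipping {}".format(local_path))
        return
    if not r.ok:
        return

    with open(local_path, 'wb') as ff:
        for chunk in r.iter_content(1024):
            ff.write(chunk)

    # Stamp the file with the server's Last-Modified time so the
    # comparison stays honest across runs
    last_modified = r.headers.get('Last-Modified')
    if last_modified:
        ts = email.utils.mktime_tz(email.utils.parsedate_tz(last_modified))
        os.utime(local_path, (ts, ts))
```

Swapping this in for the `open`/`iter_content` block at the bottom of `mirror_time` would make re-runs cheap; the directory index pages would still be re-fetched every time, but those are tiny. If I remember right, node also publishes SHASUMS files in each release directory, which could be used to verify existing files instead of re-downloading them.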