Just a quick way to mirror node's dist/ directory for nvm.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Wrote this very fast to quickly mirror nodejs's dist directory, which nvm
(the node version manager) uses to pull releases of node.

If nodejs.org provided rsync access (or other means), I would have
just mirrored it that way.
See https://twitter.com/nodejs/status/211311765479374848

I declare this code to be in the public domain because it's just a simple
script.

Totally could have made Scrapy do it or combined enough options with wget.
'''
import os
import posixpath

import requests
from lxml.cssselect import CSSSelector
from lxml.html import fromstring


def mirror_time(base_url, req_path):
    '''
    Simply grabs full_url = base_url + "/" + req_path.

    If full_url is text/html (a directory index), it recurses through all
    links and creates a directory for req_path.

    If full_url is anything else, it downloads it to a file.
    '''
    # Ignore these so we can just grab the meaty bits
    for bad_path in ("docs", ".."):
        if bad_path in req_path:
            return

    # posixpath.join keeps forward slashes in the URL even on Windows
    full_url = posixpath.join(base_url, req_path)

    print("Downloading {}".format(full_url))

    # Let requests do the heavy lifting; stream so big tarballs aren't
    # buffered in memory all at once
    try:
        r = requests.get(full_url, stream=True)
    except requests.RequestException as e:
        print(e)
        print("Request failed, carrying on")
        return

    if not r.ok:
        return

    # The header may carry a charset, e.g. "text/html; charset=utf-8"
    content_type = r.headers.get('content-type', '')
    if content_type.startswith('text/html'):
        # Only want directory listings
        if "<title>Index of" not in r.text:
            print("Ignoring {}, possibly not a directory".format(req_path))
            return

        # Assume directory now; recurse into every link on the index page
        html = fromstring(r.content)
        links = CSSSelector("a")
        newlinks = [e.text for e in links(html) if e.text]

        if not os.path.exists(req_path):
            os.mkdir(req_path)
        elif not os.path.isdir(req_path):
            raise Exception("{} exists and is not a directory".format(req_path))

        for newlink in newlinks:
            try:
                mirror_time(base_url, os.path.join(req_path, newlink))
            except Exception as e:
                print("Tried mirroring {}, failed".format(req_path))
                print(e)
    else:
        # Download it to a file
        with open(req_path, 'wb') as ff:
            for chunk in r.iter_content(1024):
                ff.write(chunk)


if __name__ == "__main__":
    mirror_time("http://nodejs.org", "dist")
Hi, would it be doable to update this script to support incremental downloads? So after the initial run, only new and changed files would be downloaded.
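Not the gist author, but here is one minimal sketch, assuming nodejs.org's static file server sends `Last-Modified` and honors `If-Modified-Since` (plain file servers usually do). The idea: stamp each downloaded file with the server's timestamp, send that timestamp back on the next run, and skip anything that answers 304 Not Modified. The `download_if_newer` helper is hypothetical, not part of the gist:

```python
import email.utils
import os

import requests


def download_if_newer(url, local_path):
    '''Fetch url only if the server copy is newer than local_path.

    Sends If-Modified-Since based on the local file's mtime and skips
    the download entirely on a 304 Not Modified response.
    '''
    headers = {}
    if os.path.exists(local_path):
        mtime = os.path.getmtime(local_path)
        headers['If-Modified-Since'] = email.utils.formatdate(mtime, usegmt=True)

    r = requests.get(url, headers=headers, stream=True)
    if r.status_code == 304:
        print("Unchanged, skipping {}".format(local_path))
        return
    if not r.ok:
        return

    with open(local_path, 'wb') as ff:
        for chunk in r.iter_content(1024):
            ff.write(chunk)

    # Stamp the file with the server's Last-Modified time so the
    # comparison stays honest across runs
    last_modified = r.headers.get('Last-Modified')
    if last_modified:
        ts = email.utils.mktime_tz(email.utils.parsedate_tz(last_modified))
        os.utime(local_path, (ts, ts))
```

Swapping this in for the `open`/`iter_content` block at the bottom of `mirror_time` would make re-runs cheap; the directory index pages would still be re-fetched every time, but those are tiny. If I remember right, node also publishes SHASUMS files in each release directory, which could be used to verify existing files instead of re-downloading them.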