apanimesh061/CrawlStreetView.py

## CrawlStreetView.py
# coding=utf-8

"""
 This script crawls the street-view web-page and collects all
 of the states, counties per state, cities in every county and
 streets in every city.

 I created a separate script that generates an apartment number
 as per three formats. See GetRandomAddress.py
"""

from bs4 import BeautifulSoup
import urllib2
import urlparse
from urlparse import urljoin
import time
from datetime import datetime
from unidecode import unidecode
import json


class Stack:
    """
    Data Structure used to performs a DFS crawl on a website.
    """
    def __init__(self):
        self.items = []

    def is_empty(self):
        return self.items == []

    def push(self, item):
        self.items.append(item)

    def pop(self):
        return self.items.pop()

    def peek(self):
        return self.items[len(self.items) - 1]

    def size(self):
        return len(self.items)

    def __str__(self):
        return str(self.items)


def remove_non_ascii(text):
    """
    Removes non-ascii characters
    :param text: input text
    :return:
    """
    return unidecode(unicode(text, encoding="utf-8"))


def normalize(text):
    """
    Normalizes non-ascii characters
    :param text: input text
    :return:
    """
    try:
        text = remove_non_ascii(text.encode("utf-8"))
    except UnicodeDecodeError:
        text = remove_non_ascii(text)
    return text


def get_neighbor_links(current_url):
    """
    Get out-links for the current_url
    :param current_url: input url
    :return:
    """
    response = None
    try:
        time.sleep(1)
        response = urllib2.urlopen(current_url)
    except Exception as exp:
        print "Failed URL: {0}".format(current_url)
        print "Message:", exp.message
        print "Taking some rest..."
        time.sleep(8.5)

        attempt = 1
        while True:
            print "Retrying URL: {0}".format(current_url)
            try:
                response = urllib2.urlopen(current_url)
                break
            except:
                if attempt == 5:
                    print "Rejecting URL: {0}".format(current_url)
                    yield None
                else:
                    attempt += 1
                    time.sleep(1)
                    continue

    web_page = response.read()
    soup = BeautifulSoup(web_page, "lxml")
    for a in soup.find_all('a', href=True):
        abs_url = urljoin(current_url, a["href"])

        # Check if you are accessing the OpenStreetMap.
        # If you are accessing the view page of the StreetView
        # this is the leaf node of the DFS traversal.
        if "view.php" in abs_url:
            abs_url = abs_url.replace('&', 'and')
            parsed = urlparse.urlparse(abs_url)
            place = urlparse.parse_qs(parsed.query)['place']
            yield {"LOCATION": place}

        goto_url_without_base = current_url.rsplit('/', 1)[0] + '/'
        if goto_url_without_base in abs_url:
            yield {"URL": abs_url}


def process_address(address):
    """
    Covert the address string from the streetview page to a dictionary.
    :param address: address string
    :return: address dict
    """
    parts = map(lambda x: x.strip(), address.split(','))

    # Parsing rules
    street = parts[0]
    country = parts[-1]
    zipcode = parts[-2]
    state = parts[-3]
    if state == "D.C.":
        state = parts[-4] + " " + state
    county = ""
    city = ""

    if (len(parts)) > 5:
        county = parts[-4]
        city = parts[-5]

    yield {
        "STREET": street,
        "CITY": city,
        "COUNTY": county,
        "STATE": state,
        "ZIP": zipcode,
        "COUNTRY": country
    }


if __name__ == '__main__':
    out_file = None
    try:
        out_file = open("address.dat", "w")
        startTime = datetime.now()
        seed = "http://www.geographic.org/streetview/usa/index.html"
        stack = Stack()
        stack.push(seed)
        state_being_covered = None
        valid_leaves = 0
        visited = set()
        while not stack.is_empty():
            current_url = stack.pop()
            if current_url not in visited:
                visited.add(current_url)
                for data in get_neighbor_links(current_url=current_url):
                    if data:
                        url = data.get("URL", None)
                        if url:
                            stack.push(url)
                        else:
                            final_address = data.get("LOCATION", None)
                            if final_address:
                                address_json = next(process_address(final_address[0]))
                                valid_leaves += 1
                                if address_json["STATE"] != state_being_covered:
                                    if state_being_covered:
                                        print "Crawled", valid_leaves, "links..."
                                        print "\n"
                                    state_being_covered = address_json["STATE"]
                                    print "Started with", state_being_covered
                                    valid_leaves = 0
                                json.dump(address_json, out_file)
                                out_file.write("\n")
                                break
        totalTime = datetime.now() - startTime
        print "Total Time Taken:", totalTime, "Units"

    except KeyboardInterrupt as kbi:
        print "Crawl interrupted!"
        print "Info stored in out_file.dat"

    finally:
        out_file.close()

## GetRandomAddress.py
import random
import json


def random_address(obj):
    """
    Returns a random address from the file which stores all possible street address
    collected from the crawling http://www.geographic.org/streetview/usa
    :param obj: file object that stores addresses
    :return: a random line from the file as a dictionary
            {u'CITY': u'Harrietta', 'APARTMENT': '661-9178', u'ZIP': u'49638', u'COUNTRY': u'United States',
            u'COUNTY': u'Wexford', u'STATE': u'Michigan', u'STREET': u'N 1 1/4 Road'}
    """

    def random_with_n_digits(n):
        range_start = 10 ** (n - 1)
        range_end = (10 ** n) - 1
        return random.randint(range_start, range_end)

    def generate_apartment_number():
        """
        Generates three formats of apartment numbers:
            1. {0-90-90-9}-{0-90-90-90-9}
            2. {0-90-9A-Z}
            3. {#0-9|0-90-9|0-90-90-9}
        :return: an string representing an apt. number
        """
        apt_type_flag = random.randint(1, 3)
        if apt_type_flag == 1:
            return str(random_with_n_digits(3)) + '-' + str(random_with_n_digits(4))
        elif apt_type_flag == 2:
            return str(random_with_n_digits(2)) + chr(random.randint(65, 90))
        elif apt_type_flag == 3:
            return "#" + str(random_with_n_digits(random.randint(1, 3)))

    line = next(obj)
    for num, aline in enumerate(obj):
        if random.randrange(num + 2):
            continue
        line = aline

    address_json = json.loads(line)
    address_json["APARTMENT"] = generate_apartment_number()
    return address_json


if __name__ == '__main__':
    # Download the addresses.dat from https://www.dropbox.com/s/gl9xcq7f0lsv72d/addresses.dat?dl=0
    # Or you could create one yourself using CrawlStreetView.py
    file_obj = open("addresses.dat", "rb")
    print random_address(obj=file_obj)
    file_obj.close()
	# coding=utf-8

	"""
	This script crawls the street-view web-page and collects all
	of the states, counties per state, cities in every county and
	streets in every city.

	I created a separate script that generates an apartment number
	as per three formats. See GetRandomAddress.py
	"""

	from bs4 import BeautifulSoup
	import urllib2
	import urlparse
	from urlparse import urljoin
	import time
	from datetime import datetime
	from unidecode import unidecode
	import json


	class Stack:
	"""
	Data Structure used to performs a DFS crawl on a website.
	"""
	def __init__(self):
	self.items = []

	def is_empty(self):
	return self.items == []

	def push(self, item):
	self.items.append(item)

	def pop(self):
	return self.items.pop()

	def peek(self):
	return self.items[len(self.items) - 1]

	def size(self):
	return len(self.items)

	def __str__(self):
	return str(self.items)


	def remove_non_ascii(text):
	"""
	Removes non-ascii characters
	:param text: input text
	:return:
	"""
	return unidecode(unicode(text, encoding="utf-8"))


	def normalize(text):
	"""
	Normalizes non-ascii characters
	:param text: input text
	:return:
	"""
	try:
	text = remove_non_ascii(text.encode("utf-8"))
	except UnicodeDecodeError:
	text = remove_non_ascii(text)
	return text


	def get_neighbor_links(current_url):
	"""
	Get out-links for the current_url
	:param current_url: input url
	:return:
	"""
	response = None
	try:
	time.sleep(1)
	response = urllib2.urlopen(current_url)
	except Exception as exp:
	print "Failed URL: {0}".format(current_url)
	print "Message:", exp.message
	print "Taking some rest..."
	time.sleep(8.5)

	attempt = 1
	while True:
	print "Retrying URL: {0}".format(current_url)
	try:
	response = urllib2.urlopen(current_url)
	break
	except:
	if attempt == 5:
	print "Rejecting URL: {0}".format(current_url)
	yield None
	else:
	attempt += 1
	time.sleep(1)
	continue

	web_page = response.read()
	soup = BeautifulSoup(web_page, "lxml")
	for a in soup.find_all('a', href=True):
	abs_url = urljoin(current_url, a["href"])

	# Check if you are accessing the OpenStreetMap.
	# If you are accessing the view page of the StreetView
	# this is the leaf node of the DFS traversal.
	if "view.php" in abs_url:
	abs_url = abs_url.replace('&', 'and')
	parsed = urlparse.urlparse(abs_url)
	place = urlparse.parse_qs(parsed.query)['place']
	yield {"LOCATION": place}

	goto_url_without_base = current_url.rsplit('/', 1)[0] + '/'
	if goto_url_without_base in abs_url:
	yield {"URL": abs_url}


	def process_address(address):
	"""
	Covert the address string from the streetview page to a dictionary.
	:param address: address string
	:return: address dict
	"""
	parts = map(lambda x: x.strip(), address.split(','))

	# Parsing rules
	street = parts[0]
	country = parts[-1]
	zipcode = parts[-2]
	state = parts[-3]
	if state == "D.C.":
	state = parts[-4] + " " + state
	county = ""
	city = ""

	if (len(parts)) > 5:
	county = parts[-4]
	city = parts[-5]

	yield {
	"STREET": street,
	"CITY": city,
	"COUNTY": county,
	"STATE": state,
	"ZIP": zipcode,
	"COUNTRY": country
	}


	if __name__ == '__main__':
	out_file = None
	try:
	out_file = open("address.dat", "w")
	startTime = datetime.now()
	seed = "http://www.geographic.org/streetview/usa/index.html"
	stack = Stack()
	stack.push(seed)
	state_being_covered = None
	valid_leaves = 0
	visited = set()
	while not stack.is_empty():
	current_url = stack.pop()
	if current_url not in visited:
	visited.add(current_url)
	for data in get_neighbor_links(current_url=current_url):
	if data:
	url = data.get("URL", None)
	if url:
	stack.push(url)
	else:
	final_address = data.get("LOCATION", None)
	if final_address:
	address_json = next(process_address(final_address[0]))
	valid_leaves += 1
	if address_json["STATE"] != state_being_covered:
	if state_being_covered:
	print "Crawled", valid_leaves, "links..."
	print "\n"
	state_being_covered = address_json["STATE"]
	print "Started with", state_being_covered
	valid_leaves = 0
	json.dump(address_json, out_file)
	out_file.write("\n")
	break
	totalTime = datetime.now() - startTime
	print "Total Time Taken:", totalTime, "Units"

	except KeyboardInterrupt as kbi:
	print "Crawl interrupted!"
	print "Info stored in out_file.dat"

	finally:
	out_file.close()
	import random
	import json


	def random_address(obj):
	"""
	Returns a random address from the file which stores all possible street address
	collected from the crawling http://www.geographic.org/streetview/usa
	:param obj: file object that stores addresses
	:return: a random line from the file as a dictionary
	{u'CITY': u'Harrietta', 'APARTMENT': '661-9178', u'ZIP': u'49638', u'COUNTRY': u'United States',
	u'COUNTY': u'Wexford', u'STATE': u'Michigan', u'STREET': u'N 1 1/4 Road'}
	"""

	def random_with_n_digits(n):
	range_start = 10 ** (n - 1)
	range_end = (10 ** n) - 1
	return random.randint(range_start, range_end)

	def generate_apartment_number():
	"""
	Generates three formats of apartment numbers:
	1. {0-90-90-9}-{0-90-90-90-9}
	2. {0-90-9A-Z}
	3. {#0-9\|0-90-9\|0-90-90-9}
	:return: an string representing an apt. number
	"""
	apt_type_flag = random.randint(1, 3)
	if apt_type_flag == 1:
	return str(random_with_n_digits(3)) + '-' + str(random_with_n_digits(4))
	elif apt_type_flag == 2:
	return str(random_with_n_digits(2)) + chr(random.randint(65, 90))
	elif apt_type_flag == 3:
	return "#" + str(random_with_n_digits(random.randint(1, 3)))

	line = next(obj)
	for num, aline in enumerate(obj):
	if random.randrange(num + 2):
	continue
	line = aline

	address_json = json.loads(line)
	address_json["APARTMENT"] = generate_apartment_number()
	return address_json


	if __name__ == '__main__':
	# Download the addresses.dat from https://www.dropbox.com/s/gl9xcq7f0lsv72d/addresses.dat?dl=0
	# Or you could create one yourself using CrawlStreetView.py
	file_obj = open("addresses.dat", "rb")
	print random_address(obj=file_obj)
	file_obj.close()