# emsi/googleNIP.py

## googleNIP.py
#!/usr/bin/env python

import re
import urllib
import urllib2
from collections import Counter

from bs4 import BeautifulSoup

def most_common(lst):
    """Return the element that occurs most often in ``lst``.

    Occurrences are tallied in one pass instead of calling ``lst.count``
    once per distinct element.  When several elements share the highest
    count, the one that appears first in ``lst`` is returned, so the result
    does not depend on set iteration order.

    Args:
        lst: Iterable of hashable items.

    Returns:
        The most frequent item.

    Raises:
        ValueError: If ``lst`` is empty.
    """
    items = list(lst)
    if not items:
        raise ValueError("most_common() arg is an empty sequence")
    counts = Counter(items)
    # max() keeps the first maximal element it meets, so scanning the items
    # in their original order resolves ties to the earliest one.
    return max(items, key=counts.__getitem__)

def googleNIP(nazwa_firmy):
    """Collect the NIP numbers Google shows next to a company name.

    Queries google.pl for ``NIP "<nazwa_firmy>"`` (50 results requested),
    scans the raw response for the text ``NIP`` followed by a run of digits
    and dashes, and strips the dashes from every match.

    Args:
        nazwa_firmy: Company name to search for.  Non-``str`` (unicode)
            names are UTF-8 encoded before being URL-quoted.

    Returns:
        list of str: One 10-digit string per match, in page order.
        Duplicates are kept so that callers can count occurrences.  Matches
        that do not reduce to exactly ten digits are dropped, because a NIP
        has ten digits.

    Raises:
        urllib2.URLError: If the search request fails.
    """
    # Python 2's quote_plus() raises KeyError on unicode text that contains
    # non-ASCII characters, so hand it UTF-8 bytes instead.
    if not isinstance(nazwa_firmy, str):
        nazwa_firmy = nazwa_firmy.encode('utf-8')
    query = urllib.quote_plus(nazwa_firmy)
    url = "https://www.google.pl/search?num=50&q=NIP+%22" + query + "%22"

    # A browser-like User-Agent is sent with the request.
    request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    response = urllib2.urlopen(request)
    try:
        html = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()

    # "NIP", any non-digit filler, then 10-17 digits/dashes (the number,
    # possibly written with separators).
    nip_pattern = re.compile(r"NIP[^0-9]*([\d\-]{10,17})")
    # Normalise the format by removing the dash separators.
    stripped = (raw.replace('-', '') for raw in nip_pattern.findall(html))
    # The character class admits only digits and dashes, so once the dashes
    # are gone a length check is enough to reject non-NIP matches.
    return [nip for nip in stripped if len(nip) == 10]

# Example usage: print the NIP that shows up most often for "KGHM".
nip = googleNIP("KGHM")
print(most_common(nip))

## krsDataFromNIP.py
import urllib
import re
from bs4 import BeautifulSoup

def _fetch_soup(url):
    """Download ``url`` and return it parsed with BeautifulSoup's lxml parser."""
    request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    response = urllib2.urlopen(request)
    try:
        return BeautifulSoup(response, "lxml")
    finally:
        # Release the connection even if parsing fails.
        response.close()


def krsDataFromNIP(nip):
    """Scrape basic company data from www.krs.wp.pl for the given NIP.

    The NIP is looked up through the site's search page; the first result
    supplies the name and the link to the company's detail page, from which
    the board members are read.

    Args:
        nip: NIP number as a string; it is URL-quoted into the search path.

    Returns:
        dict with three keys:
            "nazwa":  text of the first ``<strong>`` inside the first
                      ``<span>`` of the first search result,
            "branza": full text of that same ``<span>``,
            "zarzad": list with the text of every ``<li>`` in the third
                      ``div.opis`` of the detail page.

    Raises:
        IndexError: If the search returns no ``div.obiekt`` result, the
            result has no link, or the detail page has fewer than three
            ``div.opis`` sections.
        urllib2.URLError: If either request fails.
    """
    data = {}  # holds the scraped company data

    # Find the company on www.krs.wp.pl.
    search_url = "http://www.krs.wp.pl/szukaj/wszystko/" + urllib.quote_plus(nip)
    results = _fetch_soup(search_url).find_all('div', class_='obiekt')
    if not results:
        raise IndexError("no company found on www.krs.wp.pl for NIP %r" % (nip,))
    company = results[0]
    detail_path = company.find_all('a')[0]['href']
    summary = company.find('span')
    data["nazwa"] = summary.find('strong').get_text()
    # NOTE(review): this is the text of the whole <span>, so it also contains
    # the name taken from the nested <strong> above - confirm this is intended.
    data["branza"] = summary.get_text()

    # Read the detailed board composition from the company's own page.
    detail_url = "http://www.krs.wp.pl" + detail_path
    sections = _fetch_soup(detail_url).find_all('div', class_='opis')
    if len(sections) < 3:
        raise IndexError(
            "expected at least 3 'opis' sections on %s, found %d"
            % (detail_url, len(sections)))
    data["zarzad"] = [person.get_text() for person in sections[2].find_all('li')]
    return data

# Example usage: fetch the registry entry for the most frequent NIP and print
# its "nazwa", "branza" and "zarzad" fields (one board member per line).
krs = krsDataFromNIP(most_common(nip))

print(krs["nazwa"])
print(krs["branza"])
print('\n'.join(krs["zarzad"]))
	#!/usr/bin/env python

	import urllib2
	import re
	from bs4 import BeautifulSoup

	def most_common(lst):
		"""Return the element that occurs most often in ``lst``.

		Occurrences are tallied in one pass instead of calling ``lst.count``
		once per distinct element.  When several elements share the highest
		count, the one that appears first in ``lst`` is returned, so the
		result does not depend on set iteration order.

		Args:
			lst: Iterable of hashable items.

		Returns:
			The most frequent item.

		Raises:
			ValueError: If ``lst`` is empty.
		"""
		items = list(lst)
		if not items:
			raise ValueError("most_common() arg is an empty sequence")
		counts = Counter(items)
		# max() keeps the first maximal element it meets, so scanning the
		# items in their original order resolves ties to the earliest one.
		return max(items, key=counts.__getitem__)

	def googleNIP(nazwa_firmy):
		"""Collect the NIP numbers Google shows next to a company name.

		Queries google.pl for ``NIP "<nazwa_firmy>"`` (50 results requested),
		scans the raw response for the text ``NIP`` followed by a run of
		digits and dashes, and strips the dashes from every match.

		Args:
			nazwa_firmy: Company name to search for.  Non-``str`` (unicode)
				names are UTF-8 encoded before being URL-quoted.

		Returns:
			list of str: One 10-digit string per match, in page order.
			Duplicates are kept so that callers can count occurrences.
			Matches that do not reduce to exactly ten digits are dropped,
			because a NIP has ten digits.

		Raises:
			urllib2.URLError: If the search request fails.
		"""
		# Python 2's quote_plus() raises KeyError on unicode text that
		# contains non-ASCII characters, so hand it UTF-8 bytes instead.
		if not isinstance(nazwa_firmy, str):
			nazwa_firmy = nazwa_firmy.encode('utf-8')
		query = urllib.quote_plus(nazwa_firmy)
		url = "https://www.google.pl/search?num=50&q=NIP+%22" + query + "%22"

		# A browser-like User-Agent is sent with the request.
		request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
		response = urllib2.urlopen(request)
		try:
			html = response.read()
		finally:
			# Release the connection even if reading fails.
			response.close()

		# "NIP", any non-digit filler, then 10-17 digits/dashes (the number,
		# possibly written with separators).
		nip_pattern = re.compile(r"NIP[^0-9]*([\d\-]{10,17})")
		# Normalise the format by removing the dash separators.
		stripped = (raw.replace('-', '') for raw in nip_pattern.findall(html))
		# The character class admits only digits and dashes, so once the
		# dashes are gone a length check is enough to reject non-NIP matches.
		return [nip for nip in stripped if len(nip) == 10]

	# Example usage: print the NIP that shows up most often for "KGHM".
	nip = googleNIP("KGHM")
	print(most_common(nip))
	import urllib
	import re
	from bs4 import BeautifulSoup

	def _fetch_soup(url):
		"""Download ``url`` and return it parsed with BeautifulSoup's lxml parser."""
		request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
		response = urllib2.urlopen(request)
		try:
			return BeautifulSoup(response, "lxml")
		finally:
			# Release the connection even if parsing fails.
			response.close()


	def krsDataFromNIP(nip):
		"""Scrape basic company data from www.krs.wp.pl for the given NIP.

		The NIP is looked up through the site's search page; the first result
		supplies the name and the link to the company's detail page, from
		which the board members are read.

		Args:
			nip: NIP number as a string; it is URL-quoted into the search path.

		Returns:
			dict with three keys:
				"nazwa":  text of the first ``<strong>`` inside the first
				          ``<span>`` of the first search result,
				"branza": full text of that same ``<span>``,
				"zarzad": list with the text of every ``<li>`` in the third
				          ``div.opis`` of the detail page.

		Raises:
			IndexError: If the search returns no ``div.obiekt`` result, the
				result has no link, or the detail page has fewer than three
				``div.opis`` sections.
			urllib2.URLError: If either request fails.
		"""
		data = {}  # holds the scraped company data

		# Find the company on www.krs.wp.pl.
		search_url = "http://www.krs.wp.pl/szukaj/wszystko/" + urllib.quote_plus(nip)
		results = _fetch_soup(search_url).find_all('div', class_='obiekt')
		if not results:
			raise IndexError("no company found on www.krs.wp.pl for NIP %r" % (nip,))
		company = results[0]
		detail_path = company.find_all('a')[0]['href']
		summary = company.find('span')
		data["nazwa"] = summary.find('strong').get_text()
		# NOTE(review): this is the text of the whole <span>, so it also
		# contains the name taken from the nested <strong> above - confirm
		# this is intended.
		data["branza"] = summary.get_text()

		# Read the detailed board composition from the company's own page.
		detail_url = "http://www.krs.wp.pl" + detail_path
		sections = _fetch_soup(detail_url).find_all('div', class_='opis')
		if len(sections) < 3:
			raise IndexError(
				"expected at least 3 'opis' sections on %s, found %d"
				% (detail_url, len(sections)))
		data["zarzad"] = [person.get_text() for person in sections[2].find_all('li')]
		return data

	# Example usage: fetch the registry entry for the most frequent NIP and
	# print its "nazwa", "branza" and "zarzad" fields (one board member per line).
	krs = krsDataFromNIP(most_common(nip))

	print(krs["nazwa"])
	print(krs["branza"])
	print('\n'.join(krs["zarzad"]))