Last active
June 9, 2016 21:28
-
-
Save emsi/e17219257a3733e95c4a4c33d21ea5a6 to your computer and use it in GitHub Desktop.
Funkcje do googlowania NIPu firmy po nazwie
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import urllib2 | |
import re | |
from bs4 import BeautifulSoup | |
def most_common(lst):
    """Return the element that occurs most often in *lst*.

    Args:
        lst: Iterable of hashable items (a list, or any other iterable).

    Returns:
        The item with the highest occurrence count.

    Raises:
        ValueError: If *lst* contains no items.
    """
    # Imported locally so the function does not rely on a file-level import.
    from collections import Counter

    # One pass over the data instead of one lst.count() scan per distinct item.
    counts = Counter(lst)
    if not counts:
        raise ValueError("most_common() arg is an empty sequence")
    return max(counts, key=counts.get)
def googleNIP(nazwa_firmy):
    """Search Google for NIP numbers mentioned next to a company name.

    Builds a google.pl query of the form ``NIP "<nazwa_firmy>"`` (asking for
    50 results), downloads the result page and collects every run of 10-17
    digits/dashes that follows the literal text ``NIP``.

    Args:
        nazwa_firmy: Company name to search for.

    Returns:
        A list of the matched number strings with dashes removed, in the
        order they appear on the page.  Duplicates are kept, so the caller
        can pick the most frequent value.  Because dashes count towards the
        10-17 character match, an entry may contain fewer than 10 digits.

    Raises:
        Propagates whatever ``urllib2.urlopen`` raises when the request
        fails.
    """
    # Imported locally: the surrounding file only imports urllib2 before the
    # first call, so the bare name `urllib` was undefined here.
    import urllib
    import urllib2
    from contextlib import closing

    query = urllib.quote_plus(nazwa_firmy)
    url = "https://www.google.pl/search?num=50&q=NIP+%22" + query + "%22"
    # A browser-like User-Agent is sent with the request.
    request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # Python 2 responses are not context managers; closing() makes sure the
    # connection is released even if read() fails.
    with closing(urllib2.urlopen(request)) as response:
        html = response.read()

    nip_pattern = re.compile(r"NIP[^0-9]*([\d\-]{10,17})")
    # Normalise the format by dropping the dashes.
    return [match.replace('-', '') for match in nip_pattern.findall(html)]
# Example usage: print the NIP that the Google results mention most often
# for the company "KGHM".
nip = googleNIP("KGHM")
print(most_common(nip))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib | |
import re | |
from bs4 import BeautifulSoup | |
def krsDataFromNIP(nip):
    """Look up a company on www.krs.wp.pl by NIP and scrape its basic data.

    Two pages are fetched: the search results for *nip*, and the detail page
    linked from the first result.

    Args:
        nip: NIP number as a string; it is URL-quoted into the search path.

    Returns:
        A dict with three keys:
          * ``"nazwa"``  - text of the ``<strong>`` inside the first
            ``<span>`` of the first ``div.obiekt`` search result,
          * ``"branza"`` - full text of that same ``<span>``,
          * ``"zarzad"`` - list with the text of every ``<li>`` found in the
            third ``div.opis`` of the detail page.

    Raises:
        IndexError: If the search returns no ``div.obiekt`` result, the
            first result has no link, or the detail page has fewer than
            three ``div.opis`` sections.
        Errors raised by ``urllib2.urlopen`` are propagated unchanged.
    """
    # Imported locally so the function does not depend on which file-level
    # imports happen to precede it.
    import urllib
    import urllib2
    from contextlib import closing

    def _fetch_soup(url):
        # Download `url` with a browser-like User-Agent and parse it with lxml.
        request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        # BeautifulSoup consumes the response in its constructor, so the
        # connection can be closed as soon as parsing returns.
        with closing(urllib2.urlopen(request)) as response:
            return BeautifulSoup(response, "lxml")

    # Find the company on www.krs.wp.pl.
    search_url = "http://www.krs.wp.pl/szukaj/wszystko/" + urllib.quote_plus(nip)
    results = _fetch_soup(search_url).find_all('div', class_='obiekt')
    if not results:
        raise IndexError("no company found on www.krs.wp.pl for NIP %r" % (nip,))
    first_result = results[0]

    detail_path = first_result.find_all('a')[0]['href']
    summary = first_result.find('span')
    # Structure holding the scraped company data.
    data = {
        "nazwa": summary.find('strong').get_text(),
        "branza": summary.get_text(),
    }

    # Read the detailed composition of the management board.
    sections = _fetch_soup("http://www.krs.wp.pl" + detail_path).find_all(
        'div', class_='opis')
    # NOTE(review): the third div.opis is taken to be the board section -
    # presumably tied to the site's page layout; verify against a live page.
    if len(sections) < 3:
        raise IndexError(
            "expected at least 3 'opis' sections on the detail page, found %d"
            % len(sections))
    data["zarzad"] = [person.get_text() for person in sections[2].find_all('li')]
    return data
# Example usage: fetch the KRS details for the most frequently found NIP and
# print the company name, its branch line and the board members (one per line).
krs = krsDataFromNIP(most_common(nip))
for field in ("nazwa", "branza"):
    print(krs[field])
print('\n'.join(krs["zarzad"]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment