Script to scrape financial data from NASDAQ
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from lxml import html
import requests
from time import sleep
import json
import argparse
from random import randint
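
# Note: requests and lxml are third-party packages; if they are missing,
# install them first with: pip install requests lxml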


def parse_finance_page(ticker):
    """
    Grab financial data from the NASDAQ summary page.

    Args:
        ticker (str): Stock symbol

    Returns:
        dict: Scraped data, or None if every attempt failed
    """
    key_stock_dict = {}
    # Browser-like headers so the request looks like a normal page view
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "en-GB,en;q=0.9,en-US;q=0.8,ml;q=0.7",
        "Connection": "keep-alive",
        "Host": "www.nasdaq.com",
        "Referer": "http://www.nasdaq.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36"
    }
    # Retry failed requests up to five times
    for retries in range(5):
        try:
            url = "http://www.nasdaq.com/symbol/%s" % (ticker)
            response = requests.get(url, headers=headers, verify=False)
            if response.status_code != 200:
                raise ValueError("Invalid Response Received From Webserver")
            print("Parsing %s" % (url))
            # Random delay so requests are not fired in rapid succession
            sleep(randint(1, 3))
            parser = html.fromstring(response.text)
            # XPaths tied to the NASDAQ summary-page markup at the time of writing
            xpath_head = "//div[contains(@id,'pageheader')]//h1//text()"
            xpath_key_stock_table = '//div[contains(@class,"overview-results")]//div[contains(@class,"table-table")]/div'
            xpath_open_price = '//b[contains(text(),"Open Price:")]/following-sibling::span/text()'
            xpath_open_date = '//b[contains(text(),"Open Date:")]/following-sibling::span/text()'
            xpath_close_price = '//b[contains(text(),"Close Price:")]/following-sibling::span/text()'
            xpath_close_date = '//b[contains(text(),"Close Date:")]/following-sibling::span/text()'
            xpath_key = './/div[@class="table-cell"]/b/text()'
            xpath_value = './/div[@class="table-cell"]/text()'
            raw_name = parser.xpath(xpath_head)
            key_stock_table = parser.xpath(xpath_key_stock_table)
            raw_open_price = parser.xpath(xpath_open_price)
            raw_open_date = parser.xpath(xpath_open_date)
            raw_close_price = parser.xpath(xpath_close_price)
            raw_close_date = parser.xpath(xpath_close_date)
            company_name = raw_name[0].replace("Common Stock Quote & Summary Data", "").strip() if raw_name else ''
            open_price = raw_open_price[0].strip() if raw_open_price else None
            open_date = raw_open_date[0].strip() if raw_open_date else None
            close_price = raw_close_price[0].strip() if raw_close_price else None
            close_date = raw_close_date[0].strip() if raw_close_date else None
            # Grab and clean each row of the key stock data table
            for i in key_stock_table:
                key = i.xpath(xpath_key)
                value = i.xpath(xpath_value)
                key = ''.join(key).strip()
                # Collapse runs of whitespace inside the cell value
                value = ' '.join(''.join(value).split())
                key_stock_dict[key] = value
            nasdaq_data = {
                "company_name": company_name,
                "ticker": ticker,
                "url": url,
                "open_price": open_price,
                "open_date": open_date,
                "close_price": close_price,
                "close_date": close_date,
                "key_stock_data": key_stock_dict
            }
            return nasdaq_data
        except Exception as e:
            print("Failed to process the request, Exception: %s" % (e))
    # All retries exhausted; callers should check for None
    return None


if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument('ticker', help='Company stock symbol')
    args = argparser.parse_args()
    ticker = args.ticker
    print("Fetching data for %s" % (ticker))
    scraped_data = parse_finance_page(ticker)
    print("Writing scraped data to output file")
    with open('%s-summary.json' % (ticker), 'w') as fp:
        json.dump(scraped_data, fp, indent=4, ensure_ascii=False)
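
Run from the command line with a ticker symbol, e.g. python nasdaq_scraper.py aapl, which writes aapl-summary.json (nasdaq_scraper.py is an assumed filename). Below is a minimal sketch of calling the parser from another module under the same assumption; note that NASDAQ has since redesigned its symbol pages, so the XPaths above may no longer match the live site.

# use_scraper.py -- minimal sketch; nasdaq_scraper.py is a hypothetical
# filename for the script above, saved in the same directory.
from nasdaq_scraper import parse_finance_page

data = parse_finance_page("aapl")  # returns None if all five attempts fail
if data:
    print(data["company_name"], data["close_price"])
else:
    print("Scrape failed; the page layout may have changed")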