Last active
November 2, 2021 08:59
-
-
Save harshpanchal-hp/67d25b56f8761b3fa0677b89c5e0a7fb to your computer and use it in GitHub Desktop.
Python BeautifulSoup
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import lxml.etree
import requests

# Target page: an example HTML document containing <a> links.
# NOTE(review): the scraped gist showed a spam-mirror host
# ("raw.githubusercontent.com"); restored to the real GitHub raw host.
url = 'https://raw.githubusercontent.com/DataFinnovation/public-talks/master/pugs-scraping/example2.html'

# Download the page and fail loudly on HTTP errors instead of
# silently parsing an error document.
page = requests.get(url)
page.raise_for_status()

# Parse the markup. etree.HTML() uses lxml's forgiving HTML parser;
# etree.fromstring() requires well-formed XML and raises on typical
# real-world HTML.
tree = lxml.etree.HTML(page.content)

# XPath: the href attribute of every <a> whose text is exactly "link".
theXPath = '//a[text()="link"]/@href'
resultList = tree.xpath(theXPath)

# Follow the first matched link and dump its response headers.
page2 = requests.get(resultList[0])
print(page2.headers)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup as soup  # HTML data structure
from urllib.request import urlopen as uReq  # Web client

# URL to scrape: Newegg search results for "GTX" graphics cards.
page_url = "http://www.newegg.com/Product/ProductList.aspx?Submit=ENE&N=-1&IsNodeId=1&Description=GTX&bop=And&Page=1&PageSize=36&order=BESTMATCH"

# Open the connection and download the raw HTML page.
uClient = uReq(page_url)
# Parse the HTML into a soup tree we can traverse.
page_soup = soup(uClient.read(), "html.parser")
uClient.close()

# One <div class="item-container"> per product listing on the page.
containers = page_soup.find_all("div", {"class": "item-container"})

# Output file written to the local disk.
out_filename = "graphics_cards.csv"
# CSV header row. NOTE(review): the scraped gist had lost the
# backslashes ("shippingn"); restored the "\n" escapes throughout.
headers = "brand,product_name,shipping\n"

# `with` guarantees the file is closed even if a container is malformed
# and one of the lookups below raises mid-loop.
with open(out_filename, "w") as f:
    f.write(headers)

    # Loop over each product and pull out its attributes.
    for container in containers:
        # All <a> tags inside the product's first <div>; the first one
        # wraps the brand image.
        make_rating_sp = container.div.select("a")
        # Brand comes from the image's title attribute, title-cased.
        brand = make_rating_sp[0].img["title"].title()

        # The third <a> inside the first <div> carries the product name.
        product_name = container.div.select("a")[2].text

        # Shipping cost: first <li class="price-ship">, stripped of
        # surrounding whitespace and the "$" / " Shipping" decorations.
        shipping = container.find_all("li", {"class": "price-ship"})[0].text.strip().replace("$", "").replace(" Shipping", "")

        # Echo each record to the console for progress feedback.
        print("brand: " + brand + "\n")
        print("product_name: " + product_name + "\n")
        print("shipping: " + shipping + "\n")

        # Commas inside the product name would break the CSV columns,
        # so they are replaced with "|".
        f.write(brand + ", " + product_name.replace(",", "|") + ", " + shipping + "\n")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup
import requests
import csv

# Download and parse the blog's front page.
source = requests.get('http://coreyms.com').text
soup = BeautifulSoup(source, 'lxml')

# newline='' is required by the csv module so it controls line endings
# itself (avoids blank rows on Windows); `with` closes the file even if
# parsing raises part-way through.
with open('cms_scrape.csv', 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['headline', 'summary', 'video_link'])

    for article in soup.find_all('article'):
        # Headline text lives in <h2><a>...</a></h2>.
        headline = article.h2.a.text
        print(headline)

        # First paragraph of the post body is the summary.
        summary = article.find('div', class_='entry-content').p.text
        print(summary)

        # Not every article embeds a video; fall back to None.
        # find() returns None when the iframe is absent (TypeError on
        # subscripting); the splits/lookup can raise KeyError/IndexError.
        try:
            vid_src = article.find('iframe', class_='youtube-player')['src']
            # src looks like ".../embed/<id>?params" — extract <id>.
            vid_id = vid_src.split('/')[4]
            vid_id = vid_id.split('?')[0]
            yt_link = f'https://youtube.com/watch?v={vid_id}'
        except (TypeError, KeyError, IndexError):
            yt_link = None
        print(yt_link)
        print()

        csv_writer.writerow([headline, summary, yt_link])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import tempfile
from time import sleep

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

# Make a throwaway download directory and point Chrome's downloads at it.
downloadDir = tempfile.mkdtemp()
prefs = {'download.default_directory': downloadDir}
chromeOptions = webdriver.ChromeOptions()
chromeOptions.add_experimental_option("prefs", prefs)

# Launch Chrome; `finally` below guarantees the browser is torn down
# even if any step raises, so no orphan chromedriver is left behind.
driver = webdriver.Chrome(options=chromeOptions)
try:
    # driver.get() requires an absolute URL with a scheme; the original
    # bare "www.demo.com/..." makes Selenium raise InvalidArgumentException.
    url = 'https://www.demo.com/DownloadBulkData.aspx'
    driver.get(url)

    # Select the product in the listbox. By-style locators replace the
    # find_element_by_* helpers, which were removed in Selenium 4.
    productForm = driver.find_element(
        By.XPATH, "//select[contains(@name,'ctl00$MainContentHolder$ListBox1')]")
    Select(productForm).select_by_visible_text('Call Reports -- Single Period')

    # Choose the XBRL output format.
    driver.find_element(By.ID, 'XBRLRadiobutton').click()

    # Find the download button.
    button = driver.find_element(
        By.NAME, 'ctl00$MainContentHolder$TabStrip1$Download_0')

    # Snapshot the directory before clicking so new files stand out.
    startFiles = os.listdir(downloadDir)
    button.click()

    # Poll until a new file ending in .zip shows up in the directory.
    found = False
    while not found:
        print('searching...')
        sleep(1)
        for name in os.listdir(downloadDir):
            # Any new file with a .zip suffix counts as the finished download.
            if name not in startFiles and name.endswith('.zip'):
                found = name
                print('found!')

    # Pause so a human watching the demo can see the result.
    sleep(15)
finally:
    driver.quit()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!doctype html>
<html class="no-js" lang="">
<head>
	<title>Test - A Sample Website</title>
	<meta charset="utf-8">
	<link rel="stylesheet" href="css/normalize.css">
	<link rel="stylesheet" href="css/main.css">
</head>
<body>
	<h1 id='site_title'>Test Website</h1>
	<!-- hr is a void element: it takes no end tag, so the original
	     </hr> closers were invalid HTML and have been dropped. -->
	<hr>
	<div class="article">
		<h2><a href="article_1.html">Article 1 Headline</a></h2>
		<p>This is a summary of article 1</p>
	</div>
	<hr>
	<div class="article">
		<h2><a href="article_2.html">Article 2 Headline</a></h2>
		<p>This is a summary of article 2</p>
	</div>
	<hr>
	<div class='footer'>
		<p>Footer Information</p>
	</div>
	<script src="js/vendor/modernizr-3.5.0.min.js"></script>
	<script src="js/plugins.js"></script>
	<script src="js/main.js"></script>
</body>
</html>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment