Last active
November 2, 2021 08:59
-
-
Save harshpanchal-hp/67d25b56f8761b3fa0677b89c5e0a7fb to your computer and use it in GitHub Desktop.
Python BeautifulSoup
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import lxml.etree
import requests

# Target page: an example HTML document containing <a> links.
# NOTE(review): the scraped gist showed a spam-mirror host
# ("raw.githubusercontent.com"); restored to the real GitHub raw host.
url = 'https://raw.githubusercontent.com/DataFinnovation/public-talks/master/pugs-scraping/example2.html'

# Download the page and fail loudly on HTTP errors instead of
# silently parsing an error document.
page = requests.get(url)
page.raise_for_status()

# Parse the markup. etree.HTML() uses lxml's forgiving HTML parser;
# etree.fromstring() requires well-formed XML and raises on typical
# real-world HTML.
tree = lxml.etree.HTML(page.content)

# XPath: the href attribute of every <a> whose text is exactly "link".
theXPath = '//a[text()="link"]/@href'
resultList = tree.xpath(theXPath)

# Follow the first matched link and dump its response headers.
page2 = requests.get(resultList[0])
print(page2.headers)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup as soup  # HTML data structure
from urllib.request import urlopen as uReq  # Web client

# URL to scrape: Newegg search results for "GTX" graphics cards.
page_url = "http://www.newegg.com/Product/ProductList.aspx?Submit=ENE&N=-1&IsNodeId=1&Description=GTX&bop=And&Page=1&PageSize=36&order=BESTMATCH"

# Open the connection and download the raw HTML page.
uClient = uReq(page_url)
# Parse the HTML into a soup tree we can traverse.
page_soup = soup(uClient.read(), "html.parser")
uClient.close()

# One <div class="item-container"> per product listing on the page.
containers = page_soup.find_all("div", {"class": "item-container"})

# Output file written to the local disk.
out_filename = "graphics_cards.csv"
# CSV header row. NOTE(review): the scraped gist had lost the
# backslashes ("shippingn"); restored the "\n" escapes throughout.
headers = "brand,product_name,shipping\n"

# `with` guarantees the file is closed even if a container is malformed
# and one of the lookups below raises mid-loop.
with open(out_filename, "w") as f:
    f.write(headers)

    # Loop over each product and pull out its attributes.
    for container in containers:
        # All <a> tags inside the product's first <div>; the first one
        # wraps the brand image.
        make_rating_sp = container.div.select("a")
        # Brand comes from the image's title attribute, title-cased.
        brand = make_rating_sp[0].img["title"].title()

        # The third <a> inside the first <div> carries the product name.
        product_name = container.div.select("a")[2].text

        # Shipping cost: first <li class="price-ship">, stripped of
        # surrounding whitespace and the "$" / " Shipping" decorations.
        shipping = container.find_all("li", {"class": "price-ship"})[0].text.strip().replace("$", "").replace(" Shipping", "")

        # Echo each record to the console for progress feedback.
        print("brand: " + brand + "\n")
        print("product_name: " + product_name + "\n")
        print("shipping: " + shipping + "\n")

        # Commas inside the product name would break the CSV columns,
        # so they are replaced with "|".
        f.write(brand + ", " + product_name.replace(",", "|") + ", " + shipping + "\n")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup
import requests
import csv

# Download and parse the blog's front page.
source = requests.get('http://coreyms.com').text
soup = BeautifulSoup(source, 'lxml')

# newline='' is required by the csv module so it controls line endings
# itself (avoids blank rows on Windows); `with` closes the file even if
# parsing raises part-way through.
with open('cms_scrape.csv', 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['headline', 'summary', 'video_link'])

    for article in soup.find_all('article'):
        # Headline text lives in <h2><a>...</a></h2>.
        headline = article.h2.a.text
        print(headline)

        # First paragraph of the post body is the summary.
        summary = article.find('div', class_='entry-content').p.text
        print(summary)

        # Not every article embeds a video; fall back to None.
        # find() returns None when the iframe is absent (TypeError on
        # subscripting); the splits/lookup can raise KeyError/IndexError.
        try:
            vid_src = article.find('iframe', class_='youtube-player')['src']
            # src looks like ".../embed/<id>?params" — extract <id>.
            vid_id = vid_src.split('/')[4]
            vid_id = vid_id.split('?')[0]
            yt_link = f'https://youtube.com/watch?v={vid_id}'
        except (TypeError, KeyError, IndexError):
            yt_link = None
        print(yt_link)
        print()

        csv_writer.writerow([headline, summary, yt_link])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import tempfile
from time import sleep

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

# Make a throwaway download directory and point Chrome's downloads at it.
downloadDir = tempfile.mkdtemp()
prefs = {'download.default_directory': downloadDir}
chromeOptions = webdriver.ChromeOptions()
chromeOptions.add_experimental_option("prefs", prefs)

# Launch Chrome; `finally` below guarantees the browser is torn down
# even if any step raises, so no orphan chromedriver is left behind.
driver = webdriver.Chrome(options=chromeOptions)
try:
    # driver.get() requires an absolute URL with a scheme; the original
    # bare "www.demo.com/..." makes Selenium raise InvalidArgumentException.
    url = 'https://www.demo.com/DownloadBulkData.aspx'
    driver.get(url)

    # Select the product in the listbox. By-style locators replace the
    # find_element_by_* helpers, which were removed in Selenium 4.
    productForm = driver.find_element(
        By.XPATH, "//select[contains(@name,'ctl00$MainContentHolder$ListBox1')]")
    Select(productForm).select_by_visible_text('Call Reports -- Single Period')

    # Choose the XBRL output format.
    driver.find_element(By.ID, 'XBRLRadiobutton').click()

    # Find the download button.
    button = driver.find_element(
        By.NAME, 'ctl00$MainContentHolder$TabStrip1$Download_0')

    # Snapshot the directory before clicking so new files stand out.
    startFiles = os.listdir(downloadDir)
    button.click()

    # Poll until a new file ending in .zip shows up in the directory.
    found = False
    while not found:
        print('searching...')
        sleep(1)
        for name in os.listdir(downloadDir):
            # Any new file with a .zip suffix counts as the finished download.
            if name not in startFiles and name.endswith('.zip'):
                found = name
                print('found!')

    # Pause so a human watching the demo can see the result.
    sleep(15)
finally:
    driver.quit()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!doctype html>
<html class="no-js" lang="">
<head>
	<title>Test - A Sample Website</title>
	<meta charset="utf-8">
	<link rel="stylesheet" href="css/normalize.css">
	<link rel="stylesheet" href="css/main.css">
</head>
<body>
	<h1 id='site_title'>Test Website</h1>
	<!-- hr is a void element: it takes no end tag, so the original
	     </hr> closers were invalid HTML and have been dropped. -->
	<hr>
	<div class="article">
		<h2><a href="article_1.html">Article 1 Headline</a></h2>
		<p>This is a summary of article 1</p>
	</div>
	<hr>
	<div class="article">
		<h2><a href="article_2.html">Article 2 Headline</a></h2>
		<p>This is a summary of article 2</p>
	</div>
	<hr>
	<div class='footer'>
		<p>Footer Information</p>
	</div>
	<script src="js/vendor/modernizr-3.5.0.min.js"></script>
	<script src="js/plugins.js"></script>
	<script src="js/main.js"></script>
</body>
</html>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment