Skip to content

Instantly share code, notes, and snippets.

@hsali
Created April 29, 2020 10:57
Show Gist options
  • Save hsali/9a9b340cbb0a84a8960865ee2f584296 to your computer and use it in GitHub Desktop.
Save hsali/9a9b340cbb0a84a8960865ee2f584296 to your computer and use it in GitHub Desktop.
import os
from bs4 import BeautifulSoup
import requests
import wget
import pandas as pd
from requests.compat import urljoin
import tqdm
# Article page that is fetched and scanned for links containing
# 'link.springer.com'.
url = "https://towardsdatascience.com/springer-has-released-65-machine-learning-and-data-books-for-free-961f8181f189"
# Directory the PDFs are downloaded into. The CSV summary is written next to
# this directory, named after its basename.
folder = "D:/books/SpringerDataScience"
def get_book(book_link, timeout=30):
    """Scrape one book page and return the metadata needed to download it.

    Args:
        book_link: URL of the book's landing page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        A dict with the keys ``title`` (text of the book-title heading),
        ``link`` (absolute URL of the PDF download), ``author`` (text of the
        first ``<span>`` in the persons list, ASCII-only) and ``isbn`` (text
        of the electronic-ISBN element).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        AttributeError: If one of the expected elements is missing from the
            page (``find`` returns ``None`` in that case).
    """
    book_page = requests.get(book_link, timeout=timeout)
    # Fail early on 4xx/5xx instead of trying to parse an error page.
    book_page.raise_for_status()
    book_soup = BeautifulSoup(book_page.content, 'html.parser')

    book_title = book_soup.find('div', attrs={"data-test": "book-title"}).h1.get_text()

    # The href may be relative; resolve it against the final (post-redirect)
    # URL of the page.
    book_pdf_link = book_soup.find('a', attrs={"data-track-action": "Book download - pdf"}).get('href')
    book_pdf_link = urljoin(book_page.url, book_pdf_link)

    # Non-ASCII characters are encoded as '?' and then turned into spaces so
    # the author name is safe to embed in a file name.
    author = (
        book_soup.find('div', attrs={'data-role': 'PersonsList'})
        .find('span')
        .get_text()
        .encode('ascii', 'replace')
        .decode()
        .replace('?', ' ')
    )
    isbn = book_soup.find('span', attrs={'id': 'electronic-isbn'}).get_text()

    return {'title': book_title, 'link': book_pdf_link, 'author': author, 'isbn': isbn}
# Fetch the article that lists the books and parse it.
content = requests.get(url, timeout=30)
# Stop here on an HTTP error rather than scraping an error page.
content.raise_for_status()
soup = BeautifulSoup(content.content, 'html.parser')

# Collect every link that points at link.springer.com. Anchors without an
# href make .get('href') return None, so those are filtered out first.
links = soup.find_all('a')
book_links = [
    href
    for href in (link.get('href') for link in links)
    if href and 'link.springer.com' in href
]

# Make sure the download directory exists before the first file is written
# into it.
os.makedirs(folder, exist_ok=True)

# Characters that are not allowed in Windows file names (and '/', which would
# change the path on any platform) are replaced with '-'.
invalid_filename_chars = str.maketrans({ch: "-" for ch in '<>:"/\\|?*'})

link_list = []
for book_link in tqdm.tqdm(book_links):
    book = get_book(book_link)
    # File name layout: <title>__<author>__<isbn>.pdf
    book_name = (
        book['title'].lower().replace(" ", "-")
        + "__"
        + book['author'].replace(". ", "-").replace(' ', '-')
        + "__"
        + book["isbn"]
        + ".pdf"
    ).translate(invalid_filename_chars)
    print("book detail")
    book_detail = {**book, "book_pdf": book_name, "book_link": book_link}
    print(book_detail)
    link_list.append(book_detail)
    wget.download(book['link'], os.path.join(folder, book_name))

# Write a CSV index of everything that was downloaded next to the download
# folder, named after it (e.g. D:/books/SpringerDataScience.csv).
df = pd.DataFrame(link_list)
df.to_csv(os.path.join(os.path.dirname(folder), os.path.basename(folder) + '.csv'), index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment