import os

from bs4 import BeautifulSoup
import requests
import wget
import pandas as pd
from requests.compat import urljoin
import tqdm
# Blog post listing the free Springer titles, and the local folder the PDFs are saved into.
url = "https://towardsdatascience.com/springer-has-released-65-machine-learning-and-data-books-for-free-961f8181f189"
folder = "D:/books/SpringerDataScience"
def get_book(book_link):
    """Scrape a Springer book page and return its title, PDF download link, author, and ISBN."""
    book_page = requests.get(book_link)
    # base_url = book_page.request.url.split(book_page.request.path_url)[0]
    book_soup = BeautifulSoup(book_page.content, 'html.parser')
    book_title = book_soup.find('div', attrs={"data-test": "book-title"}).h1.get_text()
    book_pdf_link = book_soup.find('a', attrs={"data-track-action": "Book download - pdf"}).get('href')
    # Resolve the (possibly relative) download link against the book page URL.
    book_pdf_link = urljoin(book_page.url, book_pdf_link)
    # Author names may contain non-ASCII characters; replace them so the file name stays ASCII-safe.
    author = book_soup.find('div', attrs={'data-role': 'PersonsList'}).find('span').get_text().encode('ascii', 'replace').decode().replace('?', ' ')
    isbn = book_soup.find('span', attrs={'id': 'electronic-isbn'}).get_text()
    return {'title': book_title, 'link': book_pdf_link, 'author': author, 'isbn': isbn}
content = requests.get(url)
soup = BeautifulSoup(content.content, 'html.parser')

# METHOD 1
# p = soup.find_all("p")
# book_links = list(filter(None, [l.a for l in p]))

# METHOD 2: keep every anchor that points at link.springer.com, skipping anchors without an href.
links = soup.find_all('a')
book_links = [link.get('href') for link in links if link.get('href') and 'link.springer.com' in link.get('href')]
# Create the download folder if it does not exist yet, then fetch each book.
os.makedirs(folder, exist_ok=True)

link_list = []
for book_link in tqdm.tqdm(book_links):
    book = get_book(book_link)
    # File name pattern: <title>__<author>__<isbn>.pdf, with spaces replaced by dashes.
    book_name = book['title'].lower().replace(" ", "-") + "__" + book['author'].replace(". ", "-").replace(' ', '-') + "__" + book["isbn"] + ".pdf"
    print("book detail")
    book_detail = {**book, "book_pdf": book_name, "book_link": book_link}
    print(book_detail)
    link_list.append(book_detail)
    wget.download(book['link'], os.path.join(folder, book_name))
# Save the collected metadata as a CSV next to the download folder (e.g. D:/books/SpringerDataScience.csv).
df = pd.DataFrame(link_list)
df.to_csv(os.path.join(os.path.dirname(folder), os.path.basename(folder) + '.csv'), index=False)