Skip to content

Instantly share code, notes, and snippets.

@suzzukin
Created January 12, 2022 13:19
Show Gist options
  • Save suzzukin/ad3dbf9b1a45086e21f89f48e053c178 to your computer and use it in GitHub Desktop.
Save suzzukin/ad3dbf9b1a45086e21f89f48e053c178 to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
import requests
import sys
class PythonCrawler:
    """Fetch python.org's upcoming-events widget and print one
    'date title link' row per event to stdout.

    The crawl happens eagerly in ``__init__``: constructing an instance
    performs the HTTP request and prints the results.
    """

    def __init__(self, link):
        # Browser-like headers so python.org serves the regular HTML page.
        self.headers = {
            'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.3 Safari/605.1.15'
        }
        # Column order for the printed output.
        self.order = [
            'Date',
            'Article',
            'Link',
        ]
        self.get_request(link, self.headers)
        self.parse_request()

    def get_request(self, link, headers):
        """Fetch *link*, store the response on ``self.req`` and return it.

        Raises:
            requests.exceptions.Timeout: if the server does not respond
                within 10 seconds (the original had no timeout and could
                hang forever).
            requests.exceptions.HTTPError: on a 4xx/5xx response, instead
                of silently handing an error page to the parser.
        """
        self.req = requests.get(url = link, headers = headers, timeout = 10)
        self.req.raise_for_status()
        return self.req

    def parse_request(self):
        """Parse the stored response and print the upcoming events.

        Writes one space-separated 'date title link' line per event,
        in the column order defined by ``self.order``.
        """
        soup = BeautifulSoup(self.req.text, 'lxml')
        widget = soup.find(class_='medium-widget event-widget last')
        if widget is None:
            # Page layout changed (or an unexpected page was returned):
            # there is nothing to print, so bail out instead of raising
            # AttributeError on None.
            return
        upcoming_table = widget.find(class_='shrubbery')
        articles = upcoming_table.find(class_='menu').find_all('li')
        for article in articles:
            # Look up the anchor once per item (the original called
            # article.find('a') twice) and build a fresh dict per row
            # instead of mutating one shared dict across iterations.
            anchor = article.find('a')
            data = {
                'Date': article.find('time').text,
                'Article': anchor.text,
                'Link': f'''https://www.python.org{anchor.get('href')}''',
            }
            for i in self.order:
                sys.stdout.write(data[i]+' ')
            sys.stdout.write('\n')
def main():
    """Entry point: crawl python.org's event widget and print the results."""
    # Instantiation triggers the fetch-and-print; no handle is needed.
    PythonCrawler(link='https://www.python.org')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment