web scraping + gpt3. given a company name, we scrape google for relevant urls and then scrape those urls for info, persisting each step in case google blocks us so we can switch IP and carry on. once we've accumulated our corpus, we feed it into openai to generate company categories and descriptions.
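For orientation, here is a rough sketch (not part of the gist) of the record shape the two scripts pass around: the scraper script fills in the scraped fields and checkpoints a list of these records to res.yaml, then openaiscript.py adds the ai_* fields. The field names come from the code below; "Example Company" and the placeholder values are made up for illustration.

# Sketch only: field names are taken from the scripts below, values are placeholders
import yaml

example_record = {
    "name": "Example Company",  # seeded from source.yaml
    # filled in by the scraper script:
    "www_url": "https://example.com",
    "about_url": "https://example.com/about",
    "docs_url": "https://example.com/docs",
    "www_content": "scraped homepage headings and paragraphs ...",
    "about_content": "scraped about-page headings and paragraphs ...",
    "google_description": "short description scraped from a Google search ...",
    # added later by openaiscript.py: ai_categories, ai_searchterms,
    # ai_headline, ai_description, ai_long_description
}

# res.yaml is simply a list of these records, re-dumped after each company
print(yaml.dump([example_record]))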
openaiscript.py:
# https://beta.openai.com/docs/libraries
import os
import csv

import openai
import yaml

# Load your API key from an environment variable or secret management service
openai.api_key = os.getenv("OPENAI_API_KEY")
cats = """ | |
The following are valid categories: | |
- Advertising | |
- Analytics | |
- AWS | |
- CRM | |
- Collaboration | |
- Communication | |
- Content | |
- Customer Success | |
- Data Lake | |
- Data Warehouse | |
- Databases | |
- Dev Tools | |
- E-Commerce | |
- ERP | |
- Event Streaming | |
- Files | |
- Finance | |
- Google Cloud | |
- Human Resources | |
- Marketing Automation | |
- Marketplace | |
- Microsoft Azure | |
- Payments | |
- Recruiting | |
- Search | |
- Security | |
- Survey | |
""" | |
def getOpenAIResponses(data):
    prompt = f"""The following is a web scrape of all content relevant to {data['name']}:
---
Their Google description says: {data['google_description']}
---
Their about page says: {data['about_content']}
---
Their website is: {data['www_url']}
---
"""
    instructions = f"""
Please reply with a comma-separated list of up to 3 valid categories for this company, chosen only from the list above.
Do not use any other categories not specifically listed above.
Examples
Name: Adjust.com
Categories: Advertising, Analytics, Marketing Automation
Name: AlloyDB for PostgreSQL
Categories: Databases, Dev Tools, Security
Name: {data['name']}
Categories: """
    response = openai.Completion.create(
        model="text-davinci-002", prompt=prompt + cats + instructions,
        temperature=0.5, max_tokens=256, top_p=1, frequency_penalty=1, presence_penalty=1)
    # print(response)
    data['ai_categories'] = response['choices'][0]['text'].strip().replace('\n', ' ')
instructions = f"""Please reply with a non-numbered, comma-separated list of five different search terms for this company. Do not use the company name. | |
Examples | |
Name: Airtable | |
Terms: custom applications, no-code apps, online spreadsheets, workflow management, business transformation | |
Name: Apify Dataset | |
Terms: data scraping, web scraping, automation, data extraction, web crawler | |
Name: {data['name']} | |
Terms:""" | |
response = openai.Completion.create(model="text-davinci-002", prompt=prompt + instructions, temperature=0, max_tokens=256, top_p=1, frequency_penalty=1, presence_penalty=1) | |
# print(instructions) | |
# print(response) | |
data['ai_searchterms'] = response['choices'][0]['text'].strip().replace('\n', ' ') | |
instructions = f"""Please reply with an exciting, dynamic and engaging marketing Headline describing {data['name']} in less than 6 words. | |
Do not use its name in this headline. | |
Examples | |
Name: AppFollow | |
Headline: Insights to Help Your Mobile App Thrive | |
Name: BigCommerce | |
Headline: The Most Trusted Commerce Solution Provider | |
Name: {data['name']} | |
Headline: """ | |
response = openai.Completion.create(model="text-davinci-002", prompt=prompt + instructions, temperature=0, max_tokens=256, top_p=1, frequency_penalty=1, presence_penalty=1) | |
# print(instructions) | |
# print(response) | |
data['ai_headline'] = response['choices'][0]['text'].strip().replace('\n', ' ') | |
# if the headline starts with the name, remove the name | |
if data['ai_headline'].startswith(data['name']): | |
data['ai_headline'] = data['ai_headline'].replace(data['name'] + ": ", '').strip() | |
instructions = f""" | |
Please reply with a detailed and technical medium-length Description for {data['name']} in less than 40 words. | |
Describe the names of its main products and what it does, writing for a technical data engineer audience. | |
Begin. | |
{data['name']} is """ | |
response = openai.Completion.create(model="text-davinci-002", prompt=prompt + instructions, temperature=0, max_tokens=256, top_p=1, frequency_penalty=1, presence_penalty=1) | |
# print(instructions) | |
# print(response) | |
data['ai_description'] = response['choices'][0]['text'].strip().replace('\n', ' ') | |
instructions = f"Please reply with a detailed and technically in-depth description listing each and every products, features, use cases, and origin story of {data['name']} specifically for the data engineering audience in under 200 words.\n\nBegin.\n\n" | |
response = openai.Completion.create(model="text-davinci-002", prompt=prompt + instructions, temperature=0, max_tokens=1000, top_p=1, frequency_penalty=1, presence_penalty=1) | |
# print(instructions) | |
# print(response) | |
data['ai_long_description'] = response['choices'][0]['text'].strip().replace('\n', ' ') | |
if __name__ == "__main__":
    # open res.yaml (produced by the scraper script below) and load it
    with open('res.yaml') as f:
        sources = yaml.load(f, Loader=yaml.FullLoader)

    for i, data in enumerate(sources[0:30]):
        print(f"Processing {i} of {len(sources)}: " + data['name'])
        getOpenAIResponses(data)
        # # delete the content fields in data
        # del data['www_content']
        # del data['about_content']
        # # dump all sources to an xlsx file
        # with open('res.xlsx', 'w') as f:
        #     w = csv.DictWriter(f, sources[0].keys())
        #     w.writeheader()
        #     w.writerows(sources)

    # save all columns in sources to csv
    with open('res.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerow(sources[0].keys())
        for data in sources[0:30]:
            writer.writerow(data.values())
The scraper script (run this first; it builds res.yaml and imports getOpenAIResponses from openaiscript.py above):
# time + random are used to wait a random amount of time between requests
import time
import random
from pprint import pprint

# load source.yaml and parse it
import yaml
with open('source.yaml') as f:
    data = yaml.load(f, Loader=yaml.FullLoader)
# print length of data
print('loaded data of size: ' + str(len(data)))

# loop through the data and google each name to find the url
import requests
from bs4 import BeautifulSoup
from openaiscript import getOpenAIResponses
# a function to get a url from google given the name of a company (plus extra search terms)
def get_url(name, extra=""):
    url = 'https://www.google.com/search?q=' + name + extra
    # crude url-encoding: replace spaces with +
    url = url.replace(" ", "+")
    # get url with google chrome user agent
    page = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'})
    soup = BeautifulSoup(page.content, 'html.parser')
    # # print separator
    # print('---')
    # print('---')
    # print('---')
    # # print the page source
    # print(soup.prettify())
    # print('---')
    # print('---')
    # print('---')
    # get every link from the page
    links = soup.find_all('a')
    # pprint(links)
    # get the first result from the google page
    result = soup.find('div', class_='yuRUbf')
    # pprint(url)
    # pprint(result)
    # get the url from the result and remove any hashtags
    link = result.find('a')['href'].split('#')[0]
    # if the link is just the query string we passed in (e.g. the www_url itself), move on to the next result
    while link == name:
        result = result.find_next('div', class_='yuRUbf')
        link = result.find('a')['href'].split('#')[0]
    return link
def siteContent(url):
    # retrieve all the words from the website except from the footer
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    words = ""
    # loop through each section of the website and for each get the headings and then the text
    for section in soup.find_all('section'):
        # get text of all headings in this section
        heading = section.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
        # convert list to string
        heading = ' '.join([str(elem.get_text()) for elem in heading])
        # get the text of the section
        text = section.find_all('p')
        # convert list to string
        text = ' '.join([str(elem.get_text()) for elem in text])
        # add the heading and text to the data
        words += heading + " " + text
    # strip newlines
    words = words.replace("\n", " ")
    return words
def extractGoogleDescription(url):
    # google "what is <url>?" and scrape the answer,
    # e.g. https://www.google.com/search?q=what+is+www.adjust.com%3F
    url = 'https://www.google.com/search?q=what+is+' + url + "%3F"
    # get url with google chrome user agent
    page = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'})
    soup = BeautifulSoup(page.content, 'html.parser')
    # get the div with data-attrid "wa:/description", which contains the description
    result = soup.find('div', attrs={'data-attrid': 'wa:/description'})
    # if there is no description, fall back to the first search result
    if result is None:
        first = soup.find('div', class_='yuRUbf')
        # get the description snippet of that first result (may also be missing)
        result = first.find('div', class_='BNeawe s3v9rd AP7Wnd') if first is not None else None
    return result.get_text() if result is not None else "No description available"
####################################
# get the data
####################################
start = 22  # note that we last got to index 21, start at 22 next
end = start + 20
# loop with index
for i, x in enumerate(data[start:end]):
    print(str(i) + ': getting data for ' + x['name'])
    time.sleep(random.randint(1, 5))
    x["www_url"] = get_url(x["name"], " company website")
    time.sleep(random.randint(1, 5))
    x["about_url"] = get_url(x["www_url"], " about us page for the company")
    time.sleep(random.randint(1, 5))
    x["docs_url"] = get_url(x["www_url"], " API docs")
    x["www_content"] = siteContent(x["www_url"])
    x["about_content"] = siteContent(x["about_url"])
    time.sleep(random.randint(1, 5))
    x["google_description"] = extractGoogleDescription(x["www_url"])
    # print separator
    # getOpenAIResponses(x)
    print('------------------------------------')
    # save the data to a file
    with open('res.yaml', 'w') as f:
        # include everything up to and including the record we just processed
        yaml.dump(data[:start + i + 1], f)
# # pretty print the array
# pprint(data[0])
# # save the data to a file
# with open('res.yaml', 'w') as f:
#     yaml.dump(data, f)
# # save data to csv
# import csv
# with open('res.csv', 'w', newline='') as csvfile:
#     # get list of fieldnames from the first item
#     fieldnames = list(data[0].keys())
#     writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
#     writer.writeheader()
#     for x in data[:30]:
#         writer.writerow(x)