web scraping + gpt3. given a company name, we scrape google for relevant urls and then scrape those urls for info, persisting each step in case google blocks us so we can switch IP and carry on. once we've accumulated our corpus, we feed it into openai to generate company categories and descriptions.
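For orientation, here is a rough sketch (not part of the gist) of the record shape the two scripts pass around: the scraper script fills in the scraped fields and checkpoints a list of these records to res.yaml, then openaiscript.py adds the ai_* fields. The field names come from the code below; "Example Company" and the placeholder values are made up for illustration.

# Sketch only: field names are taken from the scripts below, values are placeholders
import yaml

example_record = {
    "name": "Example Company",  # seeded from source.yaml
    # filled in by the scraper script:
    "www_url": "https://example.com",
    "about_url": "https://example.com/about",
    "docs_url": "https://example.com/docs",
    "www_content": "scraped homepage headings and paragraphs ...",
    "about_content": "scraped about-page headings and paragraphs ...",
    "google_description": "short description scraped from a Google search ...",
    # added later by openaiscript.py: ai_categories, ai_searchterms,
    # ai_headline, ai_description, ai_long_description
}

# res.yaml is simply a list of these records, re-dumped after each company
print(yaml.dump([example_record]))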
openaiscript.py:
# https://beta.openai.com/docs/libraries
import os
import csv

import openai
import yaml

# Load your API key from an environment variable or secret management service
openai.api_key = os.getenv("OPENAI_API_KEY")
cats = """ | |
The following are valid categories: | |
- Advertising | |
- Analytics | |
- AWS | |
- CRM | |
- Collaboration | |
- Communication | |
- Content | |
- Customer Success | |
- Data Lake | |
- Data Warehouse | |
- Databases | |
- Dev Tools | |
- E-Commerce | |
- ERP | |
- Event Streaming | |
- Files | |
- Finance | |
- Google Cloud | |
- Human Resources | |
- Marketing Automation | |
- Marketplace | |
- Microsoft Azure | |
- Payments | |
- Recruiting | |
- Search | |
- Security | |
- Survey | |
""" | |
def getOpenAIResponses(data):
    prompt = f"""The following is a web scrape of all content relevant to {data['name']}:
---
Their Google description says: {data['google_description']}
---
Their about page says: {data['about_content']}
---
Their website is: {data['www_url']}
---
"""
    instructions = f"""
Please reply with a comma-separated list of up to 3 valid categories for this company, chosen only from the list above.
Do not use any other categories not specifically listed above.
Examples
Name: Adjust.com
Categories: Advertising, Analytics, Marketing Automation
Name: AlloyDB for PostgreSQL
Categories: Databases, Dev Tools, Security
Name: {data['name']}
Categories: """
    response = openai.Completion.create(
        model="text-davinci-002", prompt=prompt + cats + instructions,
        temperature=0.5, max_tokens=256, top_p=1, frequency_penalty=1, presence_penalty=1)
    # print(response)
    data['ai_categories'] = response['choices'][0]['text'].strip().replace('\n', ' ')
instructions = f"""Please reply with a non-numbered, comma-separated list of five different search terms for this company. Do not use the company name. | |
Examples | |
Name: Airtable | |
Terms: custom applications, no-code apps, online spreadsheets, workflow management, business transformation | |
Name: Apify Dataset | |
Terms: data scraping, web scraping, automation, data extraction, web crawler | |
Name: {data['name']} | |
Terms:""" | |
response = openai.Completion.create(model="text-davinci-002", prompt=prompt + instructions, temperature=0, max_tokens=256, top_p=1, frequency_penalty=1, presence_penalty=1) | |
# print(instructions) | |
# print(response) | |
data['ai_searchterms'] = response['choices'][0]['text'].strip().replace('\n', ' ') | |
instructions = f"""Please reply with an exciting, dynamic and engaging marketing Headline describing {data['name']} in less than 6 words. | |
Do not use its name in this headline. | |
Examples | |
Name: AppFollow | |
Headline: Insights to Help Your Mobile App Thrive | |
Name: BigCommerce | |
Headline: The Most Trusted Commerce Solution Provider | |
Name: {data['name']} | |
Headline: """ | |
response = openai.Completion.create(model="text-davinci-002", prompt=prompt + instructions, temperature=0, max_tokens=256, top_p=1, frequency_penalty=1, presence_penalty=1) | |
# print(instructions) | |
# print(response) | |
data['ai_headline'] = response['choices'][0]['text'].strip().replace('\n', ' ') | |
# if the headline starts with the name, remove the name | |
if data['ai_headline'].startswith(data['name']): | |
data['ai_headline'] = data['ai_headline'].replace(data['name'] + ": ", '').strip() | |
instructions = f""" | |
Please reply with a detailed and technical medium-length Description for {data['name']} in less than 40 words. | |
Describe the names of its main products and what it does, writing for a technical data engineer audience. | |
Begin. | |
{data['name']} is """ | |
response = openai.Completion.create(model="text-davinci-002", prompt=prompt + instructions, temperature=0, max_tokens=256, top_p=1, frequency_penalty=1, presence_penalty=1) | |
# print(instructions) | |
# print(response) | |
data['ai_description'] = response['choices'][0]['text'].strip().replace('\n', ' ') | |
instructions = f"Please reply with a detailed and technically in-depth description listing each and every products, features, use cases, and origin story of {data['name']} specifically for the data engineering audience in under 200 words.\n\nBegin.\n\n" | |
response = openai.Completion.create(model="text-davinci-002", prompt=prompt + instructions, temperature=0, max_tokens=1000, top_p=1, frequency_penalty=1, presence_penalty=1) | |
# print(instructions) | |
# print(response) | |
data['ai_long_description'] = response['choices'][0]['text'].strip().replace('\n', ' ') | |
if __name__ == "__main__":
    # open res.yaml (produced by the scraper script below) and load it
    with open('res.yaml') as f:
        sources = yaml.load(f, Loader=yaml.FullLoader)

    for i, data in enumerate(sources[0:30]):
        print(f"Processing {i} of {len(sources)}: " + data['name'])
        getOpenAIResponses(data)
        # # delete the content fields in data
        # del data['www_content']
        # del data['about_content']
        # # dump all sources to an xlsx file
        # with open('res.xlsx', 'w') as f:
        #     w = csv.DictWriter(f, sources[0].keys())
        #     w.writeheader()
        #     w.writerows(sources)

    # save all columns in sources to csv
    with open('res.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerow(sources[0].keys())
        for data in sources[0:30]:
            writer.writerow(data.values())
The scraper script (run this first; it builds res.yaml and imports getOpenAIResponses from openaiscript.py above):
# time + random are used to wait a random amount of time between requests
import time
import random
from pprint import pprint

# load source.yaml and parse it
import yaml
with open('source.yaml') as f:
    data = yaml.load(f, Loader=yaml.FullLoader)
# print length of data
print('loaded data of size: ' + str(len(data)))

# loop through the data and google each name to find the url
import requests
from bs4 import BeautifulSoup
from openaiscript import getOpenAIResponses
# a function to get a url from google given the name of a company (plus extra search terms)
def get_url(name, extra=""):
    url = 'https://www.google.com/search?q=' + name + extra
    # crude url-encoding: replace spaces with +
    url = url.replace(" ", "+")
    # get url with google chrome user agent
    page = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'})
    soup = BeautifulSoup(page.content, 'html.parser')
    # # print separator
    # print('---')
    # print('---')
    # print('---')
    # # print the page source
    # print(soup.prettify())
    # print('---')
    # print('---')
    # print('---')
    # get every link from the page
    links = soup.find_all('a')
    # pprint(links)
    # get the first result from the google page
    result = soup.find('div', class_='yuRUbf')
    # pprint(url)
    # pprint(result)
    # get the url from the result and remove any hashtags
    link = result.find('a')['href'].split('#')[0]
    # if the link is just the query string we passed in (e.g. the www_url itself), move on to the next result
    while link == name:
        result = result.find_next('div', class_='yuRUbf')
        link = result.find('a')['href'].split('#')[0]
    return link
def siteContent(url):
    # retrieve all the words from the website except from the footer
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    words = ""
    # loop through each section of the website and for each get the headings and then the text
    for section in soup.find_all('section'):
        # get text of all headings in this section
        heading = section.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
        # convert list to string
        heading = ' '.join([str(elem.get_text()) for elem in heading])
        # get the text of the section
        text = section.find_all('p')
        # convert list to string
        text = ' '.join([str(elem.get_text()) for elem in text])
        # add the heading and text to the data
        words += heading + " " + text
    # strip newlines
    words = words.replace("\n", " ")
    return words
def extractGoogleDescription(url):
    # google "what is <url>?" and scrape the answer,
    # e.g. https://www.google.com/search?q=what+is+www.adjust.com%3F
    url = 'https://www.google.com/search?q=what+is+' + url + "%3F"
    # get url with google chrome user agent
    page = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'})
    soup = BeautifulSoup(page.content, 'html.parser')
    # get the div with data-attrid "wa:/description", which contains the description
    result = soup.find('div', attrs={'data-attrid': 'wa:/description'})
    # if there is no description, fall back to the first search result
    if result is None:
        first = soup.find('div', class_='yuRUbf')
        # get the description snippet of that first result (may also be missing)
        result = first.find('div', class_='BNeawe s3v9rd AP7Wnd') if first is not None else None
    return result.get_text() if result is not None else "No description available"
####################################
# get the data
####################################
start = 22  # note that we last got to index 21, start at 22 next
end = start + 20
# loop with index
for i, x in enumerate(data[start:end]):
    print(str(i) + ': getting data for ' + x['name'])
    time.sleep(random.randint(1, 5))
    x["www_url"] = get_url(x["name"], " company website")
    time.sleep(random.randint(1, 5))
    x["about_url"] = get_url(x["www_url"], " about us page for the company")
    time.sleep(random.randint(1, 5))
    x["docs_url"] = get_url(x["www_url"], " API docs")
    x["www_content"] = siteContent(x["www_url"])
    x["about_content"] = siteContent(x["about_url"])
    time.sleep(random.randint(1, 5))
    x["google_description"] = extractGoogleDescription(x["www_url"])
    # print separator
    # getOpenAIResponses(x)
    print('------------------------------------')
    # save the data to a file
    with open('res.yaml', 'w') as f:
        # include everything up to and including the record we just processed
        yaml.dump(data[:start + i + 1], f)
# # pretty print the array
# pprint(data[0])
# # save the data to a file
# with open('res.yaml', 'w') as f:
#     yaml.dump(data, f)
# # save data to csv
# import csv
# with open('res.csv', 'w', newline='') as csvfile:
#     # get list of fieldnames from the first item
#     fieldnames = list(data[0].keys())
#     writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
#     writer.writeheader()
#     for x in data[:30]:
#         writer.writerow(x)