do-me/gemini_summary.py

## gemini_summary.py
import google.generativeai as genai
import pandas as pd

# Load the gzip-compressed JSON dataset straight from the GitHub repository.
df = pd.read_json("https://github.com/do-me/copernicus-services-semantic-search/raw/refs/heads/main/copernicus_services_embeddings.json.gz")

# ignoring the cleaning of the dataset for brevity

# Prefer a key supplied through the GOOGLE_API_KEY environment variable so the
# real secret never has to be written into this file. The placeholder stays as
# the fallback, so the script behaves as before when the variable is unset.
import os

GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "YOUR_KEY")
genai.configure(api_key=GOOGLE_API_KEY)
model = genai.GenerativeModel("gemini-1.5-flash")

# Instruction header sent to the model. The text to summarize is concatenated
# directly after the trailing "TEXT TO SUMMARIZE:" line.
# NOTE: the literal must open with exactly three quotes; a fourth quote would
# become the first character of the prompt itself.
SUMMARY_PROMPT = """
PROMPT:
As a professional summarizer, create a concise and comprehensive summary of the provided text while adhering to these fundamental guidelines:
- Craft a summary that is detailed, thorough, in-depth, and complex, while maintaining clarity and conciseness.
- Incorporate main ideas and essential information and focusing on critical aspects.
- Rely strictly on the provided text, without including external information.
- Format the summary in paragraph form for easy understanding.
By following this optimized prompt, you will generate an effective summary that encapsulates the essence of the given text in a clear, concise, and reader-friendly manner.
Think carefully and focus solely on summarizing the text.
Do not give any proposals for improvements or suggestions. Do not judge. Do not return code. Just return the summarized text!
The summary should be short, like around 4-5 sentences maximum but still contain the most important information. If there is not enough text to summarize just return the original text.
Please respond with plain text and avoid using Markdown formatting.

TEXT TO SUMMARIZE:
"""

import time
import pandas as pd
from tqdm import tqdm

def process_content(content):
    """Return the model's text reply for ``content`` prefixed with ``SUMMARY_PROMPT``."""
    full_prompt = SUMMARY_PROMPT + content
    return model.generate_content(full_prompt).text

# Retry logic when rate limit error occurs
def process_row_with_retry(content, wait_seconds=61, max_retries=None):
    """Summarize ``content``, pausing and retrying whenever the request fails.

    The request goes through ``process_content`` so that ``SUMMARY_PROMPT`` is
    prepended; sending the bare text would give the model no instruction to
    summarize it.

    Args:
        content: Text to summarize.
        wait_seconds: Pause between attempts, in seconds.
        max_retries: Maximum number of retries after the first attempt.
            ``None`` (the default) retries indefinitely.

    Returns:
        The summary text produced by ``process_content``.

    Raises:
        Exception: Whatever the last attempt raised, once ``max_retries`` is
            exhausted. Never raised with the default ``max_retries=None``.
    """
    retries = 0
    while True:
        try:
            return process_content(content)
        except Exception as e:
            # Every failure is treated as a rate-limit hit. max_retries lets a
            # caller bound the loop so a permanent error cannot spin forever.
            if max_retries is not None and retries >= max_retries:
                raise
            retries += 1
            print(f"Rate limit error: {e}. Retrying in {wait_seconds} seconds...")
            time.sleep(wait_seconds)

# Iterate over the DataFrame, applying the function with retry logic
for idx, row in tqdm(df.iterrows(), total=len(df)):
    # Rows whose `length` value is above 20 are summarized (with retry);
    # all other rows get an empty summary.
    needs_summary = row.length > 20
    df.at[idx, 'summary'] = process_row_with_retry(row['Content']) if needs_summary else ""

# Runtime: roughly 67 minutes for https://github.com/do-me/copernicus-services-semantic-search
	import google.generativeai as genai
	import pandas as pd

	# Load the gzip-compressed JSON dataset straight from the GitHub repository.
	df = pd.read_json("https://github.com/do-me/copernicus-services-semantic-search/raw/refs/heads/main/copernicus_services_embeddings.json.gz")

	# ignoring the cleaning of the dataset for brevity

	# Prefer a key supplied through the GOOGLE_API_KEY environment variable so the
	# real secret never has to be written into this file. The placeholder stays as
	# the fallback, so the script behaves as before when the variable is unset.
	import os

	GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "YOUR_KEY")
	genai.configure(api_key=GOOGLE_API_KEY)
	model = genai.GenerativeModel("gemini-1.5-flash")

	# Instruction header sent to the model. The text to summarize is concatenated
	# directly after the trailing "TEXT TO SUMMARIZE:" line.
	# Built from adjacent string literals so that source indentation never leaks
	# into the prompt, and so no stray quote character opens the text.
	SUMMARY_PROMPT = (
		"\n"
		"PROMPT:\n"
		"As a professional summarizer, create a concise and comprehensive summary of the provided text while adhering to these fundamental guidelines:\n"
		"- Craft a summary that is detailed, thorough, in-depth, and complex, while maintaining clarity and conciseness.\n"
		"- Incorporate main ideas and essential information and focusing on critical aspects.\n"
		"- Rely strictly on the provided text, without including external information.\n"
		"- Format the summary in paragraph form for easy understanding.\n"
		"By following this optimized prompt, you will generate an effective summary that encapsulates the essence of the given text in a clear, concise, and reader-friendly manner.\n"
		"Think carefully and focus solely on summarizing the text.\n"
		"Do not give any proposals for improvements or suggestions. Do not judge. Do not return code. Just return the summarized text!\n"
		"The summary should be short, like around 4-5 sentences maximum but still contain the most important information. If there is not enough text to summarize just return the original text.\n"
		"Please respond with plain text and avoid using Markdown formatting.\n"
		"\n"
		"TEXT TO SUMMARIZE:\n"
	)

	import time
	import pandas as pd
	from tqdm import tqdm

	def process_content(content):
		"""Return the model's text reply for ``content`` prefixed with ``SUMMARY_PROMPT``."""
		response = model.generate_content(SUMMARY_PROMPT + content)
		return response.text

	# Retry logic when rate limit error occurs
	def process_row_with_retry(content, wait_seconds=61, max_retries=None):
		"""Summarize ``content``, pausing and retrying whenever the request fails.

		The request goes through ``process_content`` so that ``SUMMARY_PROMPT`` is
		prepended; sending the bare text would give the model no instruction to
		summarize it.

		Args:
			content: Text to summarize.
			wait_seconds: Pause between attempts, in seconds.
			max_retries: Maximum number of retries after the first attempt.
				``None`` (the default) retries indefinitely.

		Returns:
			The summary text produced by ``process_content``.

		Raises:
			Exception: Whatever the last attempt raised, once ``max_retries`` is
				exhausted. Never raised with the default ``max_retries=None``.
		"""
		retries = 0
		while True:
			try:
				return process_content(content)
			except Exception as e:
				# Every failure is treated as a rate-limit hit. max_retries lets a
				# caller bound the loop so a permanent error cannot spin forever.
				if max_retries is not None and retries >= max_retries:
					raise
				retries += 1
				print(f"Rate limit error: {e}. Retrying in {wait_seconds} seconds...")
				time.sleep(wait_seconds)

	# Iterate over the DataFrame, applying the function with retry logic
	for idx, row in tqdm(df.iterrows(), total=len(df)):
		# Rows whose `length` value is above 20 are summarized (with retry);
		# all other rows get an empty summary.
		if row.length > 20:
			df.at[idx, 'summary'] = process_row_with_retry(row['Content'])
		else:
			df.at[idx, 'summary'] = ""

	# Runtime: roughly 67 minutes for https://github.com/do-me/copernicus-services-semantic-search