This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import requests | |
from tqdm import tqdm | |
# Base URL | |
base_url = "https://data.source.coop/fused/fsq-os-places/2024-11-19/places/" | |
# Range of files to download | |
start_index = 0 | |
end_index = 80 # Exclusive, adjust as needed |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from numpy.linalg import norm | |
cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b)) # from https://huggingface.co/jinaai/jina-embeddings-v2-base-en | |
query = "social democracy" | |
quer_emb = model.encode(query) | |
df["cos_sim"] = df["embeddings"].apply(lambda x: cos_sim(x, quer_emb)) | |
df = df.sort_values("cos_sim", ascending=False) | |
################################################################################################## | |
# 2x faster for 350k rows |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt | |
import matplotlib | |
import matplotlib.animation as animation | |
from matplotlib.animation import PillowWriter | |
import numpy as np | |
from IPython.display import HTML | |
# Example data for two states | |
state1_x = np.random.rand(10) # x-coordinates for state 1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Country | |
Afghanistan | |
Albania | |
Algeria | |
Andorra | |
Angola | |
Antigua and Barbuda | |
Argentina | |
Armenia | |
Australia |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import re | |
def extract_markdown_table(text): | |
""" | |
Extracts a markdown table from a string, removing other markdown elements. | |
Args: | |
text: The input string containing markdown. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
wiki() { | |
# Combine arguments into a single string for multi-word search | |
search_string="$*" | |
# Perform case-insensitive grep search with the combined string | |
grep -Hni --color=always "$search_string" /Users/dome/work/wikifiles/*.txt | awk -F':' ' | |
BEGIN { | |
prevfile="" | |
} | |
{ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import google.generativeai as genai | |
import pandas as pd | |
df = pd.read_json("https://github.com/do-me/copernicus-services-semantic-search/raw/refs/heads/main/copernicus_services_embeddings.json.gz") | |
# ignoring the cleaning of the dataset for brevity | |
GOOGLE_API_KEY= "YOUR_KEY" | |
genai.configure(api_key=GOOGLE_API_KEY) | |
model = genai.GenerativeModel("gemini-1.5-flash") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Earth_Observation_Pubsy = [0.0023605350870639086,-0.03585183620452881,-0.0018838586984202266,-0.0066082351841032505,0.03577606752514839,0.007964790798723698,0.023150762543082237,0.03316942974925041,-0.038895998150110245,-0.04117076098918915,-0.03140062466263771,-0.017644666135311127,-0.05881122127175331,0.01922798343002796,-0.001551413326524198,0.04579007625579834,0.02461058646440506,0.006413688883185387,0.003569109132513404,0.029188191518187523,-0.008217660710215569,-0.009149713441729546,0.015580502338707447,0.02944401651620865,0.009927663952112198,-0.02080441080033779,0.0313025526702404,0.035126153379678726,-0.03328511863946915,0.0006073070107959211,0.025256695225834846,-0.0033638938330113888,-0.021389279514551163,-0.0021468251943588257,0.009579457342624664,0.012051025405526161,-0.0401134267449379,-0.010880139656364918,-0.038161613047122955,-0.015132302418351173,-0.026435792446136475,-0.002597113372758031,-0.021558517590165138,-0.00289620878174901,-0.023958338424563408,0.015574358403682709,-0.05900900810956 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import numpy as np | |
from pandarallel import pandarallel | |
pandarallel.initialize(progress_bar=True) | |
# Function to round array to 2 decimal places and serialize to JSON | |
def round_and_serialize(x): | |
if isinstance(x, np.ndarray): | |
# Round and format each number to 2 decimal places |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from FlagEmbedding import BGEM3FlagModel | |
model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True) | |
# assuming gdf is a (geo)pandas dataframe with texts to inference | |
# Step 1: Get the list of texts to encode | |
gdf_list = gdf["texts"].to_list() | |
# Step 2: Deduplicate the list of texts and keep track of the original indices | |
unique_texts = list(set(gdf_list)) |
NewerOlder