Dominik Weckmüller (do-me)

@do-me
do-me / download.py
Created November 22, 2024 16:00
Download all Foursquare files from the preprocessed dump on https://source.coop/fused/fsq-os-places/2024-11-19/places
import os
import requests
from tqdm import tqdm
# Base URL
base_url = "https://data.source.coop/fused/fsq-os-places/2024-11-19/places/"
# Range of files to download
start_index = 0
end_index = 80 # Exclusive, adjust as needed
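The preview stops before the download loop; a minimal sketch of the rest, hedged because the exact file names in the bucket are not shown here:
out_dir = "fsq_places"
os.makedirs(out_dir, exist_ok=True)

for i in tqdm(range(start_index, end_index)):
    # NOTE: assumed filename pattern; check the bucket listing for the real names
    filename = f"places_{i:05d}.parquet"
    url = base_url + filename
    out_path = os.path.join(out_dir, filename)
    if os.path.exists(out_path):
        continue  # skip files that were already downloaded
    resp = requests.get(url, stream=True, timeout=60)
    resp.raise_for_status()
    with open(out_path, "wb") as f:
        for chunk in resp.iter_content(chunk_size=1 << 20):
            f.write(chunk)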
@do-me
do-me / cosine_similarity.py
Last active October 22, 2024 09:13
Quick cosine similarity with numpy & query with pandas
from numpy.linalg import norm
cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b)) # from https://huggingface.co/jinaai/jina-embeddings-v2-base-en
query = "social democracy"
quer_emb = model.encode(query)
df["cos_sim"] = df["embeddings"].apply(lambda x: cos_sim(x, quer_emb))
df = df.sort_values("cos_sim", ascending=False)
##################################################################################################
# 2x faster for 350k rows
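The preview cuts off at the divider; a vectorized sketch of the faster variant, assuming df["embeddings"] holds equal-length numpy arrays, so all similarities come from a single matrix product:
import numpy as np

emb_matrix = np.vstack(df["embeddings"].to_numpy())   # shape: (n_rows, embedding_dim)
df["cos_sim"] = emb_matrix @ quer_emb / (norm(emb_matrix, axis=1) * norm(quer_emb))
df = df.sort_values("cos_sim", ascending=False)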
@do-me
do-me / scatter_animation.py
Created October 19, 2024 18:03
Scatterplot animation with matplotlib in Jupyter
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.animation as animation
from matplotlib.animation import PillowWriter
import numpy as np
from IPython.display import HTML
# Example data for two states
state1_x = np.random.rand(10) # x-coordinates for state 1
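The preview ends after the first array; an assumed continuation that interpolates each point between two random states and renders the animation inline in Jupyter:
state1_y = np.random.rand(10)  # y-coordinates for state 1
state2_x = np.random.rand(10)  # x-coordinates for state 2
state2_y = np.random.rand(10)  # y-coordinates for state 2

n_frames = 50
fig, ax = plt.subplots()
scat = ax.scatter(state1_x, state1_y)
ax.set(xlim=(0, 1), ylim=(0, 1))

def update(frame):
    # linear interpolation between state 1 and state 2 for every point
    t = frame / (n_frames - 1)
    x = (1 - t) * state1_x + t * state2_x
    y = (1 - t) * state1_y + t * state2_y
    scat.set_offsets(np.column_stack([x, y]))
    return scat,

ani = animation.FuncAnimation(fig, update, frames=n_frames, interval=50, blit=True)
HTML(ani.to_jshtml())                                  # inline playback in the notebook
# ani.save("scatter.gif", writer=PillowWriter(fps=20)) # or export as a GIF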
Country
Afghanistan
Albania
Algeria
Andorra
Angola
Antigua and Barbuda
Argentina
Armenia
Australia
@do-me
do-me / extract_markdown_table.py
Created October 13, 2024 12:12
Extract markdown table from arbitrary markdown text based on regex
import pandas as pd
import re
def extract_markdown_table(text):
"""
Extracts a markdown table from a string, removing other markdown elements.
Args:
text: The input string containing markdown.
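    Returns:
        The markdown table as a string, or None if no table is found (assumed).
    """
    # NOTE: the gist preview ends above; what follows is an assumed completion
    # that matches a contiguous block of pipe-delimited lines with a regex
    table_pattern = r"(?m)^\|.*\|\s*$(?:\r?\n^\|.*\|\s*$)+"
    match = re.search(table_pattern, text)
    return match.group(0).strip() if match else None

# Example usage (assumed): parse the extracted table into a pandas DataFrame
table_md = extract_markdown_table("Intro text\n\n| a | b |\n|---|---|\n| 1 | 2 |\n")
if table_md:
    lines = [l.strip("| ") for l in table_md.splitlines()]
    header = [c.strip() for c in lines[0].split("|")]
    rows = [[c.strip() for c in line.split("|")] for line in lines[2:]]
    df = pd.DataFrame(rows, columns=header)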
@do-me
do-me / wiki.sh
Last active October 10, 2024 11:03
Personal wiki with bash & zsh; searches a directory of .txt files and includes a command for creating a new note
wiki() {
    # Combine arguments into a single string for multi-word search
    search_string="$*"
    # Perform case-insensitive grep search with the combined string
    grep -Hni --color=always "$search_string" /Users/dome/work/wikifiles/*.txt | awk -F':' '
    BEGIN {
        prevfile=""
    }
    {
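        # NOTE: the gist preview ends above; a minimal, assumed completion follows:
        # print each filename once as a header, then its matching lines beneath it
        if ($1 != prevfile) {
            print ""
            print $1
            prevfile = $1
        }
        print "  " substr($0, length($1) + 2)
    }'
}

# "new note" command mentioned in the description; name and file naming are assumptions
note() {
    echo "$*" >> /Users/dome/work/wikifiles/"$(date +%Y-%m-%d)".txt
}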
@do-me
do-me / gemini_summary.py
Created October 9, 2024 07:52
Gemini 1.5 Flash summary logic with retry (free plan)
import google.generativeai as genai
import pandas as pd
df = pd.read_json("https://github.com/do-me/copernicus-services-semantic-search/raw/refs/heads/main/copernicus_services_embeddings.json.gz")
# ignoring the cleaning of the dataset for brevity
GOOGLE_API_KEY= "YOUR_KEY"
genai.configure(api_key=GOOGLE_API_KEY)
model = genai.GenerativeModel("gemini-1.5-flash")
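The preview ends before the summarization and retry logic; a hedged sketch under assumed prompt wording, wait time, and column names:
import time

def summarize(text, retries=5, wait_seconds=60):
    # retry loop for free-plan rate limits; prompt wording is an assumption
    prompt = f"Summarize the following text in a few sentences:\n\n{text}"
    for attempt in range(retries):
        try:
            return model.generate_content(prompt).text
        except Exception as e:
            print(f"Attempt {attempt + 1} failed ({e}); waiting {wait_seconds}s")
            time.sleep(wait_seconds)
    return None

# column names are assumptions about the Copernicus services dataframe
df["summary"] = df["text"].apply(summarize)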
@do-me
do-me / eo_pubsy.py
Created October 8, 2024 14:24
Earth Observation Embedding from selected JRC publications (BAAI/bge-base-en-v1.5)
Earth_Observation_Pubsy = [0.0023605350870639086,-0.03585183620452881,-0.0018838586984202266,-0.0066082351841032505,0.03577606752514839,0.007964790798723698,0.023150762543082237,0.03316942974925041,-0.038895998150110245,-0.04117076098918915,-0.03140062466263771,-0.017644666135311127,-0.05881122127175331,0.01922798343002796,-0.001551413326524198,0.04579007625579834,0.02461058646440506,0.006413688883185387,0.003569109132513404,0.029188191518187523,-0.008217660710215569,-0.009149713441729546,0.015580502338707447,0.02944401651620865,0.009927663952112198,-0.02080441080033779,0.0313025526702404,0.035126153379678726,-0.03328511863946915,0.0006073070107959211,0.025256695225834846,-0.0033638938330113888,-0.021389279514551163,-0.0021468251943588257,0.009579457342624664,0.012051025405526161,-0.0401134267449379,-0.010880139656364918,-0.038161613047122955,-0.015132302418351173,-0.026435792446136475,-0.002597113372758031,-0.021558517590165138,-0.00289620878174901,-0.023958338424563408,0.015574358403682709,-0.05900900810956
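The vector above is cut off by the preview. For context, a hedged sketch of how such an embedding could be produced and compared against a query with sentence-transformers (the model name comes from the description; the query text is only an example):
import numpy as np
from numpy.linalg import norm
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("BAAI/bge-base-en-v1.5")
query_emb = model.encode("earth observation satellite imagery")  # example query, assumed
topic_emb = np.array(Earth_Observation_Pubsy)                    # full vector, not the truncated preview
print(query_emb @ topic_emb / (norm(query_emb) * norm(topic_emb)))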
@do-me
do-me / array_to_string.py
Created October 1, 2024 18:49
Convert arrays to strings with fixed decimal precision, multiprocessed with pandarallel on a pandas df
import json
import numpy as np
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)
# Function to round array to 2 decimal places and serialize to JSON
def round_and_serialize(x):
    if isinstance(x, np.ndarray):
        # Round and format each number to 2 decimal places
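        # NOTE: the gist preview ends above; an assumed completion follows
        return json.dumps([float(f"{v:.2f}") for v in x])
    return x

# assuming df is a pandas DataFrame with an "embeddings" column of numpy arrays;
# parallel_apply comes from pandarallel and runs across all cores
df["embeddings_str"] = df["embeddings"].parallel_apply(round_and_serialize)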
@do-me
do-me / unique_embeddings.py
Created September 27, 2024 07:15
Create embeddings for pandas df for unique texts only, saving resources
from FlagEmbedding import BGEM3FlagModel
model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)
# assuming gdf is a (geo)pandas dataframe with texts to embed
# Step 1: Get the list of texts to encode
gdf_list = gdf["texts"].to_list()
# Step 2: Deduplicate the list of texts and keep track of the original indices
unique_texts = list(set(gdf_list))
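The preview stops after deduplication; a sketch of the remaining steps, assuming encode() returns the dense vectors under the "dense_vecs" key (FlagEmbedding's default output):
# Step 3: Encode only the unique texts
unique_embeddings = model.encode(unique_texts)["dense_vecs"]

# Step 4: Map each unique text to its embedding, then expand back to every row
text_to_emb = dict(zip(unique_texts, unique_embeddings))
gdf["embeddings"] = [text_to_emb[t] for t in gdf_list]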