Last active
July 5, 2024 16:26
-
-
Save davidmezzetti/751328b80c6653c33063052e1b69f6da to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
##################################
# Data functions
##################################
import re

from datasets import load_dataset
def clean(text):
    """Normalize whitespace in *text*.

    Newlines become spaces, leading/trailing whitespace is removed, and any
    remaining run of two or more whitespace characters collapses to one space.
    """
    flattened = text.replace("\n", " ").strip()
    return re.sub(r"\s{2,}", " ", flattened)
def stream():
    """Yield one "title\\nabstract" string per arXiv record.

    Loads the full ``arxiv_dataset`` train split and emits each record as a
    single text, with title and abstract whitespace-normalized via clean().
    """
    dataset = load_dataset("arxiv_dataset", split="train")
    for record in dataset:
        title = clean(record["title"])
        abstract = clean(record["abstract"])
        yield f"{title}\n{abstract}"
def batch(size, items=None):
    """Group an iterable into lists of at most *size* elements.

    Args:
        size: maximum number of elements per yielded list.
        items: iterable to batch; defaults to stream() (the arXiv text
            generator) for backward compatibility with existing callers.

    Yields:
        list: consecutive chunks of *items*; the final chunk may be shorter
        than *size* (it is still yielded — no data is dropped).
    """
    if items is None:
        items = stream()
    chunk = []
    for item in items:
        chunk.append(item)
        if len(chunk) == size:
            yield chunk
            chunk = []
    # Flush the trailing partial chunk, if any.
    if chunk:
        yield chunk
##################################
# ChromaDB
##################################
import time
import uuid

from chromadb import PersistentClient
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

start = time.time()

# Create vector store - OOM issues (w/ 32 GB RAM) using in-memory client
client = PersistentClient(path="chromadb")
collection = client.create_collection(
    "default",
    embedding_function=SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2", device="cuda")
)

# Index data in chunks of 1024 documents, each with a random UUID as its id.
# BUGFIX: the loop variable must not be named "batch" — that rebinds the
# module-level name and shadows the batch() generator function, breaking any
# later call to batch() in this file.
for rows in batch(1024):
    collection.add(ids=[str(uuid.uuid4()) for _ in rows], documents=rows)

print(f"ELAPSED = {time.time() - start:.2f}s")
##################################
# ChromaDB with LangChain
##################################
import time

from chromadb import PersistentClient
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

start = time.time()

# Create embeddings (GPU-backed sentence-transformers model)
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cuda"}
)

# Create vector store
# OOM issues (w/ 32 GB RAM) using Chroma in-memory client and FAISS
index = None
# BUGFIX: the loop variable must not be named "batch" — that rebinds the
# module-level name and shadows the batch() generator function.
for rows in batch(1024):
    # BUGFIX: test "is None" rather than truthiness — a vector store object's
    # truth value is not a reliable "not yet created" signal.
    if index is None:
        index = Chroma.from_texts(rows, embeddings, client=PersistentClient(path="chromadb"))
    else:
        index.add_texts(rows)

print(f"ELAPSED = {time.time() - start:.2f}s")
##################################
# txtai
##################################
import time

from txtai import Embeddings

start = time.time()

# Create vector store. Uses SQLite + Hnswlib.
config = {
    "path": "sentence-transformers/all-MiniLM-L6-v2",
    "backend": "hnsw",    # Hnswlib ANN backend (see section comment above)
    "content": True,
    "maxlength": True,
}
embeddings = Embeddings(**config)

# Index the full arXiv text stream and report wall-clock time.
embeddings.index(stream())

print(f"ELAPSED = {time.time() - start:.2f}s")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment