Last active
October 25, 2023 08:08
-
-
Save ice09/ae5409de706fbd5e08183f7632b44434 to your computer and use it in GitHub Desktop.
Downloads three posts from Vitalik's blog and creates embeddings with langchain4j, which can then be queried with OpenAI GPT (see https://hackmd.io/@alculexum/embedding4j)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//DEPS dev.langchain4j:langchain4j:0.23.0
//DEPS dev.langchain4j:langchain4j-open-ai:0.23.0
//DEPS dev.langchain4j:langchain4j-embeddings-all-minilm-l6-v2:0.23.0
//DEPS commons-io:commons-io:2.14.0
//DEPS org.apache.commons:commons-text:1.10.0
//DEPS org.jsoup:jsoup:1.16.1
//DEPS org.slf4j:slf4j-simple:2.0.9
package dev.indus340;

import dev.langchain4j.chain.ConversationalRetrievalChain;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentType;
import dev.langchain4j.data.document.parser.TextDocumentParser;
import dev.langchain4j.data.document.splitter.DocumentSplitters;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.memory.chat.MessageWindowChatMemory;
import dev.langchain4j.model.embedding.AllMiniLmL6V2EmbeddingModel;
import dev.langchain4j.model.embedding.EmbeddingModel;
import dev.langchain4j.model.openai.OpenAiChatModel;
import dev.langchain4j.retriever.EmbeddingStoreRetriever;
import dev.langchain4j.store.embedding.EmbeddingStore;
import dev.langchain4j.store.embedding.EmbeddingStoreIngestor;
import dev.langchain4j.store.embedding.inmemory.InMemoryEmbeddingStore;

import org.apache.commons.io.IOUtils;
import org.apache.commons.text.WordUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.Scanner;
public class TalkToEmbeddedButerin { | |
private static final String[] blogPosts = new String[] { | |
// Should Ethereum be okay with enshrining more things in the protocol? | |
"https://vitalik.ca/general/2023/09/30/enshrinement.html", | |
// What do I think about Community Notes? | |
"https://vitalik.ca/general/2023/08/16/communitynotes.html", | |
// What do I think about biometric proof of personhood? | |
"https://vitalik.ca/general/2023/07/24/biometric.html" | |
}; | |
// args[0] should be OPENAI_API_KEY | |
public static void main(String[] args) throws IOException { | |
if (args.length != 1) { | |
System.out.println("OPENAI_API_KEY has to be provided as first argument."); | |
return; | |
} | |
String filesContent = downloadPostsContent(); | |
interactWithEmbeddedDocuments(args[0], filesContent); | |
} | |
private static void interactWithEmbeddedDocuments(String openaiKey, String content) { | |
EmbeddingModel embeddingModel = new AllMiniLmL6V2EmbeddingModel(); | |
EmbeddingStore<TextSegment> embeddingStore = new InMemoryEmbeddingStore<>(); | |
EmbeddingStoreIngestor ingestor = EmbeddingStoreIngestor.builder() | |
// the Devoxx sample used 500, for my texts 1000 seems better. | |
.documentSplitter(DocumentSplitters.recursive(1000, 0)) | |
.embeddingModel(embeddingModel) | |
.embeddingStore(embeddingStore) | |
.build(); | |
System.out.println("creating embeddings"); | |
Document document = new TextDocumentParser(DocumentType.TXT).parse(IOUtils.toInputStream(content)); | |
ingestor.ingest(document); | |
OpenAiChatModel assistant = | |
OpenAiChatModel.builder() | |
.modelName("gpt-3.5-turbo") | |
.apiKey(openaiKey) | |
.timeout(Duration.ofMinutes(5)).build(); | |
ConversationalRetrievalChain chain = ConversationalRetrievalChain.builder() | |
.chatLanguageModel(assistant) | |
.retriever(EmbeddingStoreRetriever.from(embeddingStore, embeddingModel)) | |
// be careful when uncommenting the chat memory window. | |
// to verify the difference between embeddings and plain GPTs | |
// the chat memory has to be deactivated as plain GPT does not | |
// use the chain but the model directly which has no chat memory. | |
//.chatMemory(MessageWindowChatMemory.withMaxMessages(20)) | |
.build(); | |
String question = ""; | |
Scanner scanner = new Scanner(System.in); | |
while (!question.equalsIgnoreCase("q")) { | |
System.out.print("\nWhat's your question: "); | |
question = scanner.nextLine(); | |
System.out.println("\n### With embeddings\n" + WordUtils.wrap(chain.execute(question), 80)); | |
System.out.println("\n### Plain GPT\n" + WordUtils.wrap(assistant.generate(question), 80)); | |
} | |
scanner.close(); | |
} | |
private static String downloadPostsContent() throws IOException { | |
StringBuilder allPostsCombined = new StringBuilder(); | |
for (String url : blogPosts) { | |
System.out.println("downloading " + url); | |
org.jsoup.nodes.Document document = Jsoup.connect(url).get(); | |
Element postBody = document.body(); | |
allPostsCombined.append(postBody.text()); | |
} | |
return allPostsCombined.toString(); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Can be run with JBang:
jbang https://gist.github.com/ice09/ae5409de706fbd5e08183f7632b44434 <OPENAI_API_KEY>