diff --git a/week5/community-contributions/kachaje-andela-genai-bootcamp/week5-genai-bootcamp.ipynb b/week5/community-contributions/kachaje-andela-genai-bootcamp/week5-genai-bootcamp.ipynb
new file mode 100644
index 0000000..154f9e9
--- /dev/null
+++ b/week5/community-contributions/kachaje-andela-genai-bootcamp/week5-genai-bootcamp.ipynb
@@ -0,0 +1,218 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "b4992675",
+   "metadata": {},
+   "source": [
+    "# Personal Knowledge Worker Using RAG\n",
+    "\n",
+    "A tool for querying personal file storage using RAG, working against a locally running Ollama model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2680f9c0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! pip -q install langchain langchain-community langchain-chroma gradio sentence-transformers ollama"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "47b6caab",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# imports\n",
+    "\n",
+    "import glob\n",
+    "import os\n",
+    "\n",
+    "import gradio as gr\n",
+    "from langchain.chains import ConversationalRetrievalChain\n",
+    "from langchain.memory import ConversationBufferMemory\n",
+    "from langchain.text_splitter import CharacterTextSplitter\n",
+    "from langchain_chroma import Chroma\n",
+    "from langchain_community.document_loaders import DirectoryLoader, TextLoader\n",
+    "from langchain_community.embeddings import OllamaEmbeddings\n",
+    "from langchain_community.llms import Ollama\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d42c7523",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Using the mistral model, served locally by Ollama\n",
+    "\n",
+    "MODEL = \"mistral\"\n",
+    "db_name = \"vector_db\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a3ac38f7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def add_metadata(doc, doc_type):\n",
+    "    doc.metadata[\"doc_type\"] = doc_type\n",
+    "    return doc\n",
+    "\n",
+    "\n",
+    "# Using the reference data as the default to avoid repeating the same steps;\n",
+    "# it can be substituted with any other data dynamically later\n",
+    "def load_documents(folders=glob.glob(\"../../knowledge-base/*\")):\n",
+    "    text_loader_kwargs = {'encoding': 'utf-8'}\n",
+    "\n",
+    "    documents = []\n",
+    "    for folder in folders:\n",
+    "        doc_type = os.path.basename(folder)\n",
+    "        loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n",
+    "        folder_docs = loader.load()\n",
+    "        documents.extend([add_metadata(doc, doc_type) for doc in folder_docs])\n",
+    "\n",
+    "    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n",
+    "    chunks = text_splitter.split_documents(documents)\n",
+    "\n",
+    "    print(f\"Total number of chunks: {len(chunks)}\")\n",
+    "    print(f\"Document types found: {set(doc.metadata['doc_type'] for doc in documents)}\")\n",
+    "\n",
+    "    return documents, chunks\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "75a53301",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "embeddings = OllamaEmbeddings(model=MODEL)\n",
+    "\n",
+    "documents, chunks = load_documents()\n",
+    "\n",
+    "# Reuse the existing vectorstore if it already exists on disk\n",
+    "if os.path.exists(db_name):\n",
+    "    vectorstore = Chroma(persist_directory=db_name, embedding_function=embeddings)\n",
+    "else:\n",
+    "    # Create a new vectorstore\n",
+    "    vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)\n",
+    "print(f\"Vectorstore ready with {vectorstore._collection.count()} documents\")"
+   ]
+  },
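+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b7c01f2e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional sanity check (a minimal sketch, assuming the vectorstore built above):\n",
+    "# fetch one stored embedding and report its dimensionality, which is typically\n",
+    "# 4096 for mistral embeddings served by Ollama\n",
+    "sample = vectorstore._collection.get(limit=1, include=[\"embeddings\"])[\"embeddings\"][0]\n",
+    "print(f\"Each vector has {len(sample):,} dimensions\")"
+   ]
+  },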
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2e906f8a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "llm = Ollama(model=MODEL)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7f1ca473",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "retriever = vectorstore.as_retriever()\n",
+    "\n",
+    "memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)\n",
+    "\n",
+    "conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "34d5a20a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "question = \"Please explain what Insurellm is in a couple of sentences\"\n",
+    "result = conversation_chain.invoke({\"question\": question})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e0ad3702",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(result[\"answer\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "652d8420",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Wrapping that in a function\n",
+    "\n",
+    "def chat(question, history):\n",
+    "    result = conversation_chain.invoke({\"question\": question})\n",
+    "    return result[\"answer\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "73bd1d46",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# And in Gradio:\n",
+    "\n",
+    "view = gr.ChatInterface(chat, type=\"messages\").launch(inbrowser=True)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}