{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a1f0c3d2",
   "metadata": {},
   "source": [
    "# RAG Chat Assistant\n",
    "\n",
    "Builds a retrieval-augmented chat assistant over the Markdown files under `knowledge_base/`:\n",
    "\n",
    "1. Load every `.md` file and tag it with its parent folder name (`doc_type`).\n",
    "2. Split the documents with `SemanticChunker` and embed the chunks into a Chroma store.\n",
    "3. Answer questions with a `ConversationalRetrievalChain`.\n",
    "4. Serve the chain through a Gradio chat UI.\n",
    "\n",
    "Requires `OPENAI_API_KEY` in the environment or in a `.env` file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7015d967",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from dotenv import load_dotenv\n",
    "from langchain.chat_models import ChatOpenAI\n",
    "from langchain.chains import ConversationalRetrievalChain\n",
    "from langchain.memory import ConversationBufferMemory\n",
    "from langchain.vectorstores import Chroma\n",
    "from langchain.embeddings import OpenAIEmbeddings\n",
    "from langchain.document_loaders import DirectoryLoader, TextLoader\n",
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "from langchain_experimental.text_splitter import SemanticChunker\n",
    "from langchain.schema import Document\n",
    "import gradio as gr\n",
    "import glob"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "87646db6",
   "metadata": {},
   "outputs": [],
   "source": [
    "load_dotenv(override=True)\n",
    "api_key = os.getenv('OPENAI_API_KEY')\n",
    "if not api_key:\n",
    "    # Fail here with a clear message instead of deep inside the first OpenAI call.\n",
    "    raise RuntimeError(\"OPENAI_API_KEY is not set; add it to the environment or a .env file.\")\n",
    "\n",
    "# Settings a reader may want to tune, gathered in one place.\n",
    "KNOWLEDGE_BASE_GLOB = \"knowledge_base/**/*.md\"\n",
    "CHROMA_DIR = \"chroma_db\"\n",
    "EMBEDDING_MODEL = \"text-embedding-3-small\"\n",
    "CHAT_MODEL = \"gpt-4o-mini\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b2e4d5f6",
   "metadata": {},
   "source": [
    "## Load the knowledge base"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1019e3b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# sorted() keeps document order (and so chunk order) stable; glob order is arbitrary.\n",
    "file_paths = sorted(glob.glob(KNOWLEDGE_BASE_GLOB, recursive=True))\n",
    "\n",
    "documents = []\n",
    "for path in file_paths:\n",
    "    with open(path, \"r\", encoding=\"utf-8\") as f:\n",
    "        text = f.read()\n",
    "    if not text.strip():\n",
    "        continue  # nothing to embed in a blank file\n",
    "    # The immediate parent folder name serves as the document category.\n",
    "    doc_type = os.path.basename(os.path.dirname(path))\n",
    "\n",
    "    documents.append(\n",
    "        Document(\n",
    "            page_content=text,\n",
    "            metadata={\n",
    "                \"doc_type\": doc_type,\n",
    "                \"source\": path,\n",
    "            },\n",
    "        )\n",
    "    )\n",
    "\n",
    "if not documents:\n",
    "    # Stop early: an empty corpus would otherwise fail later with an obscure embedding error.\n",
    "    raise FileNotFoundError(\n",
    "        f\"No non-empty Markdown files matched {KNOWLEDGE_BASE_GLOB!r} (cwd: {os.getcwd()}).\"\n",
    "    )\n",
    "\n",
    "print(f\"Loaded {len(documents)} documents.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c3f5e6a7",
   "metadata": {},
   "source": [
    "## Chunk, embed and store\n",
    "\n",
    "Both cells below call the OpenAI embeddings API, so they cost money and time on every run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "54527a21",
   "metadata": {},
   "outputs": [],
   "source": [
    "embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL, openai_api_key=api_key)\n",
    "\n",
    "# SemanticChunker uses the embedding model to decide where to split each document.\n",
    "text_splitter = SemanticChunker(embeddings)\n",
    "chunks = text_splitter.split_documents(documents)\n",
    "\n",
    "print(f\"Total number of chunks: {len(chunks)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15579dda",
   "metadata": {},
   "outputs": [],
   "source": [
    "# from_documents adds to whatever collection already lives in CHROMA_DIR, so drop the\n",
    "# previous collection first; otherwise every re-run stores the same chunks again.\n",
    "if os.path.exists(CHROMA_DIR):\n",
    "    Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings).delete_collection()\n",
    "\n",
    "vectorstore = Chroma.from_documents(\n",
    "    documents=chunks,\n",
    "    embedding=embeddings,\n",
    "    persist_directory=CHROMA_DIR\n",
    ")\n",
    "# NOTE(review): recent Chroma releases persist automatically and deprecate this call - confirm before removing.\n",
    "vectorstore.persist()\n",
    "print(\"Chroma vector store built.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d4a6f7b8",
   "metadata": {},
   "source": [
    "## Build the conversational chain\n",
    "\n",
    "The chain holds no memory of its own. Each call passes its own `chat_history`, so the smoke test and separate UI sessions cannot leak turns into each other."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ca3b4d55",
   "metadata": {},
   "outputs": [],
   "source": [
    "llm = ChatOpenAI(model=CHAT_MODEL, temperature=0, openai_api_key=api_key)\n",
    "retriever = vectorstore.as_retriever()\n",
    "conversation_chain = ConversationalRetrievalChain.from_llm(\n",
    "    llm=llm,\n",
    "    retriever=retriever,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "94b3a75a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Smoke test with an empty history; nothing from this call carries over into the UI.\n",
    "query = \"Tell me about Langchain.\"\n",
    "result = conversation_chain.invoke({\"question\": query, \"chat_history\": []})\n",
    "\n",
    "print(\"Answer:\")\n",
    "print(result[\"answer\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e5b7a8c9",
   "metadata": {},
   "source": [
    "## Chat UI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e814f910",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _message_text(content):\n",
    "    \"\"\"Return the plain text of a Gradio message `content` value, or \"\" if it has none.\"\"\"\n",
    "    if isinstance(content, str):\n",
    "        return content\n",
    "    if isinstance(content, (list, tuple)):\n",
    "        # NOTE(review): assumes multi-part content is a list of dicts with a \"text\" key - confirm against the installed Gradio.\n",
    "        return \"\".join(part.get(\"text\", \"\") for part in content if isinstance(part, dict))\n",
    "    return \"\"\n",
    "\n",
    "\n",
    "def _history_to_turns(history):\n",
    "    \"\"\"Convert Gradio chat history into the (human, ai) tuples the chain expects.\n",
    "\n",
    "    Accepts both shapes Gradio may pass: a list of [user, assistant] pairs, or a\n",
    "    flat list of {\"role\": ..., \"content\": ...} dicts. Entries that are not plain\n",
    "    text (for example file uploads) and unanswered user messages are skipped.\n",
    "    \"\"\"\n",
    "    turns = []\n",
    "    pending_user = None\n",
    "    for item in history or []:\n",
    "        if isinstance(item, dict):\n",
    "            role = item.get(\"role\")\n",
    "            text = _message_text(item.get(\"content\"))\n",
    "            if role == \"user\":\n",
    "                pending_user = text\n",
    "            elif role == \"assistant\" and pending_user is not None:\n",
    "                turns.append((pending_user, text))\n",
    "                pending_user = None\n",
    "        elif isinstance(item, (list, tuple)) and len(item) == 2:\n",
    "            user_text, bot_text = item\n",
    "            if isinstance(user_text, str) and isinstance(bot_text, str):\n",
    "                turns.append((user_text, bot_text))\n",
    "    return turns\n",
    "\n",
    "\n",
    "def rag_chat(query, history):\n",
    "    \"\"\"Answer `query` with the RAG chain, using this session's `history` as context.\n",
    "\n",
    "    Args:\n",
    "        query: The user's latest message.\n",
    "        history: The chat history supplied by `gr.ChatInterface` for this session.\n",
    "\n",
    "    Returns:\n",
    "        The chain's answer as a string.\n",
    "    \"\"\"\n",
    "    response = conversation_chain.invoke(\n",
    "        {\"question\": query, \"chat_history\": _history_to_turns(history)}\n",
    "    )\n",
    "    return response[\"answer\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f6c8b9da",
   "metadata": {},
   "outputs": [],
   "source": [
    "with gr.Blocks(theme=gr.themes.Soft()) as rag_ui:\n",
    "    gr.Markdown(\"# RAG Chat Assistant\")\n",
    "    gr.Markdown(\"Ask questions about your Markdown knowledge base.\")\n",
    "    chat_box = gr.ChatInterface(\n",
    "        fn=rag_chat,\n",
    "        title=\"RAG Knowledge Base Assistant\",\n",
    "        description=\"Chat with your Markdown-based knowledge base using RAG.\"\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eef8d2ee",
   "metadata": {},
   "outputs": [],
   "source": [
    "# share=True requests a public Gradio link: anyone with the URL can query the knowledge\n",
    "# base on this API key. debug=True keeps this cell running until the server is stopped.\n",
    "rag_ui.launch(debug=True, share=True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llm-engineering",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}