Merge pull request #883 from smqd19/week-5-submission

Gen AI Bootcamp - Week 5 Rag
This commit is contained in:
Ed Donner
2025-10-28 20:01:14 -04:00
committed by GitHub

View File

@@ -0,0 +1,178 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "7015d967",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from dotenv import load_dotenv\n",
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.chains import ConversationalRetrievalChain\n",
"from langchain.memory import ConversationBufferMemory\n",
"from langchain.vectorstores import Chroma\n",
"from langchain.embeddings import OpenAIEmbeddings\n",
"from langchain.document_loaders import DirectoryLoader, TextLoader\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"from langchain_experimental.text_splitter import SemanticChunker\n",
"from langchain.schema import Document\n",
"import gradio as gr\n",
"import glob"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "87646db6",
"metadata": {},
"outputs": [],
"source": [
"load_dotenv(override=True)\n",
"api_key = os.getenv('OPENAI_API_KEY')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1019e3b8",
"metadata": {},
"outputs": [],
"source": [
"file_paths = glob.glob(\"knowledge_base/**/*.md\", recursive=True)\n",
"\n",
"documents = []\n",
"for path in file_paths:\n",
" with open(path, \"r\", encoding=\"utf-8\") as f:\n",
" text = f.read()\n",
" doc_type = os.path.basename(os.path.dirname(path)) \n",
"\n",
" documents.append(\n",
" Document(\n",
" page_content=text,\n",
" metadata={\n",
" \"doc_type\": doc_type,\n",
" },\n",
" )\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "54527a21",
"metadata": {},
"outputs": [],
"source": [
"embeddings = OpenAIEmbeddings(model=\"text-embedding-3-small\", openai_api_key=api_key)\n",
"\n",
"# text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)\n",
"text_splitter = SemanticChunker(embeddings)\n",
"chunks = text_splitter.split_documents(documents)\n",
"\n",
"print(f\"Total number of chunks: {len(chunks)}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "15579dda",
"metadata": {},
"outputs": [],
"source": [
"vectorstore = Chroma.from_documents(\n",
" documents=chunks,\n",
" embedding=embeddings,\n",
" persist_directory=\"chroma_db\"\n",
")\n",
"vectorstore.persist()\n",
"print(\"Chroma vector store built.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ca3b4d55",
"metadata": {},
"outputs": [],
"source": [
"llm = ChatOpenAI(model=\"gpt-4o-mini\", temperature=0, openai_api_key=api_key)\n",
"memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)\n",
"retriever = vectorstore.as_retriever()\n",
"conversation_chain = ConversationalRetrievalChain.from_llm(\n",
" llm=llm,\n",
" retriever=retriever,\n",
" memory=memory,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "94b3a75a",
"metadata": {},
"outputs": [],
"source": [
"query = \"Tell me about Langchain.\"\n",
"result = conversation_chain({\"question\": query})\n",
"\n",
"print(\"Answer:\")\n",
"print(result[\"answer\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e814f910",
"metadata": {},
"outputs": [],
"source": [
"def rag_chat(query, history):\n",
" response = conversation_chain({\"question\": query})\n",
" answer = response[\"answer\"]\n",
" return answer\n",
"\n",
"with gr.Blocks(theme=gr.themes.Soft()) as rag_ui:\n",
" gr.Markdown(\"# RAG Chat Assistant\")\n",
" gr.Markdown(\"Ask questions about your Markdown knowledge base.\")\n",
" chat_box = gr.ChatInterface(\n",
" fn=rag_chat,\n",
" title=\"RAG Knowledge Base Assistant\",\n",
" description=\"Chat with your Markdown-based knowledge base using RAG.\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eef8d2ee",
"metadata": {},
"outputs": [],
"source": [
"rag_ui.launch(debug=True, share=True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "llm-engineering",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}