# --- RAG pipeline setup (Week 5 Day 5, rag.ipynb) ---
# Builds a Chroma vector store over a Markdown knowledge base and wires up
# a conversational retrieval chain backed by OpenAI models.
#
# NOTE(review): the `langchain.chat_models` / `langchain.vectorstores` /
# `langchain.embeddings` import paths are deprecated in LangChain >= 0.1;
# the maintained homes are `langchain_openai` / `langchain_community`.
# Left as-is here to avoid introducing new package dependencies.

import glob
import os

from dotenv import load_dotenv
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_experimental.text_splitter import SemanticChunker
import gradio as gr

# Load OPENAI_API_KEY from .env, overriding any value already in the process env.
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')
if not api_key:
    # Fail fast with a clear message instead of an opaque auth error later.
    raise RuntimeError("OPENAI_API_KEY is not set; add it to your .env file.")

# --- Load every Markdown file under knowledge_base/ into Document objects.
# The parent directory name is recorded as `doc_type` metadata so answers can
# later be traced back to their source category.
file_paths = glob.glob("knowledge_base/**/*.md", recursive=True)

documents = []
for path in file_paths:
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()
    doc_type = os.path.basename(os.path.dirname(path))
    documents.append(
        Document(page_content=text, metadata={"doc_type": doc_type})
    )

# Guard against an empty knowledge base: SemanticChunker/Chroma would
# otherwise silently build a useless empty index.
if not documents:
    raise FileNotFoundError(
        "No .md files found under knowledge_base/ — check the working directory."
    )

# --- Chunk the documents. SemanticChunker splits on embedding-similarity
# boundaries (the split itself calls the embeddings API, so it costs tokens);
# the commented RecursiveCharacterTextSplitter is the cheaper, deterministic
# alternative.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=api_key)

# text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
text_splitter = SemanticChunker(embeddings)
chunks = text_splitter.split_documents(documents)

print(f"Total number of chunks: {len(chunks)}")

# --- Build (and persist) the Chroma vector store.
# NOTE(review): re-running this cell against an existing chroma_db directory
# APPENDS the chunks again, duplicating every document in retrieval results.
# Delete chroma_db/ (or use a named collection and delete_collection()) before
# rebuilding.
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory="chroma_db",
)
vectorstore.persist()  # no-op on Chroma >= 0.4 (auto-persists); harmless here
print("Chroma vector store built.")

# --- Conversational RAG chain: retriever over the store plus buffer memory,
# so follow-up questions are condensed together with the prior chat history.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, openai_api_key=api_key)
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
retriever = vectorstore.as_retriever()
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

# Smoke-test the chain with a single question before wiring up the UI.
query = "Tell me about Langchain."
result = conversation_chain({"question": query})

print("Answer:")
print(result["answer"])
def rag_chat(query, history):
    """Gradio chat callback: answer *query* against the knowledge base.

    *history* (Gradio's per-session transcript) is intentionally unused:
    conversation state lives in the chain's ConversationBufferMemory.
    NOTE(review): that memory is a single module-level object, so with
    share=True every visitor shares one conversation history — confirm
    this is acceptable before sharing the link.
    """
    # Guard the chain call so an API/retrieval error is shown in the chat
    # window instead of crashing the Gradio worker.
    try:
        response = conversation_chain({"question": query})
        return response["answer"]
    except Exception as exc:  # surface the failure, don't swallow it
        return f"Sorry, something went wrong: {exc}"


with gr.Blocks(theme=gr.themes.Soft()) as rag_ui:
    gr.Markdown("# RAG Chat Assistant")
    gr.Markdown("Ask questions about your Markdown knowledge base.")
    chat_box = gr.ChatInterface(
        fn=rag_chat,
        title="RAG Knowledge Base Assistant",
        description="Chat with your Markdown-based knowledge base using RAG.",
    )

# debug=True streams errors to the console; share=True opens a PUBLIC
# gradio.live URL — anyone with the link can query (and spend) your OpenAI
# key. Drop share=True unless remote access is actually intended.
rag_ui.launch(debug=True, share=True)