diff --git a/week5/community-contributions/RAG-based-academic-assistant-v3.ipynb b/week5/community-contributions/RAG-based-academic-assistant-v3.ipynb new file mode 100644 index 0000000..7899ff8 --- /dev/null +++ b/week5/community-contributions/RAG-based-academic-assistant-v3.ipynb @@ -0,0 +1,409 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "97a93fee-6bbd-477b-aba8-577d318a9f9d", + "metadata": {}, + "source": [ + "# AI-Powered Academic Knowledge Assistant\n", + "An AI-powered RAG (Retrieval-Augmented Generation) system that transforms document collections into queryable knowledge bases using OpenAI embeddings and vector search. It features configurable chunking, file-size limits, and retrieval parameters, with a Gradio interface for processing PDFs and generating contextually aware responses via LangChain and ChromaDB." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3589eee0-ce34-42f4-b538-b43f3b0d9f6f", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import glob\n", + "from dotenv import load_dotenv\n", + "import gradio as gr\n", + "import shutil\n", + "import tiktoken\n", + "from typing import List, Tuple\n", + "\n", + "# imports for langchain and Chroma\n", + "from langchain_community.document_loaders import PyPDFLoader\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain.schema import Document\n", + "from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n", + "from langchain_chroma import Chroma\n", + "from langchain.memory import ConversationBufferMemory\n", + "from langchain.chains import ConversationalRetrievalChain\n", + "\n", + "# Load environment variables\n", + "load_dotenv(override=True)\n", + "os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')\n", + "\n", + "# Global variables to store the current setup\n", + "current_vectorstore = None\n", + "current_conversation_chain = None\n", + "processing_status = \"\"\n", + "\n", + "def count_tokens(text: str, model: str = \"gpt-4o-mini\") -> int:\n", + " \"\"\"Count tokens in text using tiktoken\"\"\"\n", + " try:\n", + " encoding = tiktoken.encoding_for_model(model)\n", + " return len(encoding.encode(text))\n", + " except Exception:\n", + " # Fallback estimation: roughly 4 characters per token\n", + " return len(text) // 4\n", + "\n", + "def filter_chunks_by_tokens(chunks: List[Document], max_total_tokens: int = 250000) -> List[Document]:\n", + " \"\"\"Filter chunks to stay within token limits\"\"\"\n", + " filtered_chunks = []\n", + " total_tokens = 0\n", + " \n", + " for chunk in chunks:\n", + " chunk_tokens = count_tokens(chunk.page_content)\n", + " \n", + " # Skip individual chunks that are too large (shouldn't happen with proper splitting)\n", + " if chunk_tokens > 8000: # Individual chunk limit\n", + " continue\n", + " \n", + " if total_tokens + chunk_tokens <= max_total_tokens:\n", + " filtered_chunks.append(chunk)\n", + " total_tokens += chunk_tokens\n", + " else:\n", + " break\n", + " \n", + " return filtered_chunks\n", + "\n", + "def add_metadata(doc, doc_type, file_path):\n", + " \"\"\"Add metadata including document type and file information\"\"\"\n", + " doc.metadata[\"doc_type\"] = doc_type\n", + " doc.metadata[\"file_path\"] = 
file_path\n", + " doc.metadata[\"file_name\"] = os.path.basename(file_path)\n", + " return doc\n", + "\n", + "def check_file_size(file_path, max_size_bytes):\n", + " \"\"\"Check if file size is within the limit\"\"\"\n", + " try:\n", + " file_size = os.path.getsize(file_path)\n", + " return file_size <= max_size_bytes, file_size\n", + " except OSError:\n", + " return False, 0\n", + "\n", + "def load_pdfs_with_size_limit(folder_path, doc_type, max_size_bytes):\n", + " \"\"\"Load PDF files from a folder with size restrictions\"\"\"\n", + " pdf_files = glob.glob(os.path.join(folder_path, \"**/*.pdf\"), recursive=True)\n", + " loaded_docs = []\n", + " skipped_files = []\n", + " \n", + " for pdf_file in pdf_files:\n", + " is_valid_size, file_size = check_file_size(pdf_file, max_size_bytes)\n", + " \n", + " if is_valid_size:\n", + " try:\n", + " loader = PyPDFLoader(pdf_file)\n", + " docs = loader.load()\n", + " docs_with_metadata = [add_metadata(doc, doc_type, pdf_file) for doc in docs]\n", + " loaded_docs.extend(docs_with_metadata)\n", + " except Exception as e:\n", + " skipped_files.append((pdf_file, f\"Loading error: {str(e)}\"))\n", + " else:\n", + " file_size_mb = file_size / 1024 / 1024\n", + " skipped_files.append((pdf_file, f\"File too large: {file_size_mb:.2f} MB\"))\n", + " \n", + " return loaded_docs, skipped_files\n", + "\n", + "def process_documents(knowledge_base_dir: str, max_file_size_mb: float, chunk_size: int, chunk_overlap: int) -> Tuple[str, str]:\n", + " \"\"\"Process documents and create vector store\"\"\"\n", + " global current_vectorstore, current_conversation_chain\n", + " \n", + " try:\n", + " # Validate directory\n", + " if not knowledge_base_dir or not knowledge_base_dir.strip():\n", + " return \"āŒ Error: Please enter a directory path!\", \"\"\n", + " \n", + " directory_path = knowledge_base_dir.strip()\n", + " \n", + " if not os.path.exists(directory_path):\n", + " return \"āŒ Error: Directory does not exist! 
Please check the path.\", \"\"\n", + " \n", + " # Configuration\n", + " MAX_FILE_SIZE_BYTES = int(max_file_size_mb * 1024 * 1024)\n", + " \n", + " # Find folders\n", + " if directory_path.endswith('*'):\n", + " folders = glob.glob(directory_path)\n", + " else:\n", + " folders = glob.glob(os.path.join(directory_path, \"*\"))\n", + " \n", + " if not folders:\n", + " return \"āŒ Error: No folders found in the specified directory!\", \"\"\n", + " \n", + " # Process documents\n", + " documents = []\n", + " all_skipped_files = []\n", + " status_lines = []\n", + " \n", + " status_lines.append(f\"šŸ” Processing folders with {max_file_size_mb} MB file size limit...\")\n", + " status_lines.append(\"-\" * 60)\n", + " \n", + " for folder in folders:\n", + " if os.path.isdir(folder):\n", + " doc_type = os.path.basename(folder)\n", + " status_lines.append(f\"šŸ“ Processing folder: {doc_type}\")\n", + " \n", + " folder_docs, skipped_files = load_pdfs_with_size_limit(folder, doc_type, MAX_FILE_SIZE_BYTES)\n", + " documents.extend(folder_docs)\n", + " all_skipped_files.extend(skipped_files)\n", + " \n", + " if folder_docs:\n", + " status_lines.append(f\" āœ… Loaded {len(folder_docs)} document pages\")\n", + " if skipped_files:\n", + " status_lines.append(f\" āš ļø Skipped {len(skipped_files)} files\")\n", + " \n", + " if not documents:\n", + " error_msg = \"āŒ No PDF documents were loaded successfully.\"\n", + " if all_skipped_files:\n", + " error_msg += f\"\\n\\nAll {len(all_skipped_files)} files were skipped:\"\n", + " for file_path, reason in all_skipped_files[:10]: # Show first 10\n", + " error_msg += f\"\\n • {os.path.basename(file_path)}: {reason}\"\n", + " if len(all_skipped_files) > 10:\n", + " error_msg += f\"\\n ... and {len(all_skipped_files) - 10} more\"\n", + " return error_msg, \"\"\n", + " \n", + " # Text splitting\n", + " status_lines.append(\"\\n\" + \"=\"*40)\n", + " status_lines.append(\"āœ‚ļø TEXT SPLITTING\")\n", + " status_lines.append(\"=\"*40)\n", + " \n", + " text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n", + " chunks = text_splitter.split_documents(documents)\n", + " \n", + " # Filter chunks by token count to prevent API errors\n", + " status_lines.append(\"šŸ”¢ Checking token limits...\")\n", + " original_chunk_count = len(chunks)\n", + " chunks = filter_chunks_by_tokens(chunks, max_total_tokens=250000)\n", + " \n", + " if len(chunks) < original_chunk_count:\n", + " status_lines.append(f\"āš ļø Filtered from {original_chunk_count} to {len(chunks)} chunks to stay within token limits\")\n", + " \n", + " # Create vectorstore\n", + " status_lines.append(\"🧮 Creating vector embeddings...\")\n", + " embeddings = OpenAIEmbeddings()\n", + " \n", + " # Use a temporary database name\n", + " db_name = \"temp_vector_db\"\n", + " \n", + " # Delete if already exists\n", + " if os.path.exists(db_name):\n", + " shutil.rmtree(db_name)\n", + " \n", + " # Create vectorstore\n", + " vectorstore = Chroma.from_documents(\n", + " documents=chunks, \n", + " embedding=embeddings, \n", + " persist_directory=db_name\n", + " )\n", + " \n", + " # Update global variables\n", + " current_vectorstore = vectorstore\n", + " \n", + " # Create conversation chain\n", + " llm = ChatOpenAI(temperature=0.7, model_name=\"gpt-4o-mini\")\n", + " memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)\n", + " retriever = vectorstore.as_retriever(search_kwargs={\"k\": 25})\n", + " current_conversation_chain = 
ConversationalRetrievalChain.from_llm(\n", + " llm=llm, \n", + " retriever=retriever, \n", + " memory=memory\n", + " )\n", + " \n", + " # Summary statistics\n", + " status_lines.append(\"\\n\" + \"=\"*40)\n", + " status_lines.append(\"šŸ“Š SUMMARY\")\n", + " status_lines.append(\"=\"*40)\n", + " status_lines.append(f\"āœ… Total PDFs processed: {len(set(doc.metadata['file_path'] for doc in documents))}\")\n", + " status_lines.append(f\"šŸ“„ Total document pages: {len(documents)}\")\n", + " status_lines.append(f\"🧩 Total text chunks: {len(chunks)}\")\n", + " status_lines.append(f\"šŸ“ Document types: {', '.join(set(doc.metadata['doc_type'] for doc in documents))}\")\n", + " status_lines.append(f\"šŸ—ƒļø Vector store size: {vectorstore._collection.count()} embeddings\")\n", + " \n", + " if all_skipped_files:\n", + " status_lines.append(f\"\\nāš ļø Skipped files: {len(all_skipped_files)}\")\n", + " for file_path, reason in all_skipped_files[:5]: # Show first 5\n", + " status_lines.append(f\" • {os.path.basename(file_path)}: {reason}\")\n", + " if len(all_skipped_files) > 5:\n", + " status_lines.append(f\" ... and {len(all_skipped_files) - 5} more\")\n", + " \n", + " success_msg = \"āœ… Knowledge base successfully created and ready for questions!\"\n", + " detailed_status = \"\\n\".join(status_lines)\n", + " \n", + " return success_msg, detailed_status\n", + " \n", + " except Exception as e:\n", + " error_msg = f\"āŒ Error processing documents: {str(e)}\"\n", + " return error_msg, \"\"\n", + "\n", + "def chat_with_documents(message, history, num_chunks):\n", + " \"\"\"Chat with the processed documents\"\"\"\n", + " global current_conversation_chain, current_vectorstore\n", + " \n", + " if current_conversation_chain is None:\n", + " return \"āŒ Please process documents first before asking questions!\"\n", + " \n", + " try:\n", + " # Update retriever with new chunk count\n", + " if current_vectorstore is not None:\n", + " retriever = current_vectorstore.as_retriever(search_kwargs={\"k\": num_chunks})\n", + " current_conversation_chain.retriever = retriever\n", + " \n", + " result = current_conversation_chain.invoke({\"question\": message})\n", + " return result[\"answer\"]\n", + " \n", + " except Exception as e:\n", + " return f\"āŒ Error generating response: {str(e)}\"\n", + "\n", + "def reset_conversation():\n", + " \"\"\"Reset the conversation memory\"\"\"\n", + " global current_conversation_chain\n", + " if current_conversation_chain is not None:\n", + " current_conversation_chain.memory.clear()\n", + " return \"āœ… Conversation history cleared!\"\n", + " return \"No active conversation to reset.\"\n", + "\n", + "# Create Gradio Interface\n", + "with gr.Blocks(title=\"AI-Powered Academic Knowledge Assistant\", theme=gr.themes.Soft()) as app:\n", + " gr.Markdown(\"# šŸŽ“ AI-Powered Academic Knowledge Assistant\")\n", + " gr.Markdown(\"Transform your entire document library into an intelligent, searchable AI tutor that answers questions instantly.\")\n", + " \n", + " with gr.Tabs():\n", + " # Configuration Tab\n", + " with gr.Tab(\"āš™ļø Configuration\"):\n", + " gr.Markdown(\"### šŸ“ Document Processing Settings\")\n", + " \n", + " gr.Markdown(\"šŸ’” **Tip:** Copy and paste your folder path here. 
On mobile, you can use file manager apps to copy folder paths.\")\n", + " \n", + " with gr.Row():\n", + " with gr.Column():\n", + " knowledge_dir = gr.Textbox(\n", + " label=\"Knowledge Base Directory\",\n", + " value=r\"C:\\Users\\Documents\\Syllabi\\Georgia Tech\\Spring 22\\Microwave Design\",\n", + " placeholder=\"Enter or paste your document directory path\",\n", + " lines=1\n", + " )\n", + " \n", + " max_file_size = gr.Slider(\n", + " label=\"Max File Size (MB)\",\n", + " minimum=0.5,\n", + " maximum=50,\n", + " value=4,\n", + " step=0.5\n", + " )\n", + " \n", + " with gr.Column():\n", + " chunk_size = gr.Slider(\n", + " label=\"Chunk Size (characters)\",\n", + " minimum=200,\n", + " maximum=1500,\n", + " value=800,\n", + " step=100,\n", + " info=\"Smaller chunks = better token management\"\n", + " )\n", + " \n", + " chunk_overlap = gr.Slider(\n", + " label=\"Chunk Overlap (characters)\",\n", + " minimum=0,\n", + " maximum=300,\n", + " value=150,\n", + " step=25,\n", + " info=\"Overlap preserves context between chunks\"\n", + " )\n", + " \n", + " process_btn = gr.Button(\"šŸš€ Process Documents\", variant=\"primary\", size=\"lg\")\n", + " \n", + " with gr.Row():\n", + " status_output = gr.Textbox(\n", + " label=\"Status\",\n", + " lines=2,\n", + " max_lines=2\n", + " )\n", + " \n", + " detailed_output = gr.Textbox(\n", + " label=\"Detailed Processing Log\",\n", + " lines=15,\n", + " max_lines=20\n", + " )\n", + " \n", + " # Chat Tab\n", + " with gr.Tab(\"šŸ’¬ Chat\"):\n", + " gr.Markdown(\"### šŸ¤– Ask Questions About Your Documents\")\n", + " \n", + " with gr.Row():\n", + " with gr.Column(scale=1):\n", + " num_chunks = gr.Slider(\n", + " label=\"Number of chunks to retrieve\",\n", + " minimum=1,\n", + " maximum=50,\n", + " value=25,\n", + " step=1\n", + " )\n", + " \n", + " reset_btn = gr.Button(\"šŸ—‘ļø Clear Chat History\", variant=\"secondary\")\n", + " reset_output = gr.Textbox(label=\"Reset Status\", lines=1)\n", + " \n", + " with gr.Column(scale=3):\n", + " # Pass the slider via additional_inputs so its current value is forwarded to chat_with_documents\n", + " chatbot = gr.ChatInterface(\n", + " fn=chat_with_documents,\n", + " additional_inputs=[num_chunks],\n", + " type=\"messages\",\n", + " title=\"Academic Assistant Chat\",\n", + " description=\"Ask questions about your processed documents\"\n", + " )\n", + " \n", + " # Event handlers\n", + " process_btn.click(\n", + " fn=process_documents,\n", + " inputs=[knowledge_dir, max_file_size, chunk_size, chunk_overlap],\n", + " outputs=[status_output, detailed_output]\n", + " )\n", + " \n", + " reset_btn.click(\n", + " fn=reset_conversation,\n", + " outputs=reset_output\n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9eb807e0-194b-48dd-a1e9-b1b9b8a99620", + "metadata": {}, + "outputs": [], + "source": [ + "app.launch(share=True, inbrowser=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}