{ "cells": [ { "cell_type": "markdown", "id": "97a93fee-6bbd-477b-aba8-577d318a9f9d", "metadata": {}, "source": [ "# AI-Powered Academic Knowledge Assistant\n", "AI-powered RAG (Retrieval-Augmented Generation) system that transforms document collections into queryable knowledge bases using OpenAI embeddings and vector search. Features configurable chunking, file size limits, and retrieval parameters with a Gradio interface for processing PDFs and generating contextually-aware responses via LangChain and ChromaDB." ] }, { "cell_type": "code", "execution_count": null, "id": "3589eee0-ce34-42f4-b538-b43f3b0d9f6f", "metadata": {}, "outputs": [], "source": [ "import os\n", "import glob\n", "from dotenv import load_dotenv\n", "import gradio as gr\n", "import shutil\n", "import tiktoken\n", "import time\n", "import uuid\n", "from typing import List, Tuple, Optional\n", "\n", "# imports for langchain and Chroma\n", "from langchain.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader\n", "from langchain.text_splitter import CharacterTextSplitter\n", "from langchain.schema import Document\n", "from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n", "from langchain_chroma import Chroma\n", "from langchain.memory import ConversationBufferMemory\n", "from langchain.chains import ConversationalRetrievalChain\n", "from langchain.embeddings import HuggingFaceEmbeddings\n", "\n", "from langchain_community.document_loaders import PyPDFLoader, TextLoader\n", "from langchain.docstore.document import Document\n", "\n", "# Load environment variables\n", "load_dotenv(override=True)\n", "os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')\n", "\n", "# Global variables to store the current setup\n", "current_vectorstore = None\n", "current_conversation_chain = None\n", "processing_status = \"\"\n", "\n", "def count_tokens(text: str, model: str = \"gpt-4o-mini\") -> int:\n", " \"\"\"Count tokens in text using tiktoken\"\"\"\n", " try:\n", " encoding = tiktoken.encoding_for_model(model)\n", " return len(encoding.encode(text))\n", " except:\n", " # Fallback estimation: roughly 4 characters per token\n", " return len(text) // 4\n", "\n", "def filter_chunks_by_tokens(chunks: List[Document], max_total_tokens: int = 250000) -> List[Document]:\n", " \"\"\"Filter chunks to stay within token limits\"\"\"\n", " filtered_chunks = []\n", " total_tokens = 0\n", " \n", " for chunk in chunks:\n", " chunk_tokens = count_tokens(chunk.page_content)\n", " \n", " # Skip individual chunks that are too large (shouldn't happen with proper splitting)\n", " if chunk_tokens > 8000: # Individual chunk limit\n", " continue\n", " \n", " if total_tokens + chunk_tokens <= max_total_tokens:\n", " filtered_chunks.append(chunk)\n", " total_tokens += chunk_tokens\n", " else:\n", " break\n", " \n", " return filtered_chunks\n", "\n", "def add_metadata(doc, doc_type, file_path):\n", " \"\"\"Add metadata including document type and file information\"\"\"\n", " doc.metadata[\"doc_type\"] = doc_type\n", " doc.metadata[\"file_path\"] = file_path\n", " doc.metadata[\"file_name\"] = os.path.basename(file_path)\n", " return doc\n", "\n", "def check_file_size(file_path, max_size_bytes):\n", " \"\"\"Check if file size is within the limit\"\"\"\n", " try:\n", " file_size = os.path.getsize(file_path)\n", " return file_size <= max_size_bytes, file_size\n", " except OSError:\n", " return False, 0\n", "\n", "def load_pdfs_with_size_limit(folder_path, doc_type, max_size_bytes):\n", " \"\"\"Load PDF files from a 
  "def add_metadata(doc, doc_type, file_path):\n",
  "    \"\"\"Add metadata including document type and file information\"\"\"\n",
  "    doc.metadata[\"doc_type\"] = doc_type\n",
  "    doc.metadata[\"file_path\"] = file_path\n",
  "    doc.metadata[\"file_name\"] = os.path.basename(file_path)\n",
  "    return doc\n",
  "\n",
  "def check_file_size(file_path, max_size_bytes):\n",
  "    \"\"\"Check if file size is within the limit\"\"\"\n",
  "    try:\n",
  "        file_size = os.path.getsize(file_path)\n",
  "        return file_size <= max_size_bytes, file_size\n",
  "    except OSError:\n",
  "        return False, 0\n",
  "\n",
  "def load_pdfs_with_size_limit(folder_path, doc_type, max_size_bytes):\n",
  "    \"\"\"Load PDF files from a folder with size restrictions\"\"\"\n",
  "    pdf_files = glob.glob(os.path.join(folder_path, \"**/*.pdf\"), recursive=True)\n",
  "    loaded_docs = []\n",
  "    skipped_files = []\n",
  "\n",
  "    for pdf_file in pdf_files:\n",
  "        is_valid_size, file_size = check_file_size(pdf_file, max_size_bytes)\n",
  "\n",
  "        if is_valid_size:\n",
  "            try:\n",
  "                loader = PyPDFLoader(pdf_file)\n",
  "                docs = loader.load()\n",
  "                docs_with_metadata = [add_metadata(doc, doc_type, pdf_file) for doc in docs]\n",
  "                loaded_docs.extend(docs_with_metadata)\n",
  "            except Exception as e:\n",
  "                skipped_files.append((pdf_file, f\"Loading error: {str(e)}\"))\n",
  "        else:\n",
  "            file_size_mb = file_size / (1024 * 1024)\n",
  "            skipped_files.append((pdf_file, f\"File too large: {file_size_mb:.2f} MB\"))\n",
  "\n",
  "    return loaded_docs, skipped_files\n",
  "\n",
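  "# Note: PyPDFLoader yields one Document per PDF page, so the counts reported in the\n",
  "# processing log are page counts, not file counts. A standalone check might look like\n",
  "# this (the folder path is a hypothetical example):\n",
  "# pages, skipped = load_pdfs_with_size_limit(\"sample_folder\", \"demo\", 4 * 1024 * 1024)\n",
  "# print(f\"{len(pages)} pages loaded, {len(skipped)} files skipped\")\n",
  "\n",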
  "def process_documents(knowledge_base_dir: str, max_file_size_mb: float, chunk_size: int, chunk_overlap: int) -> Tuple[str, str]:\n",
  "    \"\"\"Process documents and create vector store\"\"\"\n",
  "    global current_vectorstore, current_conversation_chain\n",
  "\n",
  "    try:\n",
  "        # Validate directory\n",
  "        if not knowledge_base_dir or not knowledge_base_dir.strip():\n",
  "            return \"āŒ Error: Please enter a directory path!\", \"\"\n",
  "\n",
  "        directory_path = knowledge_base_dir.strip()\n",
  "\n",
  "        if not os.path.exists(directory_path):\n",
  "            return \"āŒ Error: Directory does not exist! Please check the path.\", \"\"\n",
  "\n",
  "        # Configuration\n",
  "        MAX_FILE_SIZE_BYTES = int(max_file_size_mb * 1024 * 1024)\n",
  "\n",
  "        # Find folders\n",
  "        if directory_path.endswith('*'):\n",
  "            folders = glob.glob(directory_path)\n",
  "        else:\n",
  "            folders = glob.glob(os.path.join(directory_path, \"*\"))\n",
  "\n",
  "        if not folders:\n",
  "            return \"āŒ Error: No folders found in the specified directory!\", \"\"\n",
  "\n",
  "        # Process documents\n",
  "        documents = []\n",
  "        all_skipped_files = []\n",
  "        status_lines = []\n",
  "\n",
  "        status_lines.append(f\"šŸ” Processing folders with {max_file_size_mb} MB file size limit...\")\n",
  "        status_lines.append(\"-\" * 60)\n",
  "\n",
  "        for folder in folders:\n",
  "            if os.path.isdir(folder):\n",
  "                doc_type = os.path.basename(folder)\n",
  "                status_lines.append(f\"šŸ“ Processing folder: {doc_type}\")\n",
  "\n",
  "                folder_docs, skipped_files = load_pdfs_with_size_limit(folder, doc_type, MAX_FILE_SIZE_BYTES)\n",
  "                documents.extend(folder_docs)\n",
  "                all_skipped_files.extend(skipped_files)\n",
  "\n",
  "                if folder_docs:\n",
  "                    status_lines.append(f\"   āœ… Loaded {len(folder_docs)} document pages\")\n",
  "                if skipped_files:\n",
  "                    status_lines.append(f\"   āš ļø Skipped {len(skipped_files)} files\")\n",
  "\n",
  "        if not documents:\n",
  "            error_msg = \"āŒ No PDF documents were loaded successfully.\"\n",
  "            if all_skipped_files:\n",
  "                error_msg += f\"\\n\\nAll {len(all_skipped_files)} files were skipped:\"\n",
  "                for file_path, reason in all_skipped_files[:10]:  # Show first 10\n",
  "                    error_msg += f\"\\n   • {os.path.basename(file_path)}: {reason}\"\n",
  "                if len(all_skipped_files) > 10:\n",
  "                    error_msg += f\"\\n   ... and {len(all_skipped_files) - 10} more\"\n",
  "            return error_msg, \"\"\n",
  "\n",
  "        # Text splitting\n",
  "        status_lines.append(\"\\n\" + \"=\"*40)\n",
  "        status_lines.append(\"āœ‚ļø TEXT SPLITTING\")\n",
  "        status_lines.append(\"=\"*40)\n",
  "\n",
  "        text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n",
  "        chunks = text_splitter.split_documents(documents)\n",
  "\n",
  "        # Filter chunks by token count to prevent API errors\n",
  "        status_lines.append(\"šŸ”¢ Checking token limits...\")\n",
  "        original_chunk_count = len(chunks)\n",
  "        chunks = filter_chunks_by_tokens(chunks, max_total_tokens=250000)\n",
  "\n",
  "        if len(chunks) < original_chunk_count:\n",
  "            status_lines.append(f\"āš ļø Filtered from {original_chunk_count} to {len(chunks)} chunks to stay within token limits\")\n",
  "\n",
  "        # Create vectorstore\n",
  "        status_lines.append(\"🧮 Creating vector embeddings...\")\n",
  "        embeddings = OpenAIEmbeddings()\n",
  "\n",
  "        # Use a temporary database name\n",
  "        db_name = \"temp_vector_db\"\n",
  "\n",
  "        # Delete if already exists\n",
  "        if os.path.exists(db_name):\n",
  "            shutil.rmtree(db_name)\n",
  "\n",
  "        # Create vectorstore\n",
  "        vectorstore = Chroma.from_documents(\n",
  "            documents=chunks,\n",
  "            embedding=embeddings,\n",
  "            persist_directory=db_name\n",
  "        )\n",
  "\n",
  "        # Update global variables\n",
  "        current_vectorstore = vectorstore\n",
  "\n",
  "        # Create conversation chain\n",
  "        llm = ChatOpenAI(temperature=0.7, model=\"gpt-4o-mini\")\n",
  "        memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)\n",
  "        retriever = vectorstore.as_retriever(search_kwargs={\"k\": 25})\n",
  "        current_conversation_chain = ConversationalRetrievalChain.from_llm(\n",
  "            llm=llm,\n",
  "            retriever=retriever,\n",
  "            memory=memory\n",
  "        )\n",
  "\n",
  "        # Summary statistics\n",
  "        status_lines.append(\"\\n\" + \"=\"*40)\n",
  "        status_lines.append(\"šŸ“Š SUMMARY\")\n",
  "        status_lines.append(\"=\"*40)\n",
  "        status_lines.append(f\"āœ… Total PDFs processed: {len(set(doc.metadata['file_path'] for doc in documents))}\")\n",
  "        status_lines.append(f\"šŸ“„ Total document pages: {len(documents)}\")\n",
  "        status_lines.append(f\"🧩 Total text chunks: {len(chunks)}\")\n",
  "        status_lines.append(f\"šŸ“ Document types: {', '.join(set(doc.metadata['doc_type'] for doc in documents))}\")\n",
  "        # _collection is Chroma's private handle; used here only for a quick count\n",
  "        status_lines.append(f\"šŸ—ƒļø Vector store size: {vectorstore._collection.count()} embeddings\")\n",
  "\n",
  "        if all_skipped_files:\n",
  "            status_lines.append(f\"\\nāš ļø Skipped files: {len(all_skipped_files)}\")\n",
  "            for file_path, reason in all_skipped_files[:5]:  # Show first 5\n",
  "                status_lines.append(f\"   • {os.path.basename(file_path)}: {reason}\")\n",
  "            if len(all_skipped_files) > 5:\n",
  "                status_lines.append(f\"   ... and {len(all_skipped_files) - 5} more\")\n",
  "\n",
  "        success_msg = \"āœ… Knowledge base successfully created and ready for questions!\"\n",
  "        detailed_status = \"\\n\".join(status_lines)\n",
  "\n",
  "        return success_msg, detailed_status\n",
  "\n",
  "    except Exception as e:\n",
  "        error_msg = f\"āŒ Error processing documents: {str(e)}\"\n",
  "        return error_msg, \"\"\n",
  "\n",
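  "# Optional variant (not wired in): CharacterTextSplitter splits on a single separator,\n",
  "# so long unbroken passages can yield oversized chunks that the token filter above then\n",
  "# discards. RecursiveCharacterTextSplitter degrades more gracefully; a drop-in sketch\n",
  "# using the same size parameters:\n",
  "# from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
  "# text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n",
  "\n",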
  "def chat_with_documents(message, history, num_chunks):\n",
  "    \"\"\"Chat with the processed documents\"\"\"\n",
  "    global current_conversation_chain, current_vectorstore\n",
  "\n",
  "    if current_conversation_chain is None:\n",
  "        return \"āŒ Please process documents first before asking questions!\"\n",
  "\n",
  "    try:\n",
  "        # Update retriever with new chunk count\n",
  "        if current_vectorstore is not None:\n",
  "            retriever = current_vectorstore.as_retriever(search_kwargs={\"k\": int(num_chunks)})\n",
  "            current_conversation_chain.retriever = retriever\n",
  "\n",
  "        result = current_conversation_chain.invoke({\"question\": message})\n",
  "        return result[\"answer\"]\n",
  "\n",
  "    except Exception as e:\n",
  "        return f\"āŒ Error generating response: {str(e)}\"\n",
  "\n",
  "def reset_conversation():\n",
  "    \"\"\"Reset the conversation memory\"\"\"\n",
  "    global current_conversation_chain\n",
  "    if current_conversation_chain is not None:\n",
  "        current_conversation_chain.memory.clear()\n",
  "        return \"āœ… Conversation history cleared!\"\n",
  "    return \"No active conversation to reset.\"\n",
  "\n",
  "# Create Gradio Interface\n",
  "with gr.Blocks(title=\"AI-Powered Academic Knowledge Assistant\", theme=gr.themes.Soft()) as app:\n",
  "    gr.Markdown(\"# šŸŽ“ AI-Powered Academic Knowledge Assistant\")\n",
  "    gr.Markdown(\"Transform your entire document library into an intelligent, searchable AI tutor that answers questions instantly.\")\n",
  "\n",
  "    with gr.Tabs():\n",
  "        # Configuration Tab\n",
  "        with gr.Tab(\"āš™ļø Configuration\"):\n",
  "            gr.Markdown(\"### šŸ“ Document Processing Settings\")\n",
  "\n",
  "            gr.Markdown(\"šŸ’” **Tip:** Copy and paste your folder path here. On mobile, you can use file manager apps to copy folder paths.\")\n",
  "\n",
  "            with gr.Row():\n",
  "                with gr.Column():\n",
  "                    knowledge_dir = gr.Textbox(\n",
  "                        label=\"Knowledge Base Directory\",\n",
  "                        value=r\"C:\\\Users\\\Documents\\\Syllabi\\\Georgia Tech\\\Spring 22\\\Microwave Design\",\n",
  "                        placeholder=\"Enter or paste your document directory path\",\n",
  "                        lines=1\n",
  "                    )\n",
  "\n",
  "                    max_file_size = gr.Slider(\n",
  "                        label=\"Max File Size (MB)\",\n",
  "                        minimum=0.5,\n",
  "                        maximum=50,\n",
  "                        value=4,\n",
  "                        step=0.5\n",
  "                    )\n",
  "\n",
  "                with gr.Column():\n",
  "                    chunk_size = gr.Slider(\n",
  "                        label=\"Chunk Size (characters)\",\n",
  "                        minimum=200,\n",
  "                        maximum=1500,\n",
  "                        value=800,\n",
  "                        step=100,\n",
  "                        info=\"Smaller chunks = better token management\"\n",
  "                    )\n",
  "\n",
  "                    chunk_overlap = gr.Slider(\n",
  "                        label=\"Chunk Overlap (characters)\",\n",
  "                        minimum=0,\n",
  "                        maximum=300,\n",
  "                        value=150,\n",
  "                        step=25,\n",
  "                        info=\"Overlap preserves context between chunks\"\n",
  "                    )\n",
  "\n",
  "            process_btn = gr.Button(\"šŸš€ Process Documents\", variant=\"primary\", size=\"lg\")\n",
  "\n",
  "            with gr.Row():\n",
  "                status_output = gr.Textbox(\n",
  "                    label=\"Status\",\n",
  "                    lines=2,\n",
  "                    max_lines=2\n",
  "                )\n",
  "\n",
  "                detailed_output = gr.Textbox(\n",
  "                    label=\"Detailed Processing Log\",\n",
  "                    lines=15,\n",
  "                    max_lines=20\n",
  "                )\n",
  "\n",
  "        # Chat Tab\n",
  "        with gr.Tab(\"šŸ’¬ Chat\"):\n",
  "            gr.Markdown(\"### šŸ¤– Ask Questions About Your Documents\")\n",
  "\n",
  "            with gr.Row():\n",
  "                with gr.Column(scale=1):\n",
  "                    num_chunks = gr.Slider(\n",
  "                        label=\"Number of chunks to retrieve\",\n",
  "                        minimum=1,\n",
  "                        maximum=50,\n",
  "                        value=25,\n",
  "                        step=1\n",
  "                    )\n",
  "\n",
  "                    reset_btn = gr.Button(\"šŸ—‘ļø Clear Chat History\", variant=\"secondary\")\n",
  "                    reset_output = gr.Textbox(label=\"Reset Status\", lines=1)\n",
  "\n",
  "                with gr.Column(scale=3):\n",
  "                    # Pass the slider as an additional input so its live value reaches the\n",
  "                    # handler; reading num_chunks.value in a lambda would capture only the\n",
  "                    # initial default, never the user's current setting\n",
  "                    chatbot = gr.ChatInterface(\n",
  "                        fn=chat_with_documents,\n",
  "                        additional_inputs=[num_chunks],\n",
  "                        type=\"messages\",\n",
  "                        title=\"Academic Assistant Chat\",\n",
  "                        description=\"Ask questions about your processed documents\"\n",
  "                    )\n",
  "\n",
  "    # Event handlers\n",
  "    process_btn.click(\n",
  "        fn=process_documents,\n",
  "        inputs=[knowledge_dir, max_file_size, chunk_size, chunk_overlap],\n",
  "        outputs=[status_output, detailed_output]\n",
  "    )\n",
  "\n",
  "    reset_btn.click(\n",
  "        fn=reset_conversation,\n",
  "        outputs=reset_output\n",
  "    )\n"
 ] },
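 { "cell_type": "markdown", "id": "5c1f0a7e-2b3d-4e8f-9a6b-1d2c3e4f5a6b", "metadata": {}, "source": [
  "### Optional: retrieval sanity check\n",
  "A minimal sketch, separate from the app flow: after processing documents in the Configuration tab, query the Chroma store directly to confirm that retrieval returns sensible chunks before involving the LLM. The query string below is an illustrative placeholder."
 ] },
 { "cell_type": "code", "execution_count": null, "id": "7d8e9f0a-1b2c-4d3e-8f4a-5b6c7d8e9f0a", "metadata": {}, "outputs": [], "source": [
  "# Sanity check: bypass the LLM and inspect raw retrieval results.\n",
  "# Assumes process_documents() has already populated current_vectorstore;\n",
  "# the query text is an illustrative placeholder.\n",
  "if current_vectorstore is not None:\n",
  "    hits = current_vectorstore.similarity_search(\"grading policy\", k=3)\n",
  "    for hit in hits:\n",
  "        print(hit.metadata.get(\"file_name\", \"?\"), \"->\", hit.page_content[:120])\n",
  "else:\n",
  "    print(\"No vector store yet - process documents in the Configuration tab first.\")\n"
 ] },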
 { "cell_type": "code", "execution_count": null, "id": "9eb807e0-194b-48dd-a1e9-b1b9b8a99620", "metadata": {}, "outputs": [], "source": [
  "app.launch(share=True, inbrowser=True)"
 ] }
 ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.13" } }, "nbformat": 4, "nbformat_minor": 5 }