Added week 5 challenge to community-contributions
@@ -0,0 +1,409 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "97a93fee-6bbd-477b-aba8-577d318a9f9d",
"metadata": {},
"source": [
"# AI-Powered Academic Knowledge Assistant\n",
"An AI-powered RAG (Retrieval-Augmented Generation) system that transforms document collections into queryable knowledge bases using OpenAI embeddings and vector search. It features configurable chunking, file-size limits, and retrieval parameters, along with a Gradio interface for processing PDFs and generating contextually aware responses via LangChain and ChromaDB."
]
},
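{
"cell_type": "markdown",
"id": "env-setup-note",
"metadata": {},
"source": [
"### Environment setup (optional)\n",
"The package list below is only a suggestion inferred from the imports in the next cell; your environment may already have everything installed."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "env-setup-cell",
"metadata": {},
"outputs": [],
"source": [
"# Packages inferred from the imports in the next cell; uncomment and run once if any are missing\n",
"# %pip install -q langchain langchain-openai langchain-chroma langchain-community pypdf tiktoken gradio python-dotenv"
]
},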
{
"cell_type": "code",
"execution_count": null,
"id": "3589eee0-ce34-42f4-b538-b43f3b0d9f6f",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import glob\n",
"from dotenv import load_dotenv\n",
"import gradio as gr\n",
"import shutil\n",
"import tiktoken\n",
"import time\n",
"import uuid\n",
"from typing import List, Tuple, Optional\n",
"\n",
"# imports for langchain and Chroma\n",
"from langchain_community.document_loaders import PyPDFLoader\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.schema import Document\n",
"from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n",
"from langchain_chroma import Chroma\n",
"from langchain.memory import ConversationBufferMemory\n",
"from langchain.chains import ConversationalRetrievalChain\n",
"\n",
"\n",
"# Load environment variables\n",
"load_dotenv(override=True)\n",
"os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')\n",
"\n",
"# Global variables to store the current setup\n",
"current_vectorstore = None\n",
"current_conversation_chain = None\n",
"processing_status = \"\"\n",
"\n",
"def count_tokens(text: str, model: str = \"gpt-4o-mini\") -> int:\n",
" \"\"\"Count tokens in text using tiktoken\"\"\"\n",
" try:\n",
" encoding = tiktoken.encoding_for_model(model)\n",
" return len(encoding.encode(text))\n",
"    except Exception:\n",
" # Fallback estimation: roughly 4 characters per token\n",
" return len(text) // 4\n",
"\n",
"def filter_chunks_by_tokens(chunks: List[Document], max_total_tokens: int = 250000) -> List[Document]:\n",
" \"\"\"Filter chunks to stay within token limits\"\"\"\n",
"    # The cap helps keep embedding requests within OpenAI's per-request token limits, so building the vector store doesn't fail\n",
"    filtered_chunks = []\n",
" total_tokens = 0\n",
" \n",
" for chunk in chunks:\n",
" chunk_tokens = count_tokens(chunk.page_content)\n",
" \n",
" # Skip individual chunks that are too large (shouldn't happen with proper splitting)\n",
" if chunk_tokens > 8000: # Individual chunk limit\n",
" continue\n",
" \n",
" if total_tokens + chunk_tokens <= max_total_tokens:\n",
" filtered_chunks.append(chunk)\n",
" total_tokens += chunk_tokens\n",
" else:\n",
" break\n",
" \n",
" return filtered_chunks\n",
"\n",
"def add_metadata(doc, doc_type, file_path):\n",
" \"\"\"Add metadata including document type and file information\"\"\"\n",
" doc.metadata[\"doc_type\"] = doc_type\n",
" doc.metadata[\"file_path\"] = file_path\n",
" doc.metadata[\"file_name\"] = os.path.basename(file_path)\n",
" return doc\n",
"\n",
"def check_file_size(file_path, max_size_bytes):\n",
" \"\"\"Check if file size is within the limit\"\"\"\n",
" try:\n",
" file_size = os.path.getsize(file_path)\n",
" return file_size <= max_size_bytes, file_size\n",
" except OSError:\n",
" return False, 0\n",
"\n",
"def load_pdfs_with_size_limit(folder_path, doc_type, max_size_bytes):\n",
" \"\"\"Load PDF files from a folder with size restrictions\"\"\"\n",
" pdf_files = glob.glob(os.path.join(folder_path, \"**/*.pdf\"), recursive=True)\n",
" loaded_docs = []\n",
" skipped_files = []\n",
" \n",
" for pdf_file in pdf_files:\n",
" is_valid_size, file_size = check_file_size(pdf_file, max_size_bytes)\n",
" \n",
" if is_valid_size:\n",
" try:\n",
" loader = PyPDFLoader(pdf_file)\n",
" docs = loader.load()\n",
" docs_with_metadata = [add_metadata(doc, doc_type, pdf_file) for doc in docs]\n",
" loaded_docs.extend(docs_with_metadata)\n",
" except Exception as e:\n",
" skipped_files.append((pdf_file, f\"Loading error: {str(e)}\"))\n",
" else:\n",
" file_size_mb = file_size / 1024 / 1024\n",
" skipped_files.append((pdf_file, f\"File too large: {file_size_mb:.2f} MB\"))\n",
" \n",
" return loaded_docs, skipped_files\n",
"\n",
"def process_documents(knowledge_base_dir: str, max_file_size_mb: float, chunk_size: int, chunk_overlap: int) -> Tuple[str, str]:\n",
" \"\"\"Process documents and create vector store\"\"\"\n",
" global current_vectorstore, current_conversation_chain\n",
" \n",
" try:\n",
" # Validate directory\n",
" if not knowledge_base_dir or not knowledge_base_dir.strip():\n",
" return \"❌ Error: Please enter a directory path!\", \"\"\n",
" \n",
" directory_path = knowledge_base_dir.strip()\n",
" \n",
" if not os.path.exists(directory_path):\n",
" return \"❌ Error: Directory does not exist! Please check the path.\", \"\"\n",
" \n",
" # Configuration\n",
" MAX_FILE_SIZE_BYTES = int(max_file_size_mb * 1024 * 1024)\n",
" \n",
" # Find folders\n",
" if directory_path.endswith('*'):\n",
" folders = glob.glob(directory_path)\n",
" else:\n",
" folders = glob.glob(os.path.join(directory_path, \"*\"))\n",
" \n",
" if not folders:\n",
" return \"❌ Error: No folders found in the specified directory!\", \"\"\n",
" \n",
" # Process documents\n",
" documents = []\n",
" all_skipped_files = []\n",
" status_lines = []\n",
" \n",
" status_lines.append(f\"🔍 Processing folders with {max_file_size_mb} MB file size limit...\")\n",
" status_lines.append(\"-\" * 60)\n",
" \n",
" for folder in folders:\n",
" if os.path.isdir(folder):\n",
" doc_type = os.path.basename(folder)\n",
" status_lines.append(f\"📁 Processing folder: {doc_type}\")\n",
" \n",
" folder_docs, skipped_files = load_pdfs_with_size_limit(folder, doc_type, MAX_FILE_SIZE_BYTES)\n",
" documents.extend(folder_docs)\n",
" all_skipped_files.extend(skipped_files)\n",
" \n",
" if folder_docs:\n",
" status_lines.append(f\" ✅ Loaded {len(folder_docs)} document pages\")\n",
" if skipped_files:\n",
" status_lines.append(f\" ⚠️ Skipped {len(skipped_files)} files\")\n",
" \n",
" if not documents:\n",
" error_msg = \"❌ No PDF documents were loaded successfully.\"\n",
" if all_skipped_files:\n",
" error_msg += f\"\\n\\nAll {len(all_skipped_files)} files were skipped:\"\n",
" for file_path, reason in all_skipped_files[:10]: # Show first 10\n",
" error_msg += f\"\\n • {os.path.basename(file_path)}: {reason}\"\n",
" if len(all_skipped_files) > 10:\n",
" error_msg += f\"\\n ... and {len(all_skipped_files) - 10} more\"\n",
" return error_msg, \"\"\n",
" \n",
" # Text splitting\n",
" status_lines.append(\"\\n\" + \"=\"*40)\n",
" status_lines.append(\"✂️ TEXT SPLITTING\")\n",
" status_lines.append(\"=\"*40)\n",
" \n",
"        # CharacterTextSplitter only splits on blank-line separators, so some chunks may exceed chunk_size; far oversized ones are dropped by filter_chunks_by_tokens\n",
"        text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n",
" chunks = text_splitter.split_documents(documents)\n",
" \n",
" # Filter chunks by token count to prevent API errors\n",
" status_lines.append(\"🔢 Checking token limits...\")\n",
" original_chunk_count = len(chunks)\n",
" chunks = filter_chunks_by_tokens(chunks, max_total_tokens=250000)\n",
" \n",
" if len(chunks) < original_chunk_count:\n",
" status_lines.append(f\"⚠️ Filtered from {original_chunk_count} to {len(chunks)} chunks to stay within token limits\")\n",
" \n",
" # Create vectorstore\n",
" status_lines.append(\"🧮 Creating vector embeddings...\")\n",
" embeddings = OpenAIEmbeddings()\n",
" \n",
" # Use a temporary database name\n",
" db_name = \"temp_vector_db\"\n",
" \n",
" # Delete if already exists\n",
" if os.path.exists(db_name):\n",
" shutil.rmtree(db_name)\n",
" \n",
" # Create vectorstore\n",
" vectorstore = Chroma.from_documents(\n",
" documents=chunks, \n",
" embedding=embeddings, \n",
" persist_directory=db_name\n",
" )\n",
" \n",
" # Update global variables\n",
" current_vectorstore = vectorstore\n",
" \n",
" # Create conversation chain\n",
" llm = ChatOpenAI(temperature=0.7, model_name=\"gpt-4o-mini\")\n",
" memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)\n",
" retriever = vectorstore.as_retriever(search_kwargs={\"k\": 25})\n",
" current_conversation_chain = ConversationalRetrievalChain.from_llm(\n",
" llm=llm, \n",
" retriever=retriever, \n",
" memory=memory\n",
" )\n",
" \n",
" # Summary statistics\n",
" status_lines.append(\"\\n\" + \"=\"*40)\n",
" status_lines.append(\"📊 SUMMARY\")\n",
" status_lines.append(\"=\"*40)\n",
" status_lines.append(f\"✅ Total PDFs processed: {len(set(doc.metadata['file_path'] for doc in documents))}\")\n",
" status_lines.append(f\"📄 Total document pages: {len(documents)}\")\n",
" status_lines.append(f\"🧩 Total text chunks: {len(chunks)}\")\n",
" status_lines.append(f\"📁 Document types: {', '.join(set(doc.metadata['doc_type'] for doc in documents))}\")\n",
" status_lines.append(f\"🗃️ Vector store size: {vectorstore._collection.count()} embeddings\")\n",
" \n",
" if all_skipped_files:\n",
" status_lines.append(f\"\\n⚠️ Skipped files: {len(all_skipped_files)}\")\n",
" for file_path, reason in all_skipped_files[:5]: # Show first 5\n",
" status_lines.append(f\" • {os.path.basename(file_path)}: {reason}\")\n",
" if len(all_skipped_files) > 5:\n",
" status_lines.append(f\" ... and {len(all_skipped_files) - 5} more\")\n",
" \n",
" success_msg = \"✅ Knowledge base successfully created and ready for questions!\"\n",
" detailed_status = \"\\n\".join(status_lines)\n",
" \n",
" return success_msg, detailed_status\n",
" \n",
" except Exception as e:\n",
" error_msg = f\"❌ Error processing documents: {str(e)}\"\n",
" return error_msg, \"\"\n",
"\n",
"def chat_with_documents(message, history, num_chunks):\n",
" \"\"\"Chat with the processed documents\"\"\"\n",
" global current_conversation_chain, current_vectorstore\n",
" \n",
" if current_conversation_chain is None:\n",
" return \"❌ Please process documents first before asking questions!\"\n",
" \n",
" try:\n",
" # Update retriever with new chunk count\n",
" if current_vectorstore is not None:\n",
"            retriever = current_vectorstore.as_retriever(search_kwargs={\"k\": int(num_chunks)})  # slider values can arrive as floats\n",
" current_conversation_chain.retriever = retriever\n",
" \n",
" result = current_conversation_chain.invoke({\"question\": message})\n",
" return result[\"answer\"]\n",
" \n",
" except Exception as e:\n",
" return f\"❌ Error generating response: {str(e)}\"\n",
"\n",
"def reset_conversation():\n",
" \"\"\"Reset the conversation memory\"\"\"\n",
" global current_conversation_chain\n",
" if current_conversation_chain is not None:\n",
" current_conversation_chain.memory.clear()\n",
" return \"✅ Conversation history cleared!\"\n",
" return \"No active conversation to reset.\"\n",
"\n",
"# Create Gradio Interface\n",
"with gr.Blocks(title=\"AI-Powered Academic Knowledge Assistant\", theme=gr.themes.Soft()) as app:\n",
" gr.Markdown(\"# 🎓 AI-Powered Academic Knowledge Assistant\")\n",
" gr.Markdown(\"Transform your entire document library into an intelligent, searchable AI tutor that answers questions instantly.\")\n",
" \n",
" with gr.Tabs():\n",
" # Configuration Tab\n",
" with gr.Tab(\"⚙️ Configuration\"):\n",
" gr.Markdown(\"### 📁 Document Processing Settings\")\n",
" \n",
" gr.Markdown(\"💡 **Tip:** Copy and paste your folder path here. On mobile, you can use file manager apps to copy folder paths.\")\n",
" \n",
" with gr.Row():\n",
" with gr.Column():\n",
" knowledge_dir = gr.Textbox(\n",
" label=\"Knowledge Base Directory\",\n",
" value=r\"C:\\Users\\Documents\\Syllabi\\Georgia Tech\\Spring 22\\Microwave Design\",\n",
" placeholder=\"Enter or paste your document directory path\",\n",
" lines=1\n",
" )\n",
" \n",
" max_file_size = gr.Slider(\n",
" label=\"Max File Size (MB)\",\n",
" minimum=0.5,\n",
" maximum=50,\n",
" value=4,\n",
" step=0.5\n",
" )\n",
" \n",
" with gr.Column():\n",
" chunk_size = gr.Slider(\n",
" label=\"Chunk Size (characters)\",\n",
" minimum=200,\n",
" maximum=1500,\n",
" value=800,\n",
" step=100,\n",
" info=\"Smaller chunks = better token management\"\n",
" )\n",
" \n",
" chunk_overlap = gr.Slider(\n",
" label=\"Chunk Overlap (characters)\",\n",
" minimum=0,\n",
" maximum=300,\n",
" value=150,\n",
" step=25,\n",
" info=\"Overlap preserves context between chunks\"\n",
" )\n",
" \n",
" process_btn = gr.Button(\"🚀 Process Documents\", variant=\"primary\", size=\"lg\")\n",
" \n",
" with gr.Row():\n",
" status_output = gr.Textbox(\n",
" label=\"Status\",\n",
" lines=2,\n",
" max_lines=2\n",
" )\n",
" \n",
" detailed_output = gr.Textbox(\n",
" label=\"Detailed Processing Log\",\n",
" lines=15,\n",
" max_lines=20\n",
" )\n",
" \n",
" # Chat Tab\n",
" with gr.Tab(\"💬 Chat\"):\n",
" gr.Markdown(\"### 🤖 Ask Questions About Your Documents\")\n",
" \n",
" with gr.Row():\n",
" with gr.Column(scale=1):\n",
" num_chunks = gr.Slider(\n",
" label=\"Number of chunks to retrieve\",\n",
" minimum=1,\n",
" maximum=50,\n",
" value=25,\n",
" step=1\n",
" )\n",
" \n",
" reset_btn = gr.Button(\"🗑️ Clear Chat History\", variant=\"secondary\")\n",
" reset_output = gr.Textbox(label=\"Reset Status\", lines=1)\n",
" \n",
" with gr.Column(scale=3):\n",
" chatbot = gr.ChatInterface(\n",
"                # additional_inputs feeds the live slider value to chat_with_documents; reading num_chunks.value here would always return the initial setting\n",
"                fn=chat_with_documents,\n",
"                additional_inputs=[num_chunks],\n",
" type=\"messages\",\n",
" title=\"Academic Assistant Chat\",\n",
" description=\"Ask questions about your processed documents\"\n",
" )\n",
" \n",
" # Event handlers\n",
" process_btn.click(\n",
" fn=process_documents,\n",
" inputs=[knowledge_dir, max_file_size, chunk_size, chunk_overlap],\n",
" outputs=[status_output, detailed_output]\n",
" )\n",
" \n",
" reset_btn.click(\n",
" fn=reset_conversation,\n",
" outputs=reset_output\n",
" )\n"
]
},
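{
"cell_type": "markdown",
"id": "retrieval-check-note",
"metadata": {},
"source": [
"### Optional: retrieval sanity check\n",
"A minimal sketch for peeking at what the vector store returns before launching the UI. It assumes the **Process Documents** step has already been run in this session (so `current_vectorstore` is populated); `sample_query` is only an illustrative placeholder."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "retrieval-check-cell",
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check - assumes process_documents has been run, so current_vectorstore is set\n",
"# similarity_search returns the k most similar chunks for a query, which helps when tuning chunk size and k\n",
"sample_query = \"What topics does this material cover?\"  # illustrative query only\n",
"\n",
"if current_vectorstore is None:\n",
"    print(\"Run the Process Documents step first.\")\n",
"else:\n",
"    for i, doc in enumerate(current_vectorstore.similarity_search(sample_query, k=3), start=1):\n",
"        source_name = doc.metadata.get(\"file_name\", \"unknown\")\n",
"        print(f\"--- Chunk {i} (from {source_name}) ---\")\n",
"        print(doc.page_content[:300])\n",
"        print()"
]
},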
{
"cell_type": "code",
"execution_count": null,
"id": "9eb807e0-194b-48dd-a1e9-b1b9b8a99620",
"metadata": {},
"outputs": [],
"source": [
"app.launch(share=True, inbrowser=True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}