"""
KnowledgeHub - Personal Knowledge Management & Research Assistant
Main Gradio Application
"""
import os
import shutil
import json
import logging
from datetime import datetime
from pathlib import Path

import chromadb
import gradio as gr
# Setup logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Import utilities and agents
from utils import OllamaClient, EmbeddingModel, DocumentParser
from agents import (
IngestionAgent, QuestionAgent, SummaryAgent,
ConnectionAgent, ExportAgent
)
from models import Document
# Constants
VECTORSTORE_PATH = "./vectorstore"
TEMP_UPLOAD_PATH = "./temp_uploads"
DOCUMENTS_METADATA_PATH = "./vectorstore/documents_metadata.json"
# Ensure directories exist
os.makedirs(VECTORSTORE_PATH, exist_ok=True)
os.makedirs(TEMP_UPLOAD_PATH, exist_ok=True)
class KnowledgeHub:
"""Main application class managing all agents"""
def __init__(self):
logger.info("Initializing KnowledgeHub...")
# Initialize ChromaDB
self.client = chromadb.PersistentClient(path=VECTORSTORE_PATH)
self.collection = self.client.get_or_create_collection(
name="knowledge_base",
metadata={"description": "Personal knowledge management collection"}
)
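        # get_or_create_collection is idempotent: on restart it reattaches to the
        # existing on-disk collection instead of creating a new one, which is what
        # lets the vector store survive app restarts.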
# Initialize embedding model
self.embedding_model = EmbeddingModel()
# Initialize shared LLM client
self.llm_client = OllamaClient(model="llama3.2")
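        # Assumes the model has already been pulled locally (`ollama pull llama3.2`);
        # OllamaClient is the project's own wrapper from utils.py, not an official SDK.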
# Check Ollama connection
if not self.llm_client.check_connection():
logger.warning("⚠️ Cannot connect to Ollama. Please ensure Ollama is running.")
logger.warning("Start Ollama with: ollama serve")
else:
logger.info("✓ Connected to Ollama")
# Initialize agents
self.ingestion_agent = IngestionAgent(
collection=self.collection,
embedding_model=self.embedding_model,
llm_client=self.llm_client
)
self.question_agent = QuestionAgent(
collection=self.collection,
embedding_model=self.embedding_model,
llm_client=self.llm_client
)
self.summary_agent = SummaryAgent(
collection=self.collection,
llm_client=self.llm_client
)
self.connection_agent = ConnectionAgent(
collection=self.collection,
embedding_model=self.embedding_model,
llm_client=self.llm_client
)
self.export_agent = ExportAgent(
llm_client=self.llm_client
)
# Track uploaded documents
self.documents = {}
# Load existing documents from metadata file
self._load_documents_metadata()
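        # Two stores must stay in sync: the JSON sidecar carries UI-facing metadata
        # (filenames, timestamps), while the chunk embeddings live in ChromaDB.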
logger.info("✓ KnowledgeHub initialized successfully")
def _save_documents_metadata(self):
"""Save document metadata to JSON file"""
try:
metadata = {
doc_id: doc.to_dict()
for doc_id, doc in self.documents.items()
}
            with open(DOCUMENTS_METADATA_PATH, 'w', encoding='utf-8') as f:
json.dump(metadata, f, indent=2)
logger.debug(f"Saved metadata for {len(metadata)} documents")
except Exception as e:
logger.error(f"Error saving document metadata: {e}")
def _load_documents_metadata(self):
"""Load document metadata from JSON file"""
try:
if os.path.exists(DOCUMENTS_METADATA_PATH):
                with open(DOCUMENTS_METADATA_PATH, 'r', encoding='utf-8') as f:
metadata = json.load(f)
# Reconstruct Document objects (simplified - without chunks)
for doc_id, doc_data in metadata.items():
# Create a minimal Document object for UI purposes
# Full chunks are still in ChromaDB
doc = Document(
id=doc_id,
filename=doc_data['filename'],
filepath=doc_data.get('filepath', ''),
content=doc_data.get('content', ''),
chunks=[], # Chunks are in ChromaDB
metadata=doc_data.get('metadata', {}),
created_at=datetime.fromisoformat(doc_data['created_at'])
)
self.documents[doc_id] = doc
logger.info(f"✓ Loaded {len(self.documents)} existing documents from storage")
else:
logger.info("No existing documents found (starting fresh)")
except Exception as e:
logger.error(f"Error loading document metadata: {e}")
logger.info("Starting with empty document list")
def upload_document(self, files, progress=gr.Progress()):
"""Handle document upload - supports single or multiple files with progress tracking"""
if files is None or len(files) == 0:
return "⚠️ Please select file(s) to upload", "", []
# Convert single file to list for consistent handling
if not isinstance(files, list):
files = [files]
results = []
successful = 0
failed = 0
total_chunks = 0
# Initialize progress tracking
progress(0, desc="Starting upload...")
for file_idx, file in enumerate(files, 1):
# Update progress
progress_pct = (file_idx - 1) / len(files)
progress(progress_pct, desc=f"Processing {file_idx}/{len(files)}: {Path(file.name).name}")
try:
logger.info(f"Processing file {file_idx}/{len(files)}: {file.name}")
# Save uploaded file temporarily
temp_path = os.path.join(TEMP_UPLOAD_PATH, Path(file.name).name)
                # Copy file content without leaking a file handle
                if hasattr(file, 'read'):
                    with open(temp_path, 'wb') as f:
                        f.write(file.read())
                else:
                    shutil.copy(file.name, temp_path)
# Process document
document = self.ingestion_agent.process(temp_path)
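                # The agent is expected to return a Document exposing .id, .filename,
                # .num_chunks, and .total_chars (inferred from the usage below).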
# Store document reference
self.documents[document.id] = document
# Track stats
successful += 1
total_chunks += document.num_chunks
# Add to results
                results.append({
                    'status': '✅',
                    'filename': document.filename,
                    'chunks': document.num_chunks,
                    'size': f"{document.total_chars:,} chars"
                })
# Clean up temp file
os.remove(temp_path)
except Exception as e:
logger.error(f"Error processing {file.name}: {e}")
failed += 1
                results.append({
                    'status': '❌',
                    'filename': Path(file.name).name,
                    'chunks': 0,
                    'size': f"Error: {str(e)[:50]}"
                })
# Final progress update
progress(1.0, desc="Upload complete!")
# Save metadata once after all uploads
if successful > 0:
self._save_documents_metadata()
# Create summary
summary = f"""## Upload Complete! 🎉
**Total Files:** {len(files)}
**✅ Successful:** {successful}
**❌ Failed:** {failed}
**Total Chunks Created:** {total_chunks:,}
{f"⚠️ **{failed} file(s) failed** - Check results table below for details" if failed > 0 else "All files processed successfully!"}
"""
# Create detailed results table
results_table = [[r['status'], r['filename'], r['chunks'], r['size']] for r in results]
# Create preview of first successful document
preview = ""
for doc in self.documents.values():
            if doc.filename in [r['filename'] for r in results if r['status'] == '✅']:
preview = doc.content[:500] + "..." if len(doc.content) > 500 else doc.content
break
return summary, preview, results_table
def ask_question(self, question, top_k, progress=gr.Progress()):
"""Handle question answering with progress tracking"""
if not question.strip():
return "⚠️ Please enter a question", [], ""
try:
# Initial status
progress(0, desc="Processing your question...")
status = "🔄 **Searching knowledge base...**\n\nRetrieving relevant documents..."
logger.info(f"Answering question: {question[:100]}")
# Update progress
progress(0.3, desc="Finding relevant documents...")
result = self.question_agent.process(question, top_k=top_k)
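            # result is expected to be a dict like {'answer': str, 'num_sources': int,
            # 'sources': [{'document', 'score', 'preview'}, ...]} (inferred from the
            # formatting code below; the agent itself lives in agents.py).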
# Update progress
progress(0.7, desc="Generating answer with LLM...")
# Format answer
answer = f"""### Answer\n\n{result['answer']}\n\n"""
if result['sources']:
answer += f"**Sources:** {result['num_sources']} documents referenced\n\n"
# Format sources for display
sources_data = []
for i, source in enumerate(result['sources'], 1):
sources_data.append([
i,
source['document'],
f"{source['score']:.2%}",
source['preview']
])
progress(1.0, desc="Answer ready!")
return answer, sources_data, "✅ Answer generated successfully!"
except Exception as e:
logger.error(f"Error answering question: {e}")
return f"❌ Error: {str(e)}", [], f"❌ Error: {str(e)}"
def create_summary(self, doc_selector, progress=gr.Progress()):
"""Create document summary with progress tracking"""
if not doc_selector:
return "⚠️ Please select a document to summarize", ""
try:
# Initial status
progress(0, desc="Preparing to summarize...")
            logger.debug(f"doc_selector: {doc_selector}")
doc_id = doc_selector.split(" -|- ")[1]
document = self.documents.get(doc_id)
if not document:
return "", "❌ Document not found"
# Update status
status_msg = f"🔄 **Generating summary for:** {document.filename}\n\nPlease wait, this may take 10-20 seconds..."
progress(0.3, desc=f"Analyzing {document.filename}...")
logger.info(f"Creating summary for: {document.filename}")
# Generate summary
summary = self.summary_agent.process(
document_id=doc_id,
document_name=document.filename
)
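            # summary is expected to expose .document_name, .summary_text, and
            # .key_points (inferred from the formatting below).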
progress(1.0, desc="Summary complete!")
# Format result
result = f"""## Summary of {summary.document_name}\n\n{summary.summary_text}\n\n"""
if summary.key_points:
result += "### Key Points\n\n"
for point in summary.key_points:
result += f"- {point}\n"
return result, "✅ Summary generated successfully!"
except Exception as e:
logger.error(f"Error creating summary: {e}")
return "", f"❌ Error: {str(e)}"
def find_connections(self, doc_selector, top_k, progress=gr.Progress()):
"""Find related documents with progress tracking"""
if not doc_selector:
return "⚠️ Please select a document", [], ""
try:
progress(0, desc="Preparing to find connections...")
doc_id = doc_selector.split(" -|- ")[1]
document = self.documents.get(doc_id)
if not document:
return "❌ Document not found", [], "❌ Document not found"
status = f"🔄 **Finding documents related to:** {document.filename}\n\nSearching knowledge base..."
progress(0.3, desc=f"Analyzing {document.filename}...")
logger.info(f"Finding connections for: {document.filename}")
result = self.connection_agent.process(document_id=doc_id, top_k=top_k)
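            # result is expected to be {'error': str} on failure, else
            # {'source_document': str, 'num_related': int,
            #  'related': [{'document_name', 'similarity', 'preview'}, ...]}
            # (inferred from the usage below).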
progress(0.8, desc="Calculating similarity scores...")
if 'error' in result:
return f"❌ Error: {result['error']}", [], f"❌ Error: {result['error']}"
message = f"""## Related Documents\n\n**Source:** {result['source_document']}\n\n"""
message += f"**Found {result['num_related']} related documents:**\n\n"""
# Format for table
table_data = []
for i, rel in enumerate(result['related'], 1):
table_data.append([
i,
rel['document_name'],
f"{rel['similarity']:.2%}",
rel['preview']
])
progress(1.0, desc="Connections found!")
return message, table_data, "✅ Related documents found!"
except Exception as e:
logger.error(f"Error finding connections: {e}")
return f"❌ Error: {str(e)}", [], f"❌ Error: {str(e)}"
def export_knowledge(self, format_choice):
"""Export knowledge base"""
try:
logger.info(f"Exporting as {format_choice}")
# Get statistics
stats = self.ingestion_agent.get_statistics()
# Create export content
content = {
'title': 'Knowledge Base Export',
'summary': f"Total documents in knowledge base: {len(self.documents)}",
'sections': [
{
'title': 'Documents',
'content': '\n'.join([f"- {doc.filename}" for doc in self.documents.values()])
},
{
'title': 'Statistics',
'content': f"Total chunks stored: {stats['total_chunks']}"
}
]
}
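            # ExportAgent.process is assumed to take this content dict plus a format
            # string ("markdown" | "html" | "text") and return the rendered text.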
# Export
if format_choice == "Markdown":
output = self.export_agent.process(content, format="markdown")
filename = f"knowledge_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"
elif format_choice == "HTML":
output = self.export_agent.process(content, format="html")
filename = f"knowledge_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.html"
else: # Text
output = self.export_agent.process(content, format="text")
filename = f"knowledge_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
# Save file
export_path = os.path.join(TEMP_UPLOAD_PATH, filename)
with open(export_path, 'w', encoding='utf-8') as f:
f.write(output)
return f"✅ Exported as {format_choice}", export_path
except Exception as e:
logger.error(f"Error exporting: {e}")
return f"❌ Error: {str(e)}", None
def get_statistics(self):
"""Get knowledge base statistics"""
try:
stats = self.ingestion_agent.get_statistics()
total_docs = len(self.documents)
total_chunks = stats.get('total_chunks', 0)
total_chars = sum(doc.total_chars for doc in self.documents.values())
# Check if data is persisted
persistence_status = "✅ Enabled" if os.path.exists(DOCUMENTS_METADATA_PATH) else "⚠️ Not configured"
vectorstore_size = self._get_directory_size(VECTORSTORE_PATH)
stats_text = f"""## Knowledge Base Statistics
**Persistence Status:** {persistence_status}
**Total Documents:** {total_docs}
**Total Chunks:** {total_chunks:,}
**Total Characters:** {total_chars:,}
**Vector Store Size:** {vectorstore_size}
### Storage Locations
- **Vector DB:** `{VECTORSTORE_PATH}/`
- **Metadata:** `{DOCUMENTS_METADATA_PATH}`
**📝 Note:** Your data persists across app restarts!
**Recent Documents:**
{chr(10).join([f"- {doc.filename} ({doc.num_chunks} chunks)" for doc in list(self.documents.values())[-5:]])}
"""
if self.documents:
stats_text += "\n".join([f"- {doc.filename} ({doc.num_chunks} chunks, added {doc.created_at.strftime('%Y-%m-%d')})"
for doc in list(self.documents.values())[-10:]])
else:
stats_text += "\n*No documents yet. Upload some to get started!*"
return stats_text
except Exception as e:
return f"❌ Error: {str(e)}"
def _get_directory_size(self, path):
"""Calculate directory size"""
try:
total_size = 0
for dirpath, dirnames, filenames in os.walk(path):
for filename in filenames:
filepath = os.path.join(dirpath, filename)
if os.path.exists(filepath):
total_size += os.path.getsize(filepath)
# Convert to human readable
for unit in ['B', 'KB', 'MB', 'GB']:
if total_size < 1024.0:
return f"{total_size:.1f} {unit}"
total_size /= 1024.0
return f"{total_size:.1f} TB"
        except Exception:
            return "Unknown"
def get_document_list(self):
"""Get list of documents for dropdown"""
new_choices = [f"{doc.filename} -|- {doc.id}" for doc in self.documents.values()]
return gr.update(choices=new_choices, value=None)
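    # Dropdown entries are encoded as "<filename> -|- <doc_id>"; consumers must
    # split on " -|- " and take index 1 to recover the id (see create_summary,
    # find_connections, and delete_document).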
def delete_document(self, doc_selector):
"""Delete a document from the knowledge base"""
if not doc_selector:
return "⚠️ Please select a document to delete", self.get_document_list()
try:
doc_id = doc_selector.split(" - ")[0]
document = self.documents.get(doc_id)
if not document:
return "❌ Document not found", self.get_document_list()
# Delete from ChromaDB
success = self.ingestion_agent.delete_document(doc_id)
if success:
# Remove from documents dict
filename = document.filename
del self.documents[doc_id]
# Save updated metadata
self._save_documents_metadata()
return f"✅ Deleted: {filename}", self.get_document_list()
            else:
                return "❌ Error deleting document", self.get_document_list()
except Exception as e:
logger.error(f"Error deleting document: {e}")
return f"❌ Error: {str(e)}", self.get_document_list()
def clear_all_documents(self):
"""Clear entire knowledge base"""
try:
# Delete collection
self.client.delete_collection("knowledge_base")
# Recreate empty collection
self.collection = self.client.create_collection(
name="knowledge_base",
metadata={"description": "Personal knowledge management collection"}
)
# Update agents with new collection
self.ingestion_agent.collection = self.collection
self.question_agent.collection = self.collection
self.summary_agent.collection = self.collection
self.connection_agent.collection = self.collection
# Clear documents
self.documents = {}
self._save_documents_metadata()
return "✅ All documents cleared from knowledge base"
except Exception as e:
logger.error(f"Error clearing database: {e}")
return f"❌ Error: {str(e)}"
def create_ui():
"""Create Gradio interface"""
# Initialize app
app = KnowledgeHub()
# Custom CSS
custom_css = """
.main-header {
text-align: center;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 30px;
border-radius: 10px;
margin-bottom: 20px;
}
.stat-box {
background: #f8f9fa;
padding: 15px;
border-radius: 8px;
border-left: 4px solid #667eea;
}
"""
with gr.Blocks(title="KnowledgeHub", css=custom_css, theme=gr.themes.Soft()) as interface:
# Header
gr.HTML("""
<div class="main-header">
<h1>🧠 KnowledgeHub</h1>
<p>Personal Knowledge Management & Research Assistant</p>
<p style="font-size: 14px; opacity: 0.9;">
Powered by Ollama (Llama 3.2) • Fully Local & Private
</p>
</div>
""")
# Main tabs
with gr.Tabs():
# Tab 1: Upload Documents
with gr.Tab("📤 Upload Documents"):
gr.Markdown("### Upload your documents to build your knowledge base")
gr.Markdown("*Supported formats: PDF, DOCX, TXT, MD, HTML, PY*")
gr.Markdown("*💡 Tip: You can select multiple files at once!*")
with gr.Row():
with gr.Column():
file_input = gr.File(
label="Select Document(s)",
file_types=[".pdf", ".docx", ".txt", ".md", ".html", ".py"],
file_count="multiple" # Enable multiple file selection
)
upload_btn = gr.Button("📤 Upload & Process", variant="primary")
with gr.Column():
upload_status = gr.Markdown("Ready to upload documents")
# Results table for batch uploads
with gr.Row():
upload_results = gr.Dataframe(
headers=["Status", "Filename", "Chunks", "Size"],
label="Upload Results",
wrap=True,
visible=True
)
with gr.Row():
document_preview = gr.Textbox(
label="Document Preview (First Uploaded)",
lines=10,
max_lines=15
)
upload_btn.click(
fn=app.upload_document,
inputs=[file_input],
outputs=[upload_status, document_preview, upload_results]
)
# Tab 2: Ask Questions
with gr.Tab("❓ Ask Questions"):
gr.Markdown("### Ask questions about your documents")
gr.Markdown("*Uses RAG (Retrieval Augmented Generation) to answer based on your knowledge base*")
with gr.Row():
with gr.Column(scale=3):
question_input = gr.Textbox(
label="Your Question",
placeholder="What would you like to know?",
lines=3
)
with gr.Column(scale=1):
top_k_slider = gr.Slider(
minimum=1,
maximum=10,
value=5,
step=1,
label="Number of sources"
)
ask_btn = gr.Button("🔍 Ask", variant="primary")
qa_status = gr.Markdown("Ready to answer questions")
answer_output = gr.Markdown(label="Answer")
sources_table = gr.Dataframe(
headers=["#", "Document", "Relevance", "Preview"],
label="Sources",
wrap=True
)
ask_btn.click(
fn=app.ask_question,
inputs=[question_input, top_k_slider],
outputs=[answer_output, sources_table, qa_status]
)
# Tab 3: Summarize
with gr.Tab("📝 Summarize"):
gr.Markdown("### Generate summaries and extract key points")
with gr.Row():
with gr.Column():
doc_selector = gr.Dropdown(
choices=[],
label="Select Document",
info="Choose a document to summarize",
allow_custom_value=True
)
refresh_btn = gr.Button("🔄 Refresh List")
summarize_btn = gr.Button("📝 Generate Summary", variant="primary")
summary_status = gr.Markdown("Ready to generate summaries")
with gr.Column(scale=2):
summary_output = gr.Markdown(label="Summary")
summarize_btn.click(
fn=app.create_summary,
inputs=[doc_selector],
outputs=[summary_output, summary_status]
)
refresh_btn.click(
fn=app.get_document_list,
outputs=[doc_selector]
)
# Tab 4: Find Connections
with gr.Tab("🔗 Find Connections"):
gr.Markdown("### Discover relationships between documents")
with gr.Row():
with gr.Column():
conn_doc_selector = gr.Dropdown(
choices=[],
label="Select Document",
info="Find documents related to this one",
allow_custom_value=True
)
conn_top_k = gr.Slider(
minimum=1,
maximum=10,
value=5,
step=1,
label="Number of related documents"
)
refresh_conn_btn = gr.Button("🔄 Refresh List")
find_btn = gr.Button("🔗 Find Connections", variant="primary")
connection_status = gr.Markdown("Ready to find connections")
connection_output = gr.Markdown(label="Connections")
connections_table = gr.Dataframe(
headers=["#", "Document", "Similarity", "Preview"],
label="Related Documents",
wrap=True
)
find_btn.click(
fn=app.find_connections,
inputs=[conn_doc_selector, conn_top_k],
outputs=[connection_output, connections_table, connection_status]
)
refresh_conn_btn.click(
fn=app.get_document_list,
outputs=[conn_doc_selector]
)
# Tab 5: Export
with gr.Tab("💾 Export"):
gr.Markdown("### Export your knowledge base")
with gr.Row():
with gr.Column():
format_choice = gr.Radio(
choices=["Markdown", "HTML", "Text"],
value="Markdown",
label="Export Format"
)
export_btn = gr.Button("💾 Export", variant="primary")
with gr.Column():
export_status = gr.Markdown("Ready to export")
export_file = gr.File(label="Download Export")
export_btn.click(
fn=app.export_knowledge,
inputs=[format_choice],
outputs=[export_status, export_file]
)
# Tab 6: Manage Documents
with gr.Tab("🗂️ Manage Documents"):
gr.Markdown("### Manage your document library")
with gr.Row():
with gr.Column():
gr.Markdown("#### Delete Document")
delete_doc_selector = gr.Dropdown(
choices=[],
label="Select Document to Delete",
info="Choose a document to remove from knowledge base"
)
with gr.Row():
refresh_delete_btn = gr.Button("🔄 Refresh List")
delete_btn = gr.Button("🗑️ Delete Document", variant="stop")
delete_status = gr.Markdown("")
with gr.Column():
gr.Markdown("#### Clear All Documents")
gr.Markdown("⚠️ **Warning:** This will delete your entire knowledge base!")
clear_confirm = gr.Textbox(
label="Type 'DELETE ALL' to confirm",
placeholder="DELETE ALL"
)
clear_all_btn = gr.Button("🗑️ Clear All Documents", variant="stop")
clear_status = gr.Markdown("")
def confirm_and_clear(confirm_text):
if confirm_text.strip() == "DELETE ALL":
return app.clear_all_documents()
else:
return "⚠️ Please type 'DELETE ALL' to confirm"
delete_btn.click(
fn=app.delete_document,
inputs=[delete_doc_selector],
outputs=[delete_status, delete_doc_selector]
)
refresh_delete_btn.click(
fn=app.get_document_list,
outputs=[delete_doc_selector]
)
clear_all_btn.click(
fn=confirm_and_clear,
inputs=[clear_confirm],
outputs=[clear_status]
)
# Tab 7: Statistics
with gr.Tab("📊 Statistics"):
gr.Markdown("### Knowledge Base Overview")
stats_output = gr.Markdown()
stats_btn = gr.Button("🔄 Refresh Statistics", variant="primary")
stats_btn.click(
fn=app.get_statistics,
outputs=[stats_output]
)
# Auto-load stats on tab open
interface.load(
fn=app.get_statistics,
outputs=[stats_output]
)
# Footer
gr.HTML("""
<div style="text-align: center; margin-top: 30px; padding: 20px; color: #666;">
<p>🔒 All processing happens locally on your machine • Your data never leaves your computer</p>
<p style="font-size: 12px;">Powered by Ollama, ChromaDB, and Sentence Transformers</p>
</div>
""")
return interface
if __name__ == "__main__":
logger.info("Starting KnowledgeHub...")
# Create and launch interface
interface = create_ui()
interface.launch(
server_name="127.0.0.1",
server_port=7860,
share=False,
inbrowser=True
)
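
# To inspect the persisted vector store directly (handy for debugging), something like:
#   import chromadb
#   client = chromadb.PersistentClient(path="./vectorstore")
#   print(client.get_collection("knowledge_base").count())  # number of stored chunks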