From ef48ed539dccdfd06703a364e37d276073e9516f Mon Sep 17 00:00:00 2001 From: sach91 Date: Thu, 30 Oct 2025 15:42:04 +0530 Subject: [PATCH] sach91 bootcamp week8 exercise --- .../sach91-bootcamp/week8/README.md | 259 ++++++ .../sach91-bootcamp/week8/agents/__init__.py | 18 + .../week8/agents/base_agent.py | 91 ++ .../week8/agents/connection_agent.py | 289 ++++++ .../week8/agents/export_agent.py | 233 +++++ .../week8/agents/ingestion_agent.py | 157 ++++ .../week8/agents/question_agent.py | 156 ++++ .../week8/agents/summary_agent.py | 181 ++++ .../sach91-bootcamp/week8/app.py | 846 ++++++++++++++++++ .../sach91-bootcamp/week8/models/__init__.py | 13 + .../sach91-bootcamp/week8/models/document.py | 82 ++ .../week8/models/knowledge_graph.py | 110 +++ .../sach91-bootcamp/week8/requirements.txt | 26 + .../sach91-bootcamp/week8/start.bat | 71 ++ .../sach91-bootcamp/week8/start.sh | 42 + .../sach91-bootcamp/week8/utils/__init__.py | 12 + .../week8/utils/document_parser.py | 218 +++++ .../sach91-bootcamp/week8/utils/embeddings.py | 84 ++ .../week8/utils/ollama_client.py | 107 +++ .../sach91-bootcamp/week8/verify_setup.py | 129 +++ 20 files changed, 3124 insertions(+) create mode 100644 community-contributions/sach91-bootcamp/week8/README.md create mode 100644 community-contributions/sach91-bootcamp/week8/agents/__init__.py create mode 100644 community-contributions/sach91-bootcamp/week8/agents/base_agent.py create mode 100644 community-contributions/sach91-bootcamp/week8/agents/connection_agent.py create mode 100644 community-contributions/sach91-bootcamp/week8/agents/export_agent.py create mode 100644 community-contributions/sach91-bootcamp/week8/agents/ingestion_agent.py create mode 100644 community-contributions/sach91-bootcamp/week8/agents/question_agent.py create mode 100644 community-contributions/sach91-bootcamp/week8/agents/summary_agent.py create mode 100644 community-contributions/sach91-bootcamp/week8/app.py create mode 100644 community-contributions/sach91-bootcamp/week8/models/__init__.py create mode 100644 community-contributions/sach91-bootcamp/week8/models/document.py create mode 100644 community-contributions/sach91-bootcamp/week8/models/knowledge_graph.py create mode 100644 community-contributions/sach91-bootcamp/week8/requirements.txt create mode 100644 community-contributions/sach91-bootcamp/week8/start.bat create mode 100755 community-contributions/sach91-bootcamp/week8/start.sh create mode 100644 community-contributions/sach91-bootcamp/week8/utils/__init__.py create mode 100644 community-contributions/sach91-bootcamp/week8/utils/document_parser.py create mode 100644 community-contributions/sach91-bootcamp/week8/utils/embeddings.py create mode 100644 community-contributions/sach91-bootcamp/week8/utils/ollama_client.py create mode 100644 community-contributions/sach91-bootcamp/week8/verify_setup.py diff --git a/community-contributions/sach91-bootcamp/week8/README.md b/community-contributions/sach91-bootcamp/week8/README.md new file mode 100644 index 0000000..69d492a --- /dev/null +++ b/community-contributions/sach91-bootcamp/week8/README.md @@ -0,0 +1,259 @@ +# 🧠 KnowledgeHub - Personal Knowledge Management & Research Assistant + +An elegant, fully local AI-powered knowledge management system that helps you organize, search, and understand your documents using state-of-the-art LLM technology. 
+ +## ✨ Features + +### šŸŽÆ Core Capabilities +- **šŸ“¤ Document Ingestion**: Upload PDF, DOCX, TXT, MD, and HTML files +- **ā“ Intelligent Q&A**: Ask questions and get answers from your documents using RAG +- **šŸ“ Smart Summarization**: Generate concise summaries with key points +- **šŸ”— Connection Discovery**: Find relationships between documents +- **šŸ’¾ Multi-format Export**: Export as Markdown, HTML, or plain text +- **šŸ“Š Statistics Dashboard**: Track your knowledge base growth + +### šŸ”’ Privacy-First +- **100% Local Processing**: All data stays on your machine +- **No Cloud Dependencies**: Uses Ollama for local LLM inference +- **Open Source**: Full transparency and control + +### ⚔ Technology Stack +- **LLM**: Ollama with Llama 3.2 (3B) or Llama 3.1 (8B) +- **Embeddings**: sentence-transformers (all-MiniLM-L6-v2) +- **Vector Database**: ChromaDB +- **UI**: Gradio +- **Document Processing**: pypdf, python-docx, beautifulsoup4 + +## šŸš€ Quick Start + +### Prerequisites + +1. **Python 3.8+** installed +2. **Ollama** installed and running + +#### Installing Ollama + +**macOS/Linux:** +```bash +curl -fsSL https://ollama.com/install.sh | sh +``` + +**Windows:** +Download from [ollama.com/download](https://ollama.com/download) + +### Installation + +1. **Clone or download this repository** + +2. **Install Python dependencies:** +```bash +pip install -r requirements.txt +``` + +3. **Pull Llama model using Ollama:** +```bash +# For faster inference (recommended for most users) +ollama pull llama3.2 + +# OR for better quality (requires more RAM) +ollama pull llama3.1 +``` + +4. **Start Ollama server** (if not already running): +```bash +ollama serve +``` + +5. **Launch KnowledgeHub:** +```bash +python app.py +``` + +The application will open in your browser at `http://127.0.0.1:7860` + +## šŸ“– Usage Guide + +### 1. Upload Documents +- Go to the "Upload Documents" tab +- Select a file (PDF, DOCX, TXT, MD, or HTML) +- Click "Upload & Process" +- The document will be chunked and stored in your local vector database + +### 2. Ask Questions +- Go to the "Ask Questions" tab +- Type your question in natural language +- Adjust the number of sources to retrieve (default: 5) +- Click "Ask" to get an AI-generated answer with sources + +### 3. Summarize Documents +- Go to the "Summarize" tab +- Select a document from the dropdown +- Click "Generate Summary" +- Get a concise summary with key points + +### 4. Find Connections +- Go to the "Find Connections" tab +- Select a document to analyze +- Adjust how many related documents to find +- See documents that are semantically similar + +### 5. Export Knowledge +- Go to the "Export" tab +- Choose your format (Markdown, HTML, or Text) +- Click "Export" to download your knowledge base + +### 6. 
## 🏗️ Architecture

```
KnowledgeHub/
├── agents/                  # Specialized AI agents
│   ├── base_agent.py        # Base class for all agents
│   ├── ingestion_agent.py   # Document processing
│   ├── question_agent.py    # RAG-based Q&A
│   ├── summary_agent.py     # Summarization
│   ├── connection_agent.py  # Finding relationships
│   └── export_agent.py      # Exporting data
├── models/                  # Data models
│   ├── document.py          # Document structures
│   └── knowledge_graph.py   # Graph structures
├── utils/                   # Utilities
│   ├── ollama_client.py     # Ollama API wrapper
│   ├── embeddings.py        # Embedding generation
│   └── document_parser.py   # File parsing
├── vectorstore/             # ChromaDB storage (auto-created)
├── temp_uploads/            # Temporary file storage (auto-created)
├── app.py                   # Main Gradio application
└── requirements.txt         # Python dependencies
```

## 🎯 Multi-Agent Framework

KnowledgeHub uses a multi-agent architecture:

1. **Ingestion Agent**: Parses documents, creates chunks, generates embeddings
2. **Question Agent**: Retrieves relevant context and answers questions
3. **Summary Agent**: Creates concise summaries and extracts key points
4. **Connection Agent**: Finds semantic relationships between documents
5. **Export Agent**: Formats and exports knowledge in multiple formats

Each agent is independent, reusable, and focused on a specific task, following best practices in agentic AI development.

## ⚙️ Configuration

### Changing Models

Edit `app.py` to use a different model:

```python
# For Llama 3.1 8B (better quality, more RAM)
self.llm_client = OllamaClient(model="llama3.1")

# For Llama 3.2 3B (faster, less RAM)
self.llm_client = OllamaClient(model="llama3.2")
```

### Adjusting Chunk Size

Edit `agents/ingestion_agent.py`:

```python
self.parser = DocumentParser(
    chunk_size=1000,    # Characters per chunk
    chunk_overlap=200   # Overlap between chunks
)
```

### Changing Embedding Model

Edit `app.py`:

```python
self.embedding_model = EmbeddingModel(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
```

## 🔧 Troubleshooting

### "Cannot connect to Ollama"
- Ensure Ollama is installed: `ollama --version`
- Start the Ollama service: `ollama serve`
- Verify the model is pulled: `ollama list`

### "Module not found" errors
- Ensure all dependencies are installed: `pip install -r requirements.txt`
- Try upgrading pip: `pip install --upgrade pip`

### "Out of memory" errors
- Use Llama 3.2 (3B) instead of Llama 3.1 (8B)
- Reduce `chunk_size` in the document parser
- Process fewer documents at once

### Slow response times
- Ensure you're using a CUDA-enabled GPU (if available)
- Reduce the number of retrieved chunks (the `top_k` parameter)
- Use a smaller model (llama3.2)

## 🎓 Learning Resources

This project demonstrates key concepts in LLM engineering:

- **RAG (Retrieval Augmented Generation)**: Combining retrieval with generation (see the sketch below)
- **Vector Databases**: Using ChromaDB for semantic search
- **Multi-Agent Systems**: Specialized agents working together
- **Embeddings**: Semantic representation of text
- **Local LLM Deployment**: Using Ollama for privacy-focused AI
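
To make the RAG bullet above concrete, here is a minimal sketch of the retrieve-then-generate loop, condensed from `agents/question_agent.py` in this patch (the real agent also collects source metadata, converts distances to similarity scores, and handles the empty-result case):

```python
# Minimal retrieve-then-generate loop, condensed from agents/question_agent.py.
def answer_question(question, collection, embedding_model, llm, top_k=5):
    # 1. Embed the question and fetch the most similar chunks from ChromaDB
    query_embedding = embedding_model.embed_query(question)
    results = collection.query(query_embeddings=[query_embedding], n_results=top_k)

    # 2. Stitch the retrieved chunks into a numbered context block
    context = "\n\n".join(
        f"[Source {i}] {doc}"
        for i, doc in enumerate(results["documents"][0], 1)
    )

    # 3. Ask the local LLM to answer from that context only
    prompt = f"Context:\n{context}\n\nQuestion: {question}\n\nAnswer based on the context above."
    return llm.generate(prompt=prompt, temperature=0.3, max_tokens=1024)
```
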
## 📊 Performance

**Hardware Requirements:**
- Minimum: 8GB RAM, CPU
- Recommended: 16GB RAM, GPU (NVIDIA with CUDA)
- Optimal: 32GB RAM, GPU (RTX 3060 or better)

**Processing Speed** (Llama 3.2 on M1 Mac):
- Document ingestion: ~2-5 seconds per page
- Question answering: ~5-15 seconds
- Summarization: ~10-20 seconds

## 🤝 Contributing

This is a learning project showcasing LLM engineering principles. Feel free to:
- Experiment with different models
- Add new agents for specialized tasks
- Improve the UI
- Optimize performance

## 📄 License

This project is open source and available for educational purposes.

## 🙏 Acknowledgments

Built with:
- [Ollama](https://ollama.com/) - Local LLM runtime
- [Gradio](https://gradio.app/) - UI framework
- [ChromaDB](https://www.trychroma.com/) - Vector database
- [Sentence Transformers](https://www.sbert.net/) - Embeddings
- [Llama](https://ai.meta.com/llama/) - Meta's open source LLMs

## 🎯 Next Steps

Potential enhancements:
1. Add support for images and diagrams
2. Implement multi-document chat history
3. Build a visual knowledge graph
4. Add collaborative features
5. Create a mobile app interface
6. Implement advanced filters and search
7. Add citation tracking
8. Create automated study guides

---

**Made with ❤️ for the LLM Engineering Community**
diff --git a/community-contributions/sach91-bootcamp/week8/agents/__init__.py b/community-contributions/sach91-bootcamp/week8/agents/__init__.py
new file mode 100644
index 0000000..c33fc04
--- /dev/null
+++ b/community-contributions/sach91-bootcamp/week8/agents/__init__.py
@@ -0,0 +1,18 @@
"""
KnowledgeHub Agents
"""
from .base_agent import BaseAgent
from .ingestion_agent import IngestionAgent
from .question_agent import QuestionAgent
from .summary_agent import SummaryAgent
from .connection_agent import ConnectionAgent
from .export_agent import ExportAgent

__all__ = [
    'BaseAgent',
    'IngestionAgent',
    'QuestionAgent',
    'SummaryAgent',
    'ConnectionAgent',
    'ExportAgent'
]
diff --git a/community-contributions/sach91-bootcamp/week8/agents/base_agent.py b/community-contributions/sach91-bootcamp/week8/agents/base_agent.py
new file mode 100644
index 0000000..93cf278
--- /dev/null
+++ b/community-contributions/sach91-bootcamp/week8/agents/base_agent.py
@@ -0,0 +1,91 @@
"""
Base Agent class - Foundation for all specialized agents
"""
from abc import ABC, abstractmethod
import logging
from typing import Optional, Dict, Any
from utils.ollama_client import OllamaClient

logger = logging.getLogger(__name__)

class BaseAgent(ABC):
    """Abstract base class for all agents"""

    def __init__(self, name: str, llm_client: Optional[OllamaClient] = None,
                 model: str = "llama3.2"):
        """
        Initialize base agent

        Args:
            name: Agent name for logging
            llm_client: Shared Ollama client (creates new one if None)
            model: Ollama model to use
        """
        self.name = name
        self.model = model

        # Use shared client or create new one
        if llm_client is None:
            self.llm = OllamaClient(model=model)
            logger.info(f"{self.name} initialized with new LLM client (model: {model})")
        else:
            self.llm = llm_client
            logger.info(f"{self.name} initialized with shared LLM client (model: {model})")

    def generate(self, prompt: str, system: Optional[str] = None,
                 temperature: float = 0.7, max_tokens: int = 2048) -> str:
        """
        Generate text using the LLM

        Args:
            prompt: User prompt
            system: System message (optional)
            temperature: Sampling temperature
            max_tokens: Maximum tokens to generate

        Returns:
            Generated text
        """
        logger.info(f"{self.name} generating response")
        response = 
self.llm.generate( + prompt=prompt, + system=system, + temperature=temperature, + max_tokens=max_tokens + ) + logger.debug(f"{self.name} generated {len(response)} characters") + return response + + def chat(self, messages: list, temperature: float = 0.7, + max_tokens: int = 2048) -> str: + """ + Chat completion with message history + + Args: + messages: List of message dicts with 'role' and 'content' + temperature: Sampling temperature + max_tokens: Maximum tokens to generate + + Returns: + Generated text + """ + logger.info(f"{self.name} processing chat with {len(messages)} messages") + response = self.llm.chat( + messages=messages, + temperature=temperature, + max_tokens=max_tokens + ) + logger.debug(f"{self.name} generated {len(response)} characters") + return response + + @abstractmethod + def process(self, *args, **kwargs) -> Any: + """ + Main processing method - must be implemented by subclasses + + Each agent implements its specialized logic here + """ + pass + + def __str__(self): + return f"{self.name} (model: {self.model})" diff --git a/community-contributions/sach91-bootcamp/week8/agents/connection_agent.py b/community-contributions/sach91-bootcamp/week8/agents/connection_agent.py new file mode 100644 index 0000000..18aa25c --- /dev/null +++ b/community-contributions/sach91-bootcamp/week8/agents/connection_agent.py @@ -0,0 +1,289 @@ +""" +Connection Agent - Finds relationships and connections between documents +""" +import logging +from typing import List, Dict, Tuple +from agents.base_agent import BaseAgent +from models.knowledge_graph import KnowledgeNode, KnowledgeEdge, KnowledgeGraph +from utils.embeddings import EmbeddingModel +import chromadb +import numpy as np + +logger = logging.getLogger(__name__) + +class ConnectionAgent(BaseAgent): + """Agent that discovers connections between documents and concepts""" + + def __init__(self, collection: chromadb.Collection, + embedding_model: EmbeddingModel, + llm_client=None, model: str = "llama3.2"): + """ + Initialize connection agent + + Args: + collection: ChromaDB collection with documents + embedding_model: Model for computing similarities + llm_client: Optional shared LLM client + model: Ollama model name + """ + super().__init__(name="ConnectionAgent", llm_client=llm_client, model=model) + + self.collection = collection + self.embedding_model = embedding_model + + logger.info(f"{self.name} initialized") + + def process(self, document_id: str = None, query: str = None, + top_k: int = 5) -> Dict: + """ + Find documents related to a document or query + + Args: + document_id: ID of reference document + query: Search query (used if document_id not provided) + top_k: Number of related documents to find + + Returns: + Dictionary with related documents and connections + """ + if document_id: + logger.info(f"{self.name} finding connections for document: {document_id}") + return self._find_related_to_document(document_id, top_k) + elif query: + logger.info(f"{self.name} finding connections for query: {query[:100]}") + return self._find_related_to_query(query, top_k) + else: + return {'related': [], 'error': 'No document_id or query provided'} + + def _find_related_to_document(self, document_id: str, top_k: int) -> Dict: + """Find documents related to a specific document""" + try: + # Get chunks from the document + results = self.collection.get( + where={"document_id": document_id}, + include=['embeddings', 'documents', 'metadatas'] + ) + + if not results['ids']: + return {'related': [], 'error': 'Document not found'} + + # Use the 
first chunk's embedding as representative + query_embedding = results['embeddings'][0] + document_name = results['metadatas'][0].get('filename', 'Unknown') + + # Search for similar chunks from OTHER documents + search_results = self.collection.query( + query_embeddings=[query_embedding], + n_results=top_k * 3, # Get more to filter out same document + include=['documents', 'metadatas', 'distances'] + ) + + # Filter out chunks from the same document + related = [] + seen_docs = set([document_id]) + + if search_results['ids']: + for i in range(len(search_results['ids'][0])): + related_doc_id = search_results['metadatas'][0][i].get('document_id') + + if related_doc_id not in seen_docs: + seen_docs.add(related_doc_id) + + similarity = 1.0 - search_results['distances'][0][i] + + related.append({ + 'document_id': related_doc_id, + 'document_name': search_results['metadatas'][0][i].get('filename', 'Unknown'), + 'similarity': float(similarity), + 'preview': search_results['documents'][0][i][:150] + "..." + }) + + if len(related) >= top_k: + break + + return { + 'source_document': document_name, + 'source_id': document_id, + 'related': related, + 'num_related': len(related) + } + + except Exception as e: + logger.error(f"Error finding related documents: {e}") + return {'related': [], 'error': str(e)} + + def _find_related_to_query(self, query: str, top_k: int) -> Dict: + """Find documents related to a query""" + try: + # Generate query embedding + query_embedding = self.embedding_model.embed_query(query) + + # Search + results = self.collection.query( + query_embeddings=[query_embedding], + n_results=top_k * 2, # Get more to deduplicate by document + include=['documents', 'metadatas', 'distances'] + ) + + # Deduplicate by document + related = [] + seen_docs = set() + + if results['ids']: + for i in range(len(results['ids'][0])): + doc_id = results['metadatas'][0][i].get('document_id') + + if doc_id not in seen_docs: + seen_docs.add(doc_id) + + similarity = 1.0 - results['distances'][0][i] + + related.append({ + 'document_id': doc_id, + 'document_name': results['metadatas'][0][i].get('filename', 'Unknown'), + 'similarity': float(similarity), + 'preview': results['documents'][0][i][:150] + "..." 
+ }) + + if len(related) >= top_k: + break + + return { + 'query': query, + 'related': related, + 'num_related': len(related) + } + + except Exception as e: + logger.error(f"Error finding related documents: {e}") + return {'related': [], 'error': str(e)} + + def build_knowledge_graph(self, similarity_threshold: float = 0.7) -> KnowledgeGraph: + """ + Build a knowledge graph showing document relationships + + Args: + similarity_threshold: Minimum similarity to create an edge + + Returns: + KnowledgeGraph object + """ + logger.info(f"{self.name} building knowledge graph") + + graph = KnowledgeGraph() + + try: + # Get all documents + all_results = self.collection.get( + include=['embeddings', 'metadatas'] + ) + + if not all_results['ids']: + return graph + + # Group by document + documents = {} + for i, metadata in enumerate(all_results['metadatas']): + doc_id = metadata.get('document_id') + if doc_id not in documents: + documents[doc_id] = { + 'name': metadata.get('filename', 'Unknown'), + 'embedding': all_results['embeddings'][i] + } + + # Create nodes + for doc_id, doc_data in documents.items(): + node = KnowledgeNode( + id=doc_id, + name=doc_data['name'], + node_type='document', + description=f"Document: {doc_data['name']}" + ) + graph.add_node(node) + + # Create edges based on similarity + doc_ids = list(documents.keys()) + for i, doc_id1 in enumerate(doc_ids): + emb1 = np.array(documents[doc_id1]['embedding']) + + for doc_id2 in doc_ids[i+1:]: + emb2 = np.array(documents[doc_id2]['embedding']) + + # Calculate similarity + similarity = np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2)) + + if similarity >= similarity_threshold: + edge = KnowledgeEdge( + source_id=doc_id1, + target_id=doc_id2, + relationship='similar_to', + weight=float(similarity) + ) + graph.add_edge(edge) + + logger.info(f"{self.name} built graph with {len(graph.nodes)} nodes and {len(graph.edges)} edges") + return graph + + except Exception as e: + logger.error(f"Error building knowledge graph: {e}") + return graph + + def explain_connection(self, doc_id1: str, doc_id2: str) -> str: + """ + Use LLM to explain why two documents are related + + Args: + doc_id1: First document ID + doc_id2: Second document ID + + Returns: + Explanation text + """ + try: + # Get sample chunks from each document + results1 = self.collection.get( + where={"document_id": doc_id1}, + limit=2, + include=['documents', 'metadatas'] + ) + + results2 = self.collection.get( + where={"document_id": doc_id2}, + limit=2, + include=['documents', 'metadatas'] + ) + + if not results1['ids'] or not results2['ids']: + return "Could not retrieve documents" + + doc1_name = results1['metadatas'][0].get('filename', 'Document 1') + doc2_name = results2['metadatas'][0].get('filename', 'Document 2') + + doc1_text = " ".join(results1['documents'][:2])[:1000] + doc2_text = " ".join(results2['documents'][:2])[:1000] + + system_prompt = """You analyze documents and explain their relationships. +Provide a brief, clear explanation of how two documents are related.""" + + user_prompt = f"""Analyze these two documents and explain how they are related: + +Document 1 ({doc1_name}): +{doc1_text} + +Document 2 ({doc2_name}): +{doc2_text} + +How are these documents related? 
Provide a concise explanation:""" + + explanation = self.generate( + prompt=user_prompt, + system=system_prompt, + temperature=0.3, + max_tokens=256 + ) + + return explanation + + except Exception as e: + logger.error(f"Error explaining connection: {e}") + return f"Error: {str(e)}" diff --git a/community-contributions/sach91-bootcamp/week8/agents/export_agent.py b/community-contributions/sach91-bootcamp/week8/agents/export_agent.py new file mode 100644 index 0000000..9ec898f --- /dev/null +++ b/community-contributions/sach91-bootcamp/week8/agents/export_agent.py @@ -0,0 +1,233 @@ +""" +Export Agent - Generates formatted reports and exports +""" +import logging +from typing import List, Dict +from datetime import datetime +from agents.base_agent import BaseAgent +from models.document import Summary + +logger = logging.getLogger(__name__) + +class ExportAgent(BaseAgent): + """Agent that exports summaries and reports in various formats""" + + def __init__(self, llm_client=None, model: str = "llama3.2"): + """ + Initialize export agent + + Args: + llm_client: Optional shared LLM client + model: Ollama model name + """ + super().__init__(name="ExportAgent", llm_client=llm_client, model=model) + + logger.info(f"{self.name} initialized") + + def process(self, content: Dict, format: str = "markdown") -> str: + """ + Export content in specified format + + Args: + content: Content dictionary to export + format: Export format ('markdown', 'text', 'html') + + Returns: + Formatted content string + """ + logger.info(f"{self.name} exporting as {format}") + + if format == "markdown": + return self._export_markdown(content) + elif format == "text": + return self._export_text(content) + elif format == "html": + return self._export_html(content) + else: + return str(content) + + def _export_markdown(self, content: Dict) -> str: + """Export as Markdown""" + md = [] + md.append(f"# Knowledge Report") + md.append(f"\n*Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}*\n") + + if 'title' in content: + md.append(f"## {content['title']}\n") + + if 'summary' in content: + md.append(f"### Summary\n") + md.append(f"{content['summary']}\n") + + if 'key_points' in content and content['key_points']: + md.append(f"### Key Points\n") + for point in content['key_points']: + md.append(f"- {point}") + md.append("") + + if 'sections' in content: + for section in content['sections']: + md.append(f"### {section['title']}\n") + md.append(f"{section['content']}\n") + + if 'sources' in content and content['sources']: + md.append(f"### Sources\n") + for i, source in enumerate(content['sources'], 1): + md.append(f"{i}. {source}") + md.append("") + + return "\n".join(md) + + def _export_text(self, content: Dict) -> str: + """Export as plain text""" + lines = [] + lines.append("=" * 60) + lines.append("KNOWLEDGE REPORT") + lines.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}") + lines.append("=" * 60) + lines.append("") + + if 'title' in content: + lines.append(content['title']) + lines.append("-" * len(content['title'])) + lines.append("") + + if 'summary' in content: + lines.append("SUMMARY:") + lines.append(content['summary']) + lines.append("") + + if 'key_points' in content and content['key_points']: + lines.append("KEY POINTS:") + for i, point in enumerate(content['key_points'], 1): + lines.append(f" {i}. 
{point}")
            lines.append("")
        
        if 'sections' in content:
            for section in content['sections']:
                lines.append(section['title'].upper())
                lines.append("-" * 40)
                lines.append(section['content'])
                lines.append("")
        
        if 'sources' in content and content['sources']:
            lines.append("SOURCES:")
            for i, source in enumerate(content['sources'], 1):
                lines.append(f"  {i}. {source}")
        
        lines.append("")
        lines.append("=" * 60)
        
        return "\n".join(lines)
    
    def _export_html(self, content: Dict) -> str:
        """Export as HTML"""
        html = []
        html.append("<!DOCTYPE html>")
        html.append("<html>")
        html.append("<head>")
        html.append("    <meta charset='UTF-8'>")
        html.append("    <title>Knowledge Report</title>")
        html.append("    <style>body { font-family: sans-serif; margin: 2em; }</style>")
        html.append("</head>")
        html.append("<body>")
        
        html.append("<h1>Knowledge Report</h1>")
        html.append(f"<p><em>Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}</em></p>")
        
        if 'title' in content:
            html.append(f"<h2>{content['title']}</h2>")
        
        if 'summary' in content:
            html.append("<h3>Summary</h3>")
            html.append(f"<p>{content['summary']}</p>")
        
        if 'key_points' in content and content['key_points']:
            html.append("<h3>Key Points</h3>")
            html.append("<ul>")
            for point in content['key_points']:
                html.append(f"    <li>{point}</li>")
            html.append("</ul>")
        
        if 'sections' in content:
            for section in content['sections']:
                html.append(f"<h3>{section['title']}</h3>")
                html.append(f"<p>{section['content']}</p>")
        
        if 'sources' in content and content['sources']:
            html.append("<h3>Sources</h3>")
            html.append("<ol>")
            for source in content['sources']:
                html.append(f"    <li>{source}</li>")
            html.append("</ol>")
        
        html.append("</body>")
        html.append("</html>
") + + html.append("") + html.append("") + + return "\n".join(html) + + def create_study_guide(self, summaries: List[Summary]) -> str: + """ + Create a study guide from multiple summaries + + Args: + summaries: List of Summary objects + + Returns: + Formatted study guide + """ + logger.info(f"{self.name} creating study guide from {len(summaries)} summaries") + + # Compile all content + all_summaries = "\n\n".join([ + f"{s.document_name}:\n{s.summary_text}" + for s in summaries + ]) + + all_key_points = [] + for s in summaries: + all_key_points.extend(s.key_points) + + # Use LLM to create cohesive study guide + system_prompt = """You create excellent study guides that synthesize information from multiple sources. +Create a well-organized study guide with clear sections, key concepts, and important points.""" + + user_prompt = f"""Create a comprehensive study guide based on these document summaries: + +{all_summaries} + +Create a well-structured study guide with: +1. An overview +2. Key concepts +3. Important details +4. Study tips + +Study Guide:""" + + study_guide = self.generate( + prompt=user_prompt, + system=system_prompt, + temperature=0.5, + max_tokens=2048 + ) + + # Format as markdown + content = { + 'title': 'Study Guide', + 'sections': [ + {'title': 'Overview', 'content': study_guide}, + {'title': 'Key Points from All Documents', 'content': '\n'.join([f"• {p}" for p in all_key_points[:15]])} + ], + 'sources': [s.document_name for s in summaries] + } + + return self._export_markdown(content) diff --git a/community-contributions/sach91-bootcamp/week8/agents/ingestion_agent.py b/community-contributions/sach91-bootcamp/week8/agents/ingestion_agent.py new file mode 100644 index 0000000..4f55350 --- /dev/null +++ b/community-contributions/sach91-bootcamp/week8/agents/ingestion_agent.py @@ -0,0 +1,157 @@ +""" +Ingestion Agent - Processes and stores documents in the vector database +""" +import logging +from typing import Dict, List +import uuid +from datetime import datetime + +from agents.base_agent import BaseAgent +from models.document import Document, DocumentChunk +from utils.document_parser import DocumentParser +from utils.embeddings import EmbeddingModel +import chromadb + +logger = logging.getLogger(__name__) + +class IngestionAgent(BaseAgent): + """Agent responsible for ingesting and storing documents""" + + def __init__(self, collection: chromadb.Collection, + embedding_model: EmbeddingModel, + llm_client=None, model: str = "llama3.2"): + """ + Initialize ingestion agent + + Args: + collection: ChromaDB collection for storage + embedding_model: Model for generating embeddings + llm_client: Optional shared LLM client + model: Ollama model name + """ + super().__init__(name="IngestionAgent", llm_client=llm_client, model=model) + + self.collection = collection + self.embedding_model = embedding_model + self.parser = DocumentParser(chunk_size=1000, chunk_overlap=200) + + logger.info(f"{self.name} ready with ChromaDB collection") + + def process(self, file_path: str) -> Document: + """ + Process and ingest a document + + Args: + file_path: Path to the document file + + Returns: + Document object with metadata + """ + logger.info(f"{self.name} processing: {file_path}") + + # Parse the document + parsed = self.parser.parse_file(file_path) + + # Generate document ID + doc_id = str(uuid.uuid4()) + + # Create document chunks + chunks = [] + chunk_texts = [] + chunk_ids = [] + chunk_metadatas = [] + + for i, chunk_text in enumerate(parsed['chunks']): + chunk_id = f"{doc_id}_chunk_{i}" + + 
chunk = DocumentChunk( + id=chunk_id, + document_id=doc_id, + content=chunk_text, + chunk_index=i, + metadata={ + 'filename': parsed['filename'], + 'extension': parsed['extension'], + 'total_chunks': len(parsed['chunks']) + } + ) + + chunks.append(chunk) + chunk_texts.append(chunk_text) + chunk_ids.append(chunk_id) + chunk_metadatas.append({ + 'document_id': doc_id, + 'filename': parsed['filename'], + 'chunk_index': i, + 'extension': parsed['extension'] + }) + + # Generate embeddings + logger.info(f"{self.name} generating embeddings for {len(chunks)} chunks") + embeddings = self.embedding_model.embed_documents(chunk_texts) + + # Store in ChromaDB + logger.info(f"{self.name} storing in ChromaDB") + self.collection.add( + ids=chunk_ids, + documents=chunk_texts, + embeddings=embeddings, + metadatas=chunk_metadatas + ) + + # Create document object + document = Document( + id=doc_id, + filename=parsed['filename'], + filepath=parsed['filepath'], + content=parsed['text'], + chunks=chunks, + metadata={ + 'extension': parsed['extension'], + 'num_chunks': len(chunks), + 'total_chars': parsed['total_chars'] + }, + created_at=datetime.now() + ) + + logger.info(f"{self.name} successfully ingested: {document}") + return document + + def get_statistics(self) -> Dict: + """Get statistics about stored documents""" + try: + count = self.collection.count() + return { + 'total_chunks': count, + 'collection_name': self.collection.name + } + except Exception as e: + logger.error(f"Error getting statistics: {e}") + return {'total_chunks': 0, 'error': str(e)} + + def delete_document(self, document_id: str) -> bool: + """ + Delete all chunks of a document + + Args: + document_id: ID of document to delete + + Returns: + True if successful + """ + try: + # Get all chunk IDs for this document + results = self.collection.get( + where={"document_id": document_id} + ) + + if results['ids']: + self.collection.delete(ids=results['ids']) + logger.info(f"{self.name} deleted document {document_id}") + return True + + return False + + except Exception as e: + logger.error(f"Error deleting document: {e}") + return False diff --git a/community-contributions/sach91-bootcamp/week8/agents/question_agent.py b/community-contributions/sach91-bootcamp/week8/agents/question_agent.py new file mode 100644 index 0000000..efd723b --- /dev/null +++ b/community-contributions/sach91-bootcamp/week8/agents/question_agent.py @@ -0,0 +1,156 @@ +""" +Question Agent - Answers questions using RAG (Retrieval Augmented Generation) +""" +import logging +from typing import List, Dict +from agents.base_agent import BaseAgent +from models.document import SearchResult, DocumentChunk +from utils.embeddings import EmbeddingModel +import chromadb + +logger = logging.getLogger(__name__) + +class QuestionAgent(BaseAgent): + """Agent that answers questions using retrieved context""" + + def __init__(self, collection: chromadb.Collection, + embedding_model: EmbeddingModel, + llm_client=None, model: str = "llama3.2"): + """ + Initialize question agent + + Args: + collection: ChromaDB collection with documents + embedding_model: Model for query embeddings + llm_client: Optional shared LLM client + model: Ollama model name + """ + super().__init__(name="QuestionAgent", llm_client=llm_client, model=model) + + self.collection = collection + self.embedding_model = embedding_model + self.top_k = 5 # Number of chunks to retrieve + + logger.info(f"{self.name} initialized") + + def retrieve(self, query: str, top_k: int = None) -> List[SearchResult]: + """ + Retrieve 
relevant document chunks for a query + + Args: + query: Search query + top_k: Number of results to return (uses self.top_k if None) + + Returns: + List of SearchResult objects + """ + if top_k is None: + top_k = self.top_k + + logger.info(f"{self.name} retrieving top {top_k} chunks for query") + + # Generate query embedding + query_embedding = self.embedding_model.embed_query(query) + + # Search ChromaDB + results = self.collection.query( + query_embeddings=[query_embedding], + n_results=top_k + ) + + # Convert to SearchResult objects + search_results = [] + + if results['ids'] and len(results['ids']) > 0: + for i in range(len(results['ids'][0])): + chunk = DocumentChunk( + id=results['ids'][0][i], + document_id=results['metadatas'][0][i].get('document_id', ''), + content=results['documents'][0][i], + chunk_index=results['metadatas'][0][i].get('chunk_index', 0), + metadata=results['metadatas'][0][i] + ) + + result = SearchResult( + chunk=chunk, + score=1.0 - results['distances'][0][i], # Convert distance to similarity + document_id=results['metadatas'][0][i].get('document_id', ''), + document_name=results['metadatas'][0][i].get('filename', 'Unknown') + ) + + search_results.append(result) + + logger.info(f"{self.name} retrieved {len(search_results)} results") + return search_results + + def process(self, question: str, top_k: int = None) -> Dict[str, any]: + """ + Answer a question using RAG + + Args: + question: User's question + top_k: Number of chunks to retrieve + + Returns: + Dictionary with answer and sources + """ + logger.info(f"{self.name} processing question: {question[:100]}...") + + # Retrieve relevant chunks + search_results = self.retrieve(question, top_k) + + if not search_results: + return { + 'answer': "I don't have any relevant information in my knowledge base to answer this question.", + 'sources': [], + 'context_used': "" + } + + # Build context from retrieved chunks + context_parts = [] + sources = [] + + for i, result in enumerate(search_results, 1): + context_parts.append(f"[Source {i}] {result.chunk.content}") + sources.append({ + 'document': result.document_name, + 'score': result.score, + 'preview': result.chunk.content[:150] + "..." + }) + + context = "\n\n".join(context_parts) + + # Create prompt for LLM + system_prompt = """You are a helpful research assistant. Answer questions based on the provided context. +Be accurate and cite sources when possible. If the context doesn't contain enough information to answer fully, say so. +Keep your answer concise and relevant.""" + + user_prompt = f"""Context from my knowledge base: + +{context} + +Question: {question} + +Answer based on the context above. 
If you reference specific information, mention which source(s) you're using.""" + + # Generate answer + answer = self.generate( + prompt=user_prompt, + system=system_prompt, + temperature=0.3, # Lower temperature for more factual responses + max_tokens=1024 + ) + + logger.info(f"{self.name} generated answer ({len(answer)} chars)") + + return { + 'answer': answer, + 'sources': sources, + 'context_used': context, + 'num_sources': len(sources) + } + + def set_top_k(self, k: int): + """Set the number of chunks to retrieve""" + self.top_k = k + logger.info(f"{self.name} top_k set to {k}") diff --git a/community-contributions/sach91-bootcamp/week8/agents/summary_agent.py b/community-contributions/sach91-bootcamp/week8/agents/summary_agent.py new file mode 100644 index 0000000..8d2a5fd --- /dev/null +++ b/community-contributions/sach91-bootcamp/week8/agents/summary_agent.py @@ -0,0 +1,181 @@ +""" +Summary Agent - Creates summaries and extracts key points from documents +""" +import logging +from typing import Dict, List +from agents.base_agent import BaseAgent +from models.document import Summary +import chromadb + +logger = logging.getLogger(__name__) + +class SummaryAgent(BaseAgent): + """Agent that creates summaries of documents""" + + def __init__(self, collection: chromadb.Collection, + llm_client=None, model: str = "llama3.2"): + """ + Initialize summary agent + + Args: + collection: ChromaDB collection with documents + llm_client: Optional shared LLM client + model: Ollama model name + """ + super().__init__(name="SummaryAgent", llm_client=llm_client, model=model) + self.collection = collection + + logger.info(f"{self.name} initialized") + + def process(self, document_id: str = None, document_text: str = None, + document_name: str = "Unknown") -> Summary: + """ + Create a summary of a document + + Args: + document_id: ID of document in ChromaDB (retrieves chunks if provided) + document_text: Full document text (used if document_id not provided) + document_name: Name of the document + + Returns: + Summary object + """ + logger.info(f"{self.name} creating summary for: {document_name}") + + # Get document text + if document_id: + text = self._get_document_text(document_id) + if not text: + return Summary( + document_id=document_id, + document_name=document_name, + summary_text="Error: Could not retrieve document", + key_points=[] + ) + elif document_text: + text = document_text + else: + return Summary( + document_id="", + document_name=document_name, + summary_text="Error: No document provided", + key_points=[] + ) + + # Truncate if too long (to fit in context) + max_chars = 8000 + if len(text) > max_chars: + logger.warning(f"{self.name} truncating document from {len(text)} to {max_chars} chars") + text = text[:max_chars] + "\n\n[Document truncated...]" + + # Generate summary + summary_text = self._generate_summary(text) + + # Extract key points + key_points = self._extract_key_points(text) + + summary = Summary( + document_id=document_id or "", + document_name=document_name, + summary_text=summary_text, + key_points=key_points + ) + + logger.info(f"{self.name} completed summary with {len(key_points)} key points") + return summary + + def _get_document_text(self, document_id: str) -> str: + """Retrieve and reconstruct document text from chunks""" + try: + results = self.collection.get( + where={"document_id": document_id} + ) + + if not results['ids']: + return "" + + # Sort by chunk index + chunks_data = list(zip( + results['documents'], + results['metadatas'] + )) + + 
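            # ChromaDB's get() may not return chunks in their original order,
            # so sort them back into reading order using the stored chunk_index.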
chunks_data.sort(key=lambda x: x[1].get('chunk_index', 0)) + + # Combine chunks + text = "\n\n".join([chunk[0] for chunk in chunks_data]) + return text + + except Exception as e: + logger.error(f"Error retrieving document: {e}") + return "" + + def _generate_summary(self, text: str) -> str: + """Generate a concise summary of the text""" + system_prompt = """You are an expert at creating concise, informative summaries. +Your summaries capture the main ideas and key information in clear, accessible language. +Keep summaries to 3-5 sentences unless the document is very long.""" + + user_prompt = f"""Please create a concise summary of the following document: + +{text} + +Summary:""" + + summary = self.generate( + prompt=user_prompt, + system=system_prompt, + temperature=0.3, + max_tokens=512 + ) + + return summary.strip() + + def _extract_key_points(self, text: str) -> List[str]: + """Extract key points from the text""" + system_prompt = """You extract the most important key points from documents. +List 3-7 key points as concise bullet points. Each point should be a complete, standalone statement.""" + + user_prompt = f"""Please extract the key points from the following document: + +{text} + +List the key points (one per line, without bullets or numbers):""" + + response = self.generate( + prompt=user_prompt, + system=system_prompt, + temperature=0.3, + max_tokens=512 + ) + + # Parse the response into a list + key_points = [] + for line in response.split('\n'): + line = line.strip() + # Remove common list markers + line = line.lstrip('•-*0123456789.)') + line = line.strip() + + if line and len(line) > 10: # Filter out very short lines + key_points.append(line) + + return key_points[:7] # Limit to 7 points + + def summarize_multiple(self, document_ids: List[str]) -> List[Summary]: + """ + Create summaries for multiple documents + + Args: + document_ids: List of document IDs + + Returns: + List of Summary objects + """ + summaries = [] + + for doc_id in document_ids: + summary = self.process(document_id=doc_id) + summaries.append(summary) + + return summaries diff --git a/community-contributions/sach91-bootcamp/week8/app.py b/community-contributions/sach91-bootcamp/week8/app.py new file mode 100644 index 0000000..8eb4c51 --- /dev/null +++ b/community-contributions/sach91-bootcamp/week8/app.py @@ -0,0 +1,846 @@ +""" +KnowledgeHub - Personal Knowledge Management & Research Assistant +Main Gradio Application +""" +import os +import logging +import json +import gradio as gr +from pathlib import Path +import chromadb +from datetime import datetime + +# Setup logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + +# Import utilities and agents +from utils import OllamaClient, EmbeddingModel, DocumentParser +from agents import ( + IngestionAgent, QuestionAgent, SummaryAgent, + ConnectionAgent, ExportAgent +) +from models import Document + +# Constants +VECTORSTORE_PATH = "./vectorstore" +TEMP_UPLOAD_PATH = "./temp_uploads" +DOCUMENTS_METADATA_PATH = "./vectorstore/documents_metadata.json" + +# Ensure directories exist +os.makedirs(VECTORSTORE_PATH, exist_ok=True) +os.makedirs(TEMP_UPLOAD_PATH, exist_ok=True) + +class KnowledgeHub: + """Main application class managing all agents""" + + def __init__(self): + logger.info("Initializing KnowledgeHub...") + + # Initialize ChromaDB + self.client = chromadb.PersistentClient(path=VECTORSTORE_PATH) + self.collection = self.client.get_or_create_collection( + 
name="knowledge_base", + metadata={"description": "Personal knowledge management collection"} + ) + + # Initialize embedding model + self.embedding_model = EmbeddingModel() + + # Initialize shared LLM client + self.llm_client = OllamaClient(model="llama3.2") + + # Check Ollama connection + if not self.llm_client.check_connection(): + logger.warning("āš ļø Cannot connect to Ollama. Please ensure Ollama is running.") + logger.warning("Start Ollama with: ollama serve") + else: + logger.info("āœ“ Connected to Ollama") + + # Initialize agents + self.ingestion_agent = IngestionAgent( + collection=self.collection, + embedding_model=self.embedding_model, + llm_client=self.llm_client + ) + + self.question_agent = QuestionAgent( + collection=self.collection, + embedding_model=self.embedding_model, + llm_client=self.llm_client + ) + + self.summary_agent = SummaryAgent( + collection=self.collection, + llm_client=self.llm_client + ) + + self.connection_agent = ConnectionAgent( + collection=self.collection, + embedding_model=self.embedding_model, + llm_client=self.llm_client + ) + + self.export_agent = ExportAgent( + llm_client=self.llm_client + ) + + # Track uploaded documents + self.documents = {} + + # Load existing documents from metadata file + self._load_documents_metadata() + + logger.info("āœ“ KnowledgeHub initialized successfully") + + def _save_documents_metadata(self): + """Save document metadata to JSON file""" + try: + metadata = { + doc_id: doc.to_dict() + for doc_id, doc in self.documents.items() + } + + with open(DOCUMENTS_METADATA_PATH, 'w') as f: + json.dump(metadata, f, indent=2) + + logger.debug(f"Saved metadata for {len(metadata)} documents") + except Exception as e: + logger.error(f"Error saving document metadata: {e}") + + def _load_documents_metadata(self): + """Load document metadata from JSON file""" + try: + if os.path.exists(DOCUMENTS_METADATA_PATH): + with open(DOCUMENTS_METADATA_PATH, 'r') as f: + metadata = json.load(f) + + # Reconstruct Document objects (simplified - without chunks) + for doc_id, doc_data in metadata.items(): + # Create a minimal Document object for UI purposes + # Full chunks are still in ChromaDB + doc = Document( + id=doc_id, + filename=doc_data['filename'], + filepath=doc_data.get('filepath', ''), + content=doc_data.get('content', ''), + chunks=[], # Chunks are in ChromaDB + metadata=doc_data.get('metadata', {}), + created_at=datetime.fromisoformat(doc_data['created_at']) + ) + self.documents[doc_id] = doc + + logger.info(f"āœ“ Loaded {len(self.documents)} existing documents from storage") + else: + logger.info("No existing documents found (starting fresh)") + + except Exception as e: + logger.error(f"Error loading document metadata: {e}") + logger.info("Starting with empty document list") + + def upload_document(self, files, progress=gr.Progress()): + """Handle document upload - supports single or multiple files with progress tracking""" + if files is None or len(files) == 0: + return "āš ļø Please select file(s) to upload", "", [] + + # Convert single file to list for consistent handling + if not isinstance(files, list): + files = [files] + + results = [] + successful = 0 + failed = 0 + total_chunks = 0 + + # Initialize progress tracking + progress(0, desc="Starting upload...") + + for file_idx, file in enumerate(files, 1): + # Update progress + progress_pct = (file_idx - 1) / len(files) + progress(progress_pct, desc=f"Processing {file_idx}/{len(files)}: {Path(file.name).name}") + + try: + logger.info(f"Processing file {file_idx}/{len(files)}: 
{file.name}") + + # Save uploaded file temporarily + temp_path = os.path.join(TEMP_UPLOAD_PATH, Path(file.name).name) + + # Copy file content + with open(temp_path, 'wb') as f: + f.write(file.read() if hasattr(file, 'read') else open(file.name, 'rb').read()) + + # Process document + document = self.ingestion_agent.process(temp_path) + + # Store document reference + self.documents[document.id] = document + + # Track stats + successful += 1 + total_chunks += document.num_chunks + + # Add to results + results.append({ + 'status': 'āœ…', + 'filename': document.filename, + 'chunks': document.num_chunks, + 'size': f"{document.total_chars:,} chars" + }) + + # Clean up temp file + os.remove(temp_path) + + except Exception as e: + logger.error(f"Error processing {file.name}: {e}") + failed += 1 + results.append({ + 'status': 'āŒ', + 'filename': Path(file.name).name, + 'chunks': 0, + 'size': f"Error: {str(e)[:50]}" + }) + + # Final progress update + progress(1.0, desc="Upload complete!") + + # Save metadata once after all uploads + if successful > 0: + self._save_documents_metadata() + + # Create summary + summary = f"""## Upload Complete! šŸŽ‰ + +**Total Files:** {len(files)} +**āœ… Successful:** {successful} +**āŒ Failed:** {failed} +**Total Chunks Created:** {total_chunks:,} + +{f"āš ļø **{failed} file(s) failed** - Check results table below for details" if failed > 0 else "All files processed successfully!"} +""" + + # Create detailed results table + results_table = [[r['status'], r['filename'], r['chunks'], r['size']] for r in results] + + # Create preview of first successful document + preview = "" + for doc in self.documents.values(): + if doc.filename in [r['filename'] for r in results if r['status'] == 'āœ…']: + preview = doc.content[:500] + "..." if len(doc.content) > 500 else doc.content + break + + return summary, preview, results_table + + def ask_question(self, question, top_k, progress=gr.Progress()): + """Handle question answering with progress tracking""" + if not question.strip(): + return "āš ļø Please enter a question", [], "" + + try: + # Initial status + progress(0, desc="Processing your question...") + status = "šŸ”„ **Searching knowledge base...**\n\nRetrieving relevant documents..." + + logger.info(f"Answering question: {question[:100]}") + + # Update progress + progress(0.3, desc="Finding relevant documents...") + + result = self.question_agent.process(question, top_k=top_k) + + # Update progress + progress(0.7, desc="Generating answer with LLM...") + + # Format answer + answer = f"""### Answer\n\n{result['answer']}\n\n""" + + if result['sources']: + answer += f"**Sources:** {result['num_sources']} documents referenced\n\n" + + # Format sources for display + sources_data = [] + for i, source in enumerate(result['sources'], 1): + sources_data.append([ + i, + source['document'], + f"{source['score']:.2%}", + source['preview'] + ]) + + progress(1.0, desc="Answer ready!") + + return answer, sources_data, "āœ… Answer generated successfully!" 
+ + except Exception as e: + logger.error(f"Error answering question: {e}") + return f"āŒ Error: {str(e)}", [], f"āŒ Error: {str(e)}" + + def create_summary(self, doc_selector, progress=gr.Progress()): + """Create document summary with progress tracking""" + if not doc_selector: + return "āš ļø Please select a document to summarize", "" + + try: + # Initial status + progress(0, desc="Preparing to summarize...") + + logger.info(f'doc_selector : {doc_selector}') + doc_id = doc_selector.split(" -|- ")[1] + document = self.documents.get(doc_id) + + if not document: + return "", "āŒ Document not found" + + # Update status + status_msg = f"šŸ”„ **Generating summary for:** {document.filename}\n\nPlease wait, this may take 10-20 seconds..." + progress(0.3, desc=f"Analyzing {document.filename}...") + + logger.info(f"Creating summary for: {document.filename}") + + # Generate summary + summary = self.summary_agent.process( + document_id=doc_id, + document_name=document.filename + ) + + progress(1.0, desc="Summary complete!") + + # Format result + result = f"""## Summary of {summary.document_name}\n\n{summary.summary_text}\n\n""" + + if summary.key_points: + result += "### Key Points\n\n" + for point in summary.key_points: + result += f"- {point}\n" + + return result, "āœ… Summary generated successfully!" + + except Exception as e: + logger.error(f"Error creating summary: {e}") + return "", f"āŒ Error: {str(e)}" + + def find_connections(self, doc_selector, top_k, progress=gr.Progress()): + """Find related documents with progress tracking""" + if not doc_selector: + return "āš ļø Please select a document", [], "" + + try: + progress(0, desc="Preparing to find connections...") + + doc_id = doc_selector.split(" -|- ")[1] + document = self.documents.get(doc_id) + + if not document: + return "āŒ Document not found", [], "āŒ Document not found" + + status = f"šŸ”„ **Finding documents related to:** {document.filename}\n\nSearching knowledge base..." + progress(0.3, desc=f"Analyzing {document.filename}...") + + logger.info(f"Finding connections for: {document.filename}") + + result = self.connection_agent.process(document_id=doc_id, top_k=top_k) + + progress(0.8, desc="Calculating similarity scores...") + + if 'error' in result: + return f"āŒ Error: {result['error']}", [], f"āŒ Error: {result['error']}" + + message = f"""## Related Documents\n\n**Source:** {result['source_document']}\n\n""" + message += f"**Found {result['num_related']} related documents:**\n\n""" + + # Format for table + table_data = [] + for i, rel in enumerate(result['related'], 1): + table_data.append([ + i, + rel['document_name'], + f"{rel['similarity']:.2%}", + rel['preview'] + ]) + + progress(1.0, desc="Connections found!") + + return message, table_data, "āœ… Related documents found!" 

        except Exception as e:
            logger.error(f"Error finding connections: {e}")
            return f"❌ Error: {str(e)}", [], f"❌ Error: {str(e)}"
    
    def export_knowledge(self, format_choice):
        """Export knowledge base"""
        try:
            logger.info(f"Exporting as {format_choice}")
            
            # Get statistics
            stats = self.ingestion_agent.get_statistics()
            
            # Create export content
            content = {
                'title': 'Knowledge Base Export',
                'summary': f"Total documents in knowledge base: {len(self.documents)}",
                'sections': [
                    {
                        'title': 'Documents',
                        'content': '\n'.join([f"- {doc.filename}" for doc in self.documents.values()])
                    },
                    {
                        'title': 'Statistics',
                        'content': f"Total chunks stored: {stats['total_chunks']}"
                    }
                ]
            }
            
            # Export
            if format_choice == "Markdown":
                output = self.export_agent.process(content, format="markdown")
                filename = f"knowledge_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"
            elif format_choice == "HTML":
                output = self.export_agent.process(content, format="html")
                filename = f"knowledge_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.html"
            else:  # Text
                output = self.export_agent.process(content, format="text")
                filename = f"knowledge_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
            
            # Save file
            export_path = os.path.join(TEMP_UPLOAD_PATH, filename)
            with open(export_path, 'w', encoding='utf-8') as f:
                f.write(output)
            
            return f"✅ Exported as {format_choice}", export_path
        
        except Exception as e:
            logger.error(f"Error exporting: {e}")
            return f"❌ Error: {str(e)}", None
    
    def get_statistics(self):
        """Get knowledge base statistics"""
        try:
            stats = self.ingestion_agent.get_statistics()
            
            total_docs = len(self.documents)
            total_chunks = stats.get('total_chunks', 0)
            total_chars = sum(doc.total_chars for doc in self.documents.values())
            
            # Check if data is persisted
            persistence_status = "✅ Enabled" if os.path.exists(DOCUMENTS_METADATA_PATH) else "⚠️ Not configured"
            vectorstore_size = self._get_directory_size(VECTORSTORE_PATH)
            
            stats_text = f"""## Knowledge Base Statistics

**Persistence Status:** {persistence_status}
**Total Documents:** {total_docs}
**Total Chunks:** {total_chunks:,}
**Total Characters:** {total_chars:,}
**Vector Store Size:** {vectorstore_size}

### Storage Locations
- **Vector DB:** `{VECTORSTORE_PATH}/`
- **Metadata:** `{DOCUMENTS_METADATA_PATH}`

**📝 Note:** Your data persists across app restarts!

**Recent Documents:**
"""
            if self.documents:
                stats_text += "\n".join([f"- {doc.filename} ({doc.num_chunks} chunks, added {doc.created_at.strftime('%Y-%m-%d')})"
                                         for doc in list(self.documents.values())[-10:]])
            else:
                stats_text += "\n*No documents yet. Upload some to get started!*"
            
            return stats_text
        
        except Exception as e:
            return f"❌ Error: {str(e)}"
    
    def _get_directory_size(self, path):
        """Calculate directory size"""
        try:
            total_size = 0
            for dirpath, dirnames, filenames in os.walk(path):
                for filename in filenames:
                    filepath = os.path.join(dirpath, filename)
                    if os.path.exists(filepath):
                        total_size += os.path.getsize(filepath)
            
            # Convert to human readable
            for unit in ['B', 'KB', 'MB', 'GB']:
                if total_size < 1024.0:
                    return f"{total_size:.1f} {unit}"
                total_size /= 1024.0
            return f"{total_size:.1f} TB"
        except Exception:
            return "Unknown"
    
    def get_document_list(self):
        """Get list of documents for dropdown"""
        new_choices = [f"{doc.filename} -|- {doc.id}" for doc in self.documents.values()]
        return gr.update(choices=new_choices, value=None)
    
    def delete_document(self, doc_selector):
        """Delete a document from the knowledge base"""
        if not doc_selector:
            return "⚠️ Please select a document to delete", self.get_document_list()
        
        try:
            # Selector entries look like "filename -|- doc_id" (see get_document_list)
            doc_id = doc_selector.split(" -|- ")[1]
            document = self.documents.get(doc_id)
            
            if not document:
                return "❌ Document not found", self.get_document_list()
            
            # Delete from ChromaDB
            success = self.ingestion_agent.delete_document(doc_id)
            
            if success:
                # Remove from documents dict
                filename = document.filename
                del self.documents[doc_id]
                
                # Save updated metadata
                self._save_documents_metadata()
                
                return f"✅ Deleted: {filename}", self.get_document_list()
            else:
                return "❌ Error deleting document", self.get_document_list()
        
        except Exception as e:
            logger.error(f"Error deleting document: {e}")
            return f"❌ Error: {str(e)}", self.get_document_list()
    
    def clear_all_documents(self):
        """Clear entire knowledge base"""
        try:
            # Delete collection
            self.client.delete_collection("knowledge_base")
            
            # Recreate empty collection
            self.collection = self.client.create_collection(
                name="knowledge_base",
                metadata={"description": "Personal knowledge management collection"}
            )
            
            # Update agents with new collection
            self.ingestion_agent.collection = self.collection
            self.question_agent.collection = self.collection
            self.summary_agent.collection = self.collection
            self.connection_agent.collection = self.collection
            
            # Clear documents
            self.documents = {}
            self._save_documents_metadata()
            
            return "✅ All documents cleared from knowledge base"
        
        except Exception as e:
            logger.error(f"Error clearing database: {e}")
            return f"❌ Error: {str(e)}"


def create_ui():
    """Create Gradio interface"""
    
    # Initialize app
    app = KnowledgeHub()
    
    # Custom CSS
    custom_css = """
    .main-header {
        text-align: center;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        padding: 30px;
        border-radius: 10px;
        margin-bottom: 20px;
    }
    .stat-box {
        background: #f8f9fa;
        padding: 15px;
        border-radius: 8px;
        border-left: 4px solid #667eea;
    }
    """
    
    with gr.Blocks(title="KnowledgeHub", css=custom_css, theme=gr.themes.Soft()) as interface:
        
        # Header
        gr.HTML("""
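            <!-- Landing header, injected as raw HTML; styled by the .main-header rule in custom_css above -->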
+        # Header
+        gr.HTML("""
+            <div class="main-header">
+                <h1>🧠 KnowledgeHub</h1>
+                <p>Personal Knowledge Management & Research Assistant</p>
+                <p>Powered by Ollama (Llama 3.2) • Fully Local & Private</p>
+            </div>
+ """) + + # Main tabs + with gr.Tabs(): + + # Tab 1: Upload Documents + with gr.Tab("šŸ“¤ Upload Documents"): + gr.Markdown("### Upload your documents to build your knowledge base") + gr.Markdown("*Supported formats: PDF, DOCX, TXT, MD, HTML, PY*") + gr.Markdown("*šŸ’” Tip: You can select multiple files at once!*") + + with gr.Row(): + with gr.Column(): + file_input = gr.File( + label="Select Document(s)", + file_types=[".pdf", ".docx", ".txt", ".md", ".html", ".py"], + file_count="multiple" # Enable multiple file selection + ) + upload_btn = gr.Button("šŸ“¤ Upload & Process", variant="primary") + + with gr.Column(): + upload_status = gr.Markdown("Ready to upload documents") + + # Results table for batch uploads + with gr.Row(): + upload_results = gr.Dataframe( + headers=["Status", "Filename", "Chunks", "Size"], + label="Upload Results", + wrap=True, + visible=True + ) + + with gr.Row(): + document_preview = gr.Textbox( + label="Document Preview (First Uploaded)", + lines=10, + max_lines=15 + ) + + upload_btn.click( + fn=app.upload_document, + inputs=[file_input], + outputs=[upload_status, document_preview, upload_results] + ) + + # Tab 2: Ask Questions + with gr.Tab("ā“ Ask Questions"): + gr.Markdown("### Ask questions about your documents") + gr.Markdown("*Uses RAG (Retrieval Augmented Generation) to answer based on your knowledge base*") + + with gr.Row(): + with gr.Column(scale=3): + question_input = gr.Textbox( + label="Your Question", + placeholder="What would you like to know?", + lines=3 + ) + + with gr.Column(scale=1): + top_k_slider = gr.Slider( + minimum=1, + maximum=10, + value=5, + step=1, + label="Number of sources" + ) + ask_btn = gr.Button("šŸ” Ask", variant="primary") + + qa_status = gr.Markdown("Ready to answer questions") + answer_output = gr.Markdown(label="Answer") + + sources_table = gr.Dataframe( + headers=["#", "Document", "Relevance", "Preview"], + label="Sources", + wrap=True + ) + + ask_btn.click( + fn=app.ask_question, + inputs=[question_input, top_k_slider], + outputs=[answer_output, sources_table, qa_status] + ) + + # Tab 3: Summarize + with gr.Tab("šŸ“ Summarize"): + gr.Markdown("### Generate summaries and extract key points") + + with gr.Row(): + with gr.Column(): + doc_selector = gr.Dropdown( + choices=[], + label="Select Document", + info="Choose a document to summarize", + allow_custom_value=True + ) + refresh_btn = gr.Button("šŸ”„ Refresh List") + summarize_btn = gr.Button("šŸ“ Generate Summary", variant="primary") + summary_status = gr.Markdown("Ready to generate summaries") + + with gr.Column(scale=2): + summary_output = gr.Markdown(label="Summary") + + summarize_btn.click( + fn=app.create_summary, + inputs=[doc_selector], + outputs=[summary_output, summary_status] + ) + + refresh_btn.click( + fn=app.get_document_list, + outputs=[doc_selector] + ) + + # Tab 4: Find Connections + with gr.Tab("šŸ”— Find Connections"): + gr.Markdown("### Discover relationships between documents") + + with gr.Row(): + with gr.Column(): + conn_doc_selector = gr.Dropdown( + choices=[], + label="Select Document", + info="Find documents related to this one", + allow_custom_value=True + ) + conn_top_k = gr.Slider( + minimum=1, + maximum=10, + value=5, + step=1, + label="Number of related documents" + ) + refresh_conn_btn = gr.Button("šŸ”„ Refresh List") + find_btn = gr.Button("šŸ”— Find Connections", variant="primary") + connection_status = gr.Markdown("Ready to find connections") + + connection_output = gr.Markdown(label="Connections") + + connections_table = 
gr.Dataframe( + headers=["#", "Document", "Similarity", "Preview"], + label="Related Documents", + wrap=True + ) + + find_btn.click( + fn=app.find_connections, + inputs=[conn_doc_selector, conn_top_k], + outputs=[connection_output, connections_table, connection_status] + ) + + refresh_conn_btn.click( + fn=app.get_document_list, + outputs=[conn_doc_selector] + ) + + # Tab 5: Export + with gr.Tab("šŸ’¾ Export"): + gr.Markdown("### Export your knowledge base") + + with gr.Row(): + with gr.Column(): + format_choice = gr.Radio( + choices=["Markdown", "HTML", "Text"], + value="Markdown", + label="Export Format" + ) + export_btn = gr.Button("šŸ’¾ Export", variant="primary") + + with gr.Column(): + export_status = gr.Markdown("Ready to export") + export_file = gr.File(label="Download Export") + + export_btn.click( + fn=app.export_knowledge, + inputs=[format_choice], + outputs=[export_status, export_file] + ) + + # Tab 6: Manage Documents + with gr.Tab("šŸ—‚ļø Manage Documents"): + gr.Markdown("### Manage your document library") + + with gr.Row(): + with gr.Column(): + gr.Markdown("#### Delete Document") + delete_doc_selector = gr.Dropdown( + choices=[], + label="Select Document to Delete", + info="Choose a document to remove from knowledge base" + ) + with gr.Row(): + refresh_delete_btn = gr.Button("šŸ”„ Refresh List") + delete_btn = gr.Button("šŸ—‘ļø Delete Document", variant="stop") + delete_status = gr.Markdown("") + + with gr.Column(): + gr.Markdown("#### Clear All Documents") + gr.Markdown("āš ļø **Warning:** This will delete your entire knowledge base!") + clear_confirm = gr.Textbox( + label="Type 'DELETE ALL' to confirm", + placeholder="DELETE ALL" + ) + clear_all_btn = gr.Button("šŸ—‘ļø Clear All Documents", variant="stop") + clear_status = gr.Markdown("") + + def confirm_and_clear(confirm_text): + if confirm_text.strip() == "DELETE ALL": + return app.clear_all_documents() + else: + return "āš ļø Please type 'DELETE ALL' to confirm" + + delete_btn.click( + fn=app.delete_document, + inputs=[delete_doc_selector], + outputs=[delete_status, delete_doc_selector] + ) + + refresh_delete_btn.click( + fn=app.get_document_list, + outputs=[delete_doc_selector] + ) + + clear_all_btn.click( + fn=confirm_and_clear, + inputs=[clear_confirm], + outputs=[clear_status] + ) + + # Tab 7: Statistics + with gr.Tab("šŸ“Š Statistics"): + gr.Markdown("### Knowledge Base Overview") + + stats_output = gr.Markdown() + stats_btn = gr.Button("šŸ”„ Refresh Statistics", variant="primary") + + stats_btn.click( + fn=app.get_statistics, + outputs=[stats_output] + ) + + # Auto-load stats on tab open + interface.load( + fn=app.get_statistics, + outputs=[stats_output] + ) + + # Footer + gr.HTML(""" +
+        <div>
+            <p>šŸ”’ All processing happens locally on your machine • Your data never leaves your computer</p>
+            <p>Powered by Ollama, ChromaDB, and Sentence Transformers</p>
+        </div>
+ """) + + return interface + + +if __name__ == "__main__": + logger.info("Starting KnowledgeHub...") + + # Create and launch interface + interface = create_ui() + interface.launch( + server_name="127.0.0.1", + server_port=7860, + share=False, + inbrowser=True + ) diff --git a/community-contributions/sach91-bootcamp/week8/models/__init__.py b/community-contributions/sach91-bootcamp/week8/models/__init__.py new file mode 100644 index 0000000..74b9a97 --- /dev/null +++ b/community-contributions/sach91-bootcamp/week8/models/__init__.py @@ -0,0 +1,13 @@ +""" +models +""" +from .knowledge_graph import KnowledgeGraph +from .document import Document, DocumentChunk, SearchResult, Summary + +__all__ = [ + 'KnowledgeGraph', + 'Document', + 'DocumentChunk', + 'SearchResult', + 'Summary' +] diff --git a/community-contributions/sach91-bootcamp/week8/models/document.py b/community-contributions/sach91-bootcamp/week8/models/document.py new file mode 100644 index 0000000..eed3e17 --- /dev/null +++ b/community-contributions/sach91-bootcamp/week8/models/document.py @@ -0,0 +1,82 @@ +""" +Document data models +""" +from dataclasses import dataclass, field +from typing import List, Dict, Optional +from datetime import datetime + +@dataclass +class DocumentChunk: + """Represents a chunk of a document""" + id: str + document_id: str + content: str + chunk_index: int + metadata: Dict = field(default_factory=dict) + + def __str__(self): + preview = self.content[:100] + "..." if len(self.content) > 100 else self.content + return f"Chunk {self.chunk_index}: {preview}" + +@dataclass +class Document: + """Represents a complete document""" + id: str + filename: str + filepath: str + content: str + chunks: List[DocumentChunk] + metadata: Dict = field(default_factory=dict) + created_at: datetime = field(default_factory=datetime.now) + + @property + def num_chunks(self) -> int: + return len(self.chunks) + + @property + def total_chars(self) -> int: + return len(self.content) + + @property + def extension(self) -> str: + return self.metadata.get('extension', '') + + def __str__(self): + return f"Document: {self.filename} ({self.num_chunks} chunks, {self.total_chars} chars)" + + def to_dict(self) -> Dict: + """Convert to dictionary for storage""" + return { + 'id': self.id, + 'filename': self.filename, + 'filepath': self.filepath, + 'content': self.content[:500] + '...' if len(self.content) > 500 else self.content, + 'num_chunks': self.num_chunks, + 'total_chars': self.total_chars, + 'extension': self.extension, + 'created_at': self.created_at.isoformat(), + 'metadata': self.metadata + } + +@dataclass +class SearchResult: + """Represents a search result from the vector database""" + chunk: DocumentChunk + score: float + document_id: str + document_name: str + + def __str__(self): + return f"{self.document_name} (score: {self.score:.2f})" + +@dataclass +class Summary: + """Represents a document summary""" + document_id: str + document_name: str + summary_text: str + key_points: List[str] = field(default_factory=list) + created_at: datetime = field(default_factory=datetime.now) + + def __str__(self): + return f"Summary of {self.document_name}: {self.summary_text[:100]}..." 
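+
+
+if __name__ == "__main__":
+    # Illustrative sketch only (not used by the app): exercises the dataclasses
+    # above with placeholder ids, filenames and text.
+    chunk = DocumentChunk(id="doc1-0", document_id="doc1",
+                          content="Hello world", chunk_index=0)
+    doc = Document(id="doc1", filename="hello.txt", filepath="/tmp/hello.txt",
+                   content="Hello world", chunks=[chunk])
+    print(doc)                          # Document: hello.txt (1 chunks, 11 chars)
+    print(doc.to_dict()['created_at'])  # ISO-8601 creation timestamp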
diff --git a/community-contributions/sach91-bootcamp/week8/models/knowledge_graph.py b/community-contributions/sach91-bootcamp/week8/models/knowledge_graph.py new file mode 100644 index 0000000..a1caad0 --- /dev/null +++ b/community-contributions/sach91-bootcamp/week8/models/knowledge_graph.py @@ -0,0 +1,110 @@ +""" +Knowledge Graph data models +""" +from dataclasses import dataclass, field +from typing import List, Dict, Set +from datetime import datetime + +@dataclass +class KnowledgeNode: + """Represents a concept or entity in the knowledge graph""" + id: str + name: str + node_type: str # 'document', 'concept', 'entity', 'topic' + description: str = "" + metadata: Dict = field(default_factory=dict) + created_at: datetime = field(default_factory=datetime.now) + + def __str__(self): + return f"{self.node_type.capitalize()}: {self.name}" + +@dataclass +class KnowledgeEdge: + """Represents a relationship between nodes""" + source_id: str + target_id: str + relationship: str # 'related_to', 'cites', 'contains', 'similar_to' + weight: float = 1.0 + metadata: Dict = field(default_factory=dict) + + def __str__(self): + return f"{self.source_id} --[{self.relationship}]--> {self.target_id}" + +@dataclass +class KnowledgeGraph: + """Represents the complete knowledge graph""" + nodes: Dict[str, KnowledgeNode] = field(default_factory=dict) + edges: List[KnowledgeEdge] = field(default_factory=list) + + def add_node(self, node: KnowledgeNode): + """Add a node to the graph""" + self.nodes[node.id] = node + + def add_edge(self, edge: KnowledgeEdge): + """Add an edge to the graph""" + if edge.source_id in self.nodes and edge.target_id in self.nodes: + self.edges.append(edge) + + def get_neighbors(self, node_id: str) -> List[str]: + """Get all nodes connected to a given node""" + neighbors = set() + for edge in self.edges: + if edge.source_id == node_id: + neighbors.add(edge.target_id) + elif edge.target_id == node_id: + neighbors.add(edge.source_id) + return list(neighbors) + + def get_related_documents(self, node_id: str, max_depth: int = 2) -> Set[str]: + """Get all documents related to a node within max_depth hops""" + related = set() + visited = set() + queue = [(node_id, 0)] + + while queue: + current_id, depth = queue.pop(0) + + if current_id in visited or depth > max_depth: + continue + + visited.add(current_id) + + # If this is a document node, add it + if current_id in self.nodes and self.nodes[current_id].node_type == 'document': + related.add(current_id) + + # Add neighbors to queue + if depth < max_depth: + for neighbor_id in self.get_neighbors(current_id): + if neighbor_id not in visited: + queue.append((neighbor_id, depth + 1)) + + return related + + def to_networkx(self): + """Convert to NetworkX graph for visualization""" + try: + import networkx as nx + + G = nx.Graph() + + # Add nodes + for node_id, node in self.nodes.items(): + G.add_node(node_id, + name=node.name, + type=node.node_type, + description=node.description) + + # Add edges + for edge in self.edges: + G.add_edge(edge.source_id, edge.target_id, + relationship=edge.relationship, + weight=edge.weight) + + return G + + except ImportError: + return None + + def __str__(self): + return f"KnowledgeGraph: {len(self.nodes)} nodes, {len(self.edges)} edges" diff --git a/community-contributions/sach91-bootcamp/week8/requirements.txt b/community-contributions/sach91-bootcamp/week8/requirements.txt new file mode 100644 index 0000000..ae8fac8 --- /dev/null +++ b/community-contributions/sach91-bootcamp/week8/requirements.txt @@ -0,0 +1,26 
@@ +# Core Dependencies +gradio>=4.0.0 +chromadb>=0.4.0 +sentence-transformers>=2.2.0 +python-dotenv>=1.0.0 + +# Document Processing +pypdf>=3.0.0 +python-docx>=1.0.0 +markdown>=3.4.0 +beautifulsoup4>=4.12.0 + +# Data Processing +numpy>=1.24.0 +pandas>=2.0.0 +tqdm>=4.65.0 + +# Visualization +plotly>=5.14.0 +networkx>=3.0 + +# Ollama Client +requests>=2.31.0 + +# Optional but useful +scikit-learn>=1.3.0 diff --git a/community-contributions/sach91-bootcamp/week8/start.bat b/community-contributions/sach91-bootcamp/week8/start.bat new file mode 100644 index 0000000..7803b79 --- /dev/null +++ b/community-contributions/sach91-bootcamp/week8/start.bat @@ -0,0 +1,71 @@ +@echo off +REM KnowledgeHub Startup Script for Windows + +echo 🧠 Starting KnowledgeHub... +echo. + +REM Check if Ollama is installed +where ollama >nul 2>nul +if %errorlevel% neq 0 ( + echo āŒ Ollama is not installed or not in PATH + echo Please install Ollama from https://ollama.com/download + pause + exit /b 1 +) + +REM Check Python +where python >nul 2>nul +if %errorlevel% neq 0 ( + echo āŒ Python is not installed or not in PATH + echo Please install Python 3.8+ from https://www.python.org/downloads/ + pause + exit /b 1 +) + +echo āœ… Prerequisites found +echo. + +REM Check if Ollama service is running +tasklist /FI "IMAGENAME eq ollama.exe" 2>NUL | find /I /N "ollama.exe">NUL +if "%ERRORLEVEL%"=="1" ( + echo āš ļø Ollama is not running. Please start Ollama first. + echo You can start it from the Start menu or by running: ollama serve + pause + exit /b 1 +) + +echo āœ… Ollama is running +echo. + +REM Check if model exists +ollama list | find "llama3.2" >nul +if %errorlevel% neq 0 ( + echo šŸ“„ Llama 3.2 model not found. Pulling model... + echo This may take a few minutes on first run... + ollama pull llama3.2 +) + +echo āœ… Model ready +echo. + +REM Install dependencies +echo šŸ” Checking dependencies... +python -c "import gradio" 2>nul +if %errorlevel% neq 0 ( + echo šŸ“¦ Installing dependencies... + pip install -r requirements.txt +) + +echo āœ… Dependencies ready +echo. + +REM Launch application +echo šŸš€ Launching KnowledgeHub... +echo The application will open in your browser at http://127.0.0.1:7860 +echo. +echo Press Ctrl+C to stop the application +echo. + +python app.py + +pause diff --git a/community-contributions/sach91-bootcamp/week8/start.sh b/community-contributions/sach91-bootcamp/week8/start.sh new file mode 100755 index 0000000..067147d --- /dev/null +++ b/community-contributions/sach91-bootcamp/week8/start.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +# KnowledgeHub Startup Script + +echo "🧠 Starting KnowledgeHub..." +echo "" + +# Check if Ollama is running +if ! pgrep -x "ollama" > /dev/null; then + echo "āš ļø Ollama is not running. Starting Ollama..." + ollama serve & + sleep 3 +fi + +# Check if llama3.2 model exists +if ! ollama list | grep -q "llama3.2"; then + echo "šŸ“„ Llama 3.2 model not found. Pulling model..." + echo "This may take a few minutes on first run..." + ollama pull llama3.2 +fi + +echo "āœ… Ollama is ready" +echo "" + +# Check Python dependencies +echo "šŸ” Checking dependencies..." +if ! python -c "import gradio" 2>/dev/null; then + echo "šŸ“¦ Installing dependencies..." + pip install -r requirements.txt +fi + +echo "āœ… Dependencies ready" +echo "" + +# Launch the application +echo "šŸš€ Launching KnowledgeHub..." 
+echo "The application will open in your browser at http://127.0.0.1:7860" +echo "" +echo "Press Ctrl+C to stop the application" +echo "" + +python app.py diff --git a/community-contributions/sach91-bootcamp/week8/utils/__init__.py b/community-contributions/sach91-bootcamp/week8/utils/__init__.py new file mode 100644 index 0000000..76c3e16 --- /dev/null +++ b/community-contributions/sach91-bootcamp/week8/utils/__init__.py @@ -0,0 +1,12 @@ +""" +models +""" +from .document_parser import DocumentParser +from .embeddings import EmbeddingModel +from .ollama_client import OllamaClient + +__all__ = [ + 'DocumentParser', + 'EmbeddingModel', + 'OllamaClient' +] diff --git a/community-contributions/sach91-bootcamp/week8/utils/document_parser.py b/community-contributions/sach91-bootcamp/week8/utils/document_parser.py new file mode 100644 index 0000000..e3f7a10 --- /dev/null +++ b/community-contributions/sach91-bootcamp/week8/utils/document_parser.py @@ -0,0 +1,218 @@ +""" +Document Parser - Extract text from various document formats +""" +import os +from typing import List, Dict, Optional +import logging +from pathlib import Path + +logger = logging.getLogger(__name__) + +class DocumentParser: + """Parse various document formats into text chunks""" + + SUPPORTED_FORMATS = ['.pdf', '.docx', '.txt', '.md', '.html', '.py'] + + def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200): + """ + Initialize document parser + + Args: + chunk_size: Maximum characters per chunk + chunk_overlap: Overlap between chunks for context preservation + """ + self.chunk_size = chunk_size + self.chunk_overlap = chunk_overlap + + def parse_file(self, file_path: str) -> Dict: + """ + Parse a file and return structured document data + + Args: + file_path: Path to the file + + Returns: + Dictionary with document metadata and chunks + """ + path = Path(file_path) + + if not path.exists(): + raise FileNotFoundError(f"File not found: {file_path}") + + extension = path.suffix.lower() + + if extension not in self.SUPPORTED_FORMATS: + raise ValueError(f"Unsupported format: {extension}") + + # Extract text based on file type + if extension == '.pdf': + text = self._parse_pdf(file_path) + elif extension == '.docx': + text = self._parse_docx(file_path) + elif extension == '.txt' or extension == '.py': + text = self._parse_txt(file_path) + elif extension == '.md': + text = self._parse_markdown(file_path) + elif extension == '.html': + text = self._parse_html(file_path) + else: + text = "" + + # Create chunks + chunks = self._create_chunks(text) + + return { + 'filename': path.name, + 'filepath': str(path.absolute()), + 'extension': extension, + 'text': text, + 'chunks': chunks, + 'num_chunks': len(chunks), + 'total_chars': len(text) + } + + def _parse_pdf(self, file_path: str) -> str: + """Extract text from PDF""" + try: + from pypdf import PdfReader + + reader = PdfReader(file_path) + text = "" + + for page in reader.pages: + text += page.extract_text() + "\n\n" + + return text.strip() + + except ImportError: + logger.error("pypdf not installed. Install with: pip install pypdf") + return "" + except Exception as e: + logger.error(f"Error parsing PDF: {e}") + return "" + + def _parse_docx(self, file_path: str) -> str: + """Extract text from DOCX""" + try: + from docx import Document + + doc = Document(file_path) + text = "\n\n".join([para.text for para in doc.paragraphs if para.text.strip()]) + + return text.strip() + + except ImportError: + logger.error("python-docx not installed. 
Install with: pip install python-docx") + return "" + except Exception as e: + logger.error(f"Error parsing DOCX: {e}") + return "" + + def _parse_txt(self, file_path: str) -> str: + """Extract text from TXT""" + try: + with open(file_path, 'r', encoding='utf-8') as f: + return f.read().strip() + except Exception as e: + logger.error(f"Error parsing TXT: {e}") + return "" + + def _parse_markdown(self, file_path: str) -> str: + """Extract text from Markdown""" + try: + import markdown + from bs4 import BeautifulSoup + + with open(file_path, 'r', encoding='utf-8') as f: + md_text = f.read() + + # Convert markdown to HTML then extract text + html = markdown.markdown(md_text) + soup = BeautifulSoup(html, 'html.parser') + text = soup.get_text() + + return text.strip() + + except ImportError: + # Fallback: just read as plain text + return self._parse_txt(file_path) + except Exception as e: + logger.error(f"Error parsing Markdown: {e}") + return "" + + def _parse_html(self, file_path: str) -> str: + """Extract text from HTML""" + try: + from bs4 import BeautifulSoup + + with open(file_path, 'r', encoding='utf-8') as f: + html = f.read() + + soup = BeautifulSoup(html, 'html.parser') + + # Remove script and style elements + for script in soup(["script", "style"]): + script.decompose() + + text = soup.get_text() + + # Clean up whitespace + lines = (line.strip() for line in text.splitlines()) + chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) + text = '\n'.join(chunk for chunk in chunks if chunk) + + return text.strip() + + except ImportError: + logger.error("beautifulsoup4 not installed. Install with: pip install beautifulsoup4") + return "" + except Exception as e: + logger.error(f"Error parsing HTML: {e}") + return "" + + def _create_chunks(self, text: str) -> List[str]: + """ + Split text into overlapping chunks + + Args: + text: Full text to chunk + + Returns: + List of text chunks + """ + if not text: + return [] + + chunks = [] + start = 0 + text_length = len(text) + + while start < text_length: + logger.info(f'Processing chunk at {start}, for len {text_length}.') + + end = start + self.chunk_size + + # If this isn't the last chunk, try to break at a sentence or paragraph + if end < text_length: + # Look for paragraph break first + break_pos = text.rfind('\n\n', start, end) + if break_pos == -1: + # Look for sentence break + break_pos = text.rfind('. 
', start, end) + if break_pos == -1: + # Look for any space + break_pos = text.rfind(' ', start, end) + + if break_pos != -1 and break_pos > start and break_pos > end - self.chunk_overlap: + end = break_pos + 1 + + chunk = text[start:end].strip() + if chunk: + chunks.append(chunk) + + # Move start position with overlap + start = end - self.chunk_overlap + if start < 0: + start = 0 + + return chunks diff --git a/community-contributions/sach91-bootcamp/week8/utils/embeddings.py b/community-contributions/sach91-bootcamp/week8/utils/embeddings.py new file mode 100644 index 0000000..8ac2ea1 --- /dev/null +++ b/community-contributions/sach91-bootcamp/week8/utils/embeddings.py @@ -0,0 +1,84 @@ +""" +Embeddings utility using sentence-transformers +""" +from sentence_transformers import SentenceTransformer +import numpy as np +from typing import List, Union +import logging + +logger = logging.getLogger(__name__) + +class EmbeddingModel: + """Wrapper for sentence transformer embeddings""" + + def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"): + """ + Initialize embedding model + + Args: + model_name: HuggingFace model name for embeddings + """ + self.model_name = model_name + logger.info(f"Loading embedding model: {model_name}") + self.model = SentenceTransformer(model_name) + self.dimension = self.model.get_sentence_embedding_dimension() + logger.info(f"Embedding dimension: {self.dimension}") + + def embed(self, texts: Union[str, List[str]]) -> np.ndarray: + """ + Generate embeddings for text(s) + + Args: + texts: Single text or list of texts + + Returns: + Numpy array of embeddings + """ + if isinstance(texts, str): + texts = [texts] + + embeddings = self.model.encode(texts, show_progress_bar=False) + return embeddings + + def embed_query(self, query: str) -> List[float]: + """ + Embed a single query - returns as list for ChromaDB compatibility + + Args: + query: Query text + + Returns: + List of floats representing the embedding + """ + embedding = self.model.encode([query], show_progress_bar=False)[0] + return embedding.tolist() + + def embed_documents(self, documents: List[str]) -> List[List[float]]: + """ + Embed multiple documents - returns as list of lists for ChromaDB + + Args: + documents: List of document texts + + Returns: + List of embeddings (each as list of floats) + """ + embeddings = self.model.encode(documents, show_progress_bar=False) + return embeddings.tolist() + + def similarity(self, text1: str, text2: str) -> float: + """ + Calculate cosine similarity between two texts + + Args: + text1: First text + text2: Second text + + Returns: + Similarity score between 0 and 1 + """ + emb1, emb2 = self.model.encode([text1, text2]) + + # Cosine similarity + similarity = np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2)) + return float(similarity) diff --git a/community-contributions/sach91-bootcamp/week8/utils/ollama_client.py b/community-contributions/sach91-bootcamp/week8/utils/ollama_client.py new file mode 100644 index 0000000..ceeeedf --- /dev/null +++ b/community-contributions/sach91-bootcamp/week8/utils/ollama_client.py @@ -0,0 +1,107 @@ +""" +Ollama Client - Wrapper for local Ollama API +""" +import requests +import json +from typing import List, Dict, Optional +import logging + +logger = logging.getLogger(__name__) + +class OllamaClient: + """Client for interacting with local Ollama models""" + + def __init__(self, base_url: str = "http://localhost:11434", model: str = "llama3.2"): + self.base_url = base_url + self.model = model + 
self.api_url = f"{base_url}/api"
+
+    def generate(self, prompt: str, system: Optional[str] = None,
+                 temperature: float = 0.7, max_tokens: int = 2048) -> str:
+        """Generate text from a prompt"""
+        try:
+            payload = {
+                "model": self.model,
+                "prompt": prompt,
+                "stream": False,
+                "options": {
+                    "temperature": temperature,
+                    "num_predict": max_tokens
+                }
+            }
+
+            if system:
+                payload["system"] = system
+
+            response = requests.post(
+                f"{self.api_url}/generate",
+                json=payload,
+                timeout=1200
+            )
+            response.raise_for_status()
+
+            result = response.json()
+            return result.get("response", "").strip()
+
+        except requests.exceptions.RequestException as e:
+            logger.error(f"Ollama API error: {e}")
+            return f"Error: Unable to connect to Ollama. Is it running? ({str(e)})"
+
+    def chat(self, messages: List[Dict[str, str]],
+             temperature: float = 0.7, max_tokens: int = 2048) -> str:
+        """Chat completion with message history"""
+        try:
+            payload = {
+                "model": self.model,
+                "messages": messages,
+                "stream": False,
+                "options": {
+                    "temperature": temperature,
+                    "num_predict": max_tokens
+                }
+            }
+
+            response = requests.post(
+                f"{self.api_url}/chat",
+                json=payload,
+                timeout=1200
+            )
+            response.raise_for_status()
+
+            result = response.json()
+            return result.get("message", {}).get("content", "").strip()
+
+        except requests.exceptions.RequestException as e:
+            logger.error(f"Ollama API error: {e}")
+            return f"Error: Unable to connect to Ollama. Is it running? ({str(e)})"
+
+    def check_connection(self) -> bool:
+        """Check if Ollama is running and model is available"""
+        try:
+            response = requests.get(f"{self.base_url}/api/tags", timeout=5)
+            response.raise_for_status()
+
+            models = response.json().get("models", [])
+            model_names = [m["name"] for m in models]
+
+            # Ollama reports tagged names (e.g. "llama3.2:latest"), so accept
+            # either an exact match or any tag of the configured base model
+            if not any(name == self.model or name.startswith(f"{self.model}:")
+                       for name in model_names):
+                logger.warning(f"Model {self.model} not found. 
Available: {model_names}") + return False + + return True + + except requests.exceptions.RequestException as e: + logger.error(f"Cannot connect to Ollama: {e}") + return False + + def list_models(self) -> List[str]: + """List available Ollama models""" + try: + response = requests.get(f"{self.base_url}/api/tags", timeout=5) + response.raise_for_status() + + models = response.json().get("models", []) + return [m["name"] for m in models] + + except requests.exceptions.RequestException: + return [] diff --git a/community-contributions/sach91-bootcamp/week8/verify_setup.py b/community-contributions/sach91-bootcamp/week8/verify_setup.py new file mode 100644 index 0000000..1a8753e --- /dev/null +++ b/community-contributions/sach91-bootcamp/week8/verify_setup.py @@ -0,0 +1,129 @@ +""" +Setup Verification Script for KnowledgeHub +Run this to check if everything is configured correctly +""" +import sys +import os + +print("šŸ” KnowledgeHub Setup Verification\n") +print("=" * 60) + +# Check Python version +print(f"āœ“ Python version: {sys.version}") +print(f"āœ“ Python executable: {sys.executable}") +print(f"āœ“ Current directory: {os.getcwd()}") +print() + +# Check directory structure +print("šŸ“ Checking directory structure...") +required_dirs = ['agents', 'models', 'utils'] +for dir_name in required_dirs: + if os.path.isdir(dir_name): + init_file = os.path.join(dir_name, '__init__.py') + if os.path.exists(init_file): + print(f" āœ“ {dir_name}/ exists with __init__.py") + else: + print(f" āš ļø {dir_name}/ exists but missing __init__.py") + else: + print(f" āŒ {dir_name}/ directory not found") +print() + +# Check required files +print("šŸ“„ Checking required files...") +required_files = ['app.py', 'requirements.txt'] +for file_name in required_files: + if os.path.exists(file_name): + print(f" āœ“ {file_name} exists") + else: + print(f" āŒ {file_name} not found") +print() + +# Try importing modules +print("šŸ“¦ Testing imports...") +errors = [] + +try: + from utils import OllamaClient, EmbeddingModel, DocumentParser + print(" āœ“ utils module imported successfully") +except ImportError as e: + print(f" āŒ Cannot import utils: {e}") + errors.append(str(e)) + +try: + from models import Document, DocumentChunk, SearchResult, Summary + print(" āœ“ models module imported successfully") +except ImportError as e: + print(f" āŒ Cannot import models: {e}") + errors.append(str(e)) + +try: + from agents import ( + IngestionAgent, QuestionAgent, SummaryAgent, + ConnectionAgent, ExportAgent + ) + print(" āœ“ agents module imported successfully") +except ImportError as e: + print(f" āŒ Cannot import agents: {e}") + errors.append(str(e)) + +print() + +# Check dependencies +print("šŸ“š Checking Python dependencies...") +required_packages = [ + 'gradio', 'chromadb', 'sentence_transformers', + 'requests', 'numpy', 'tqdm' +] + +missing_packages = [] +for package in required_packages: + try: + __import__(package.replace('-', '_')) + print(f" āœ“ {package} installed") + except ImportError: + print(f" āŒ {package} not installed") + missing_packages.append(package) + +print() + +# Check Ollama +print("šŸ¤– Checking Ollama...") +try: + import requests + response = requests.get('http://localhost:11434/api/tags', timeout=2) + if response.status_code == 200: + print(" āœ“ Ollama is running") + models = response.json().get('models', []) + if models: + print(f" āœ“ Available models: {[m['name'] for m in models]}") + if any('llama3.2' in m['name'] for m in models): + print(" āœ“ llama3.2 model found") + else: + print(" 
āš ļø llama3.2 model not found. Run: ollama pull llama3.2") + else: + print(" āš ļø No models found. Run: ollama pull llama3.2") + else: + print(" āš ļø Ollama responded but with error") +except Exception as e: + print(f" āŒ Cannot connect to Ollama: {e}") + print(" Start Ollama with: ollama serve") + +print() +print("=" * 60) + +# Final summary +if errors or missing_packages: + print("\nāš ļø ISSUES FOUND:\n") + if errors: + print("Import Errors:") + for error in errors: + print(f" - {error}") + if missing_packages: + print("\nMissing Packages:") + print(f" Run: pip install {' '.join(missing_packages)}") + print("\nšŸ’” Fix these issues before running app.py") +else: + print("\nāœ… All checks passed! You're ready to run:") + print(" python app.py") + +print()