sach91 bootcamp week8 exercise
"""
Ingestion Agent - Processes and stores documents in the vector database
"""
import logging
from typing import Dict, List
import uuid
from datetime import datetime

from agents.base_agent import BaseAgent
from models.document import Document, DocumentChunk
from utils.document_parser import DocumentParser
from utils.embeddings import EmbeddingModel
import chromadb

logger = logging.getLogger(__name__)


class IngestionAgent(BaseAgent):
    """Agent responsible for ingesting and storing documents"""

    def __init__(self, collection: chromadb.Collection,
                 embedding_model: EmbeddingModel,
                 llm_client=None, model: str = "llama3.2"):
        """
        Initialize ingestion agent

        Args:
            collection: ChromaDB collection for storage
            embedding_model: Model for generating embeddings
            llm_client: Optional shared LLM client
            model: Ollama model name
        """
        super().__init__(name="IngestionAgent", llm_client=llm_client, model=model)

        self.collection = collection
        self.embedding_model = embedding_model
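        # Overlapping chunks: the 200-unit overlap repeats trailing context so
        # text that straddles a chunk boundary remains retrievable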
|
||||
self.parser = DocumentParser(chunk_size=1000, chunk_overlap=200)
|
||||
|
||||
logger.info(f"{self.name} ready with ChromaDB collection")

    def process(self, file_path: str) -> Document:
        """
        Process and ingest a document

        Args:
            file_path: Path to the document file

        Returns:
            Document object with metadata
        """
        logger.info(f"{self.name} processing: {file_path}")

        # Parse the document
        parsed = self.parser.parse_file(file_path)

        # Generate document ID
        doc_id = str(uuid.uuid4())

        # Create document chunks
        chunks = []
        chunk_texts = []
        chunk_ids = []
        chunk_metadatas = []
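
        # These four lists stay index-aligned: the batch collection.add() call
        # below expects ids, documents, embeddings, and metadatas in matching order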
        for i, chunk_text in enumerate(parsed['chunks']):
            chunk_id = f"{doc_id}_chunk_{i}"

            chunk = DocumentChunk(
                id=chunk_id,
                document_id=doc_id,
                content=chunk_text,
                chunk_index=i,
                metadata={
                    'filename': parsed['filename'],
                    'extension': parsed['extension'],
                    'total_chunks': len(parsed['chunks'])
                }
            )

            chunks.append(chunk)
            chunk_texts.append(chunk_text)
            chunk_ids.append(chunk_id)
            chunk_metadatas.append({
                'document_id': doc_id,
                'filename': parsed['filename'],
                'chunk_index': i,
                'extension': parsed['extension']
            })

        # Generate embeddings
        logger.info(f"{self.name} generating embeddings for {len(chunks)} chunks")
        embeddings = self.embedding_model.embed_documents(chunk_texts)

        # Store in ChromaDB
        logger.info(f"{self.name} storing in ChromaDB")
        self.collection.add(
            ids=chunk_ids,
            documents=chunk_texts,
            embeddings=embeddings,
            metadatas=chunk_metadatas
        )

        # Create document object
        document = Document(
            id=doc_id,
            filename=parsed['filename'],
            filepath=parsed['filepath'],
            content=parsed['text'],
            chunks=chunks,
            metadata={
                'extension': parsed['extension'],
                'num_chunks': len(chunks),
                'total_chars': parsed['total_chars']
            },
            created_at=datetime.now()
        )

        logger.info(f"{self.name} successfully ingested: {document}")
        return document

    def get_statistics(self) -> Dict:
        """Get statistics about stored documents"""
        try:
            count = self.collection.count()
            return {
                'total_chunks': count,
                'collection_name': self.collection.name
            }
        except Exception as e:
            logger.error(f"Error getting statistics: {e}")
            return {'total_chunks': 0, 'error': str(e)}

    def delete_document(self, document_id: str) -> bool:
        """
        Delete all chunks of a document

        Args:
            document_id: ID of document to delete

        Returns:
            True if successful
        """
        try:
            # Get all chunk IDs for this document
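            # (the 'where' metadata filter matches the document_id recorded on
            # each chunk at ingestion time)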
|
||||
results = self.collection.get(
|
||||
where={"document_id": document_id}
|
||||
)
|
||||
|
||||
if results['ids']:
|
||||
self.collection.delete(ids=results['ids'])
|
||||
logger.info(f"{self.name} deleted document {document_id}")
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error deleting document: {e}")
|
||||
return False
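

# Usage sketch: wire the agent to a local persistent ChromaDB collection.
# The EmbeddingModel constructor arguments and the sample file path below are
# assumptions; adjust them to match the project's utils and data layout.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    client = chromadb.PersistentClient(path="./chroma_db")
    collection = client.get_or_create_collection(name="documents")

    agent = IngestionAgent(collection=collection, embedding_model=EmbeddingModel())

    doc = agent.process("data/sample.pdf")  # parse, chunk, embed, store
    print(agent.get_statistics())           # -> {'total_chunks': ..., 'collection_name': 'documents'}
    agent.delete_document(doc.id)           # remove the test document's chunks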