LLM_Engineering_OLD/community-contributions/sach91-bootcamp/week8/agents/ingestion_agent.py

"""
Ingestion Agent - Processes and stores documents in the vector database
"""
import logging
from typing import Dict, List
import uuid
from datetime import datetime
from agents.base_agent import BaseAgent
from models.document import Document, DocumentChunk
from utils.document_parser import DocumentParser
from utils.embeddings import EmbeddingModel
import chromadb

logger = logging.getLogger(__name__)


class IngestionAgent(BaseAgent):
    """Agent responsible for ingesting and storing documents"""

    def __init__(self, collection: chromadb.Collection,
                 embedding_model: EmbeddingModel,
                 llm_client=None, model: str = "llama3.2"):
        """
        Initialize ingestion agent

        Args:
            collection: ChromaDB collection for storage
            embedding_model: Model for generating embeddings
            llm_client: Optional shared LLM client
            model: Ollama model name
        """
        super().__init__(name="IngestionAgent", llm_client=llm_client, model=model)
        self.collection = collection
        self.embedding_model = embedding_model
        self.parser = DocumentParser(chunk_size=1000, chunk_overlap=200)
        logger.info(f"{self.name} ready with ChromaDB collection")

    def process(self, file_path: str) -> Document:
        """
        Process and ingest a document

        Args:
            file_path: Path to the document file

        Returns:
            Document object with metadata
        """
        logger.info(f"{self.name} processing: {file_path}")

        # Parse the document
        parsed = self.parser.parse_file(file_path)

        # Generate document ID
        doc_id = str(uuid.uuid4())

        # Create document chunks
        chunks = []
        chunk_texts = []
        chunk_ids = []
        chunk_metadatas = []

        for i, chunk_text in enumerate(parsed['chunks']):
            chunk_id = f"{doc_id}_chunk_{i}"
            chunk = DocumentChunk(
                id=chunk_id,
                document_id=doc_id,
                content=chunk_text,
                chunk_index=i,
                metadata={
                    'filename': parsed['filename'],
                    'extension': parsed['extension'],
                    'total_chunks': len(parsed['chunks'])
                }
            )
            chunks.append(chunk)
            chunk_texts.append(chunk_text)
            chunk_ids.append(chunk_id)
            chunk_metadatas.append({
                'document_id': doc_id,
                'filename': parsed['filename'],
                'chunk_index': i,
                'extension': parsed['extension']
            })

        # Generate embeddings
        logger.info(f"{self.name} generating embeddings for {len(chunks)} chunks")
        embeddings = self.embedding_model.embed_documents(chunk_texts)

        # Store in ChromaDB
        logger.info(f"{self.name} storing in ChromaDB")
        self.collection.add(
            ids=chunk_ids,
            documents=chunk_texts,
            embeddings=embeddings,
            metadatas=chunk_metadatas
        )

        # Create document object
        document = Document(
            id=doc_id,
            filename=parsed['filename'],
            filepath=parsed['filepath'],
            content=parsed['text'],
            chunks=chunks,
            metadata={
                'extension': parsed['extension'],
                'num_chunks': len(chunks),
                'total_chars': parsed['total_chars']
            },
            created_at=datetime.now()
        )

        logger.info(f"{self.name} successfully ingested: {document}")
        return document
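
    # Example call (illustrative; the file path is hypothetical):
    #
    #     document = agent.process("data/report.pdf")
    #     print(document.id, len(document.chunks))
    #
    # Re-ingesting the same file produces a fresh uuid4 document_id, so
    # existing chunks are not overwritten; call delete_document() first if a
    # replacement is intended.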

    def get_statistics(self) -> Dict:
        """Get statistics about stored documents"""
        try:
            count = self.collection.count()
            return {
                'total_chunks': count,
                'collection_name': self.collection.name
            }
        except Exception as e:
            logger.error(f"Error getting statistics: {e}")
            return {'total_chunks': 0, 'error': str(e)}
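
    # Illustrative return shapes (the values shown are made up):
    #     {'total_chunks': 128, 'collection_name': 'documents'}
    # or, if the count fails:
    #     {'total_chunks': 0, 'error': '<exception message>'}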

    def delete_document(self, document_id: str) -> bool:
        """
        Delete all chunks of a document

        Args:
            document_id: ID of document to delete

        Returns:
            True if successful
        """
        try:
            # Get all chunk IDs for this document
            results = self.collection.get(
                where={"document_id": document_id}
            )
            if results['ids']:
                self.collection.delete(ids=results['ids'])
                logger.info(f"{self.name} deleted document {document_id}")
                return True
            return False
        except Exception as e:
            logger.error(f"Error deleting document: {e}")
            return False
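

# Minimal usage sketch (not part of the original module's public surface).
# The ChromaDB setup uses the standard chromadb client API; the
# EmbeddingModel constructor call and the sample file path are assumptions
# made purely for illustration.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Assumed setup: an in-memory ChromaDB collection and a default
    # EmbeddingModel; adjust to match the project's actual configuration.
    client = chromadb.Client()
    collection = client.get_or_create_collection(name="documents")
    embedding_model = EmbeddingModel()  # hypothetical default constructor

    agent = IngestionAgent(collection=collection, embedding_model=embedding_model)

    # Hypothetical input file for illustration.
    doc = agent.process("sample_docs/example.txt")
    print(f"Ingested {doc.filename} into {len(doc.chunks)} chunks")
    print(agent.get_statistics())

    # Remove the document's chunks again.
    agent.delete_document(doc.id)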