LLM_Engineering_OLD/community-contributions/sach91-bootcamp/week8/agents/ingestion_agent.py

"""
Ingestion Agent - Processes and stores documents in the vector database
"""
import logging
from typing import Dict, List
import uuid
from datetime import datetime
from agents.base_agent import BaseAgent
from models.document import Document, DocumentChunk
from utils.document_parser import DocumentParser
from utils.embeddings import EmbeddingModel
import chromadb

logger = logging.getLogger(__name__)


class IngestionAgent(BaseAgent):
    """Agent responsible for ingesting and storing documents"""

    def __init__(self, collection: chromadb.Collection,
                 embedding_model: EmbeddingModel,
                 llm_client=None, model: str = "llama3.2"):
        """
        Initialize ingestion agent

        Args:
            collection: ChromaDB collection for storage
            embedding_model: Model for generating embeddings
            llm_client: Optional shared LLM client
            model: Ollama model name
        """
        super().__init__(name="IngestionAgent", llm_client=llm_client, model=model)
        self.collection = collection
        self.embedding_model = embedding_model
        self.parser = DocumentParser(chunk_size=1000, chunk_overlap=200)
        logger.info(f"{self.name} ready with ChromaDB collection")

    def process(self, file_path: str) -> Document:
        """
        Process and ingest a document

        Args:
            file_path: Path to the document file

        Returns:
            Document object with metadata
        """
        logger.info(f"{self.name} processing: {file_path}")

        # Parse the document
        parsed = self.parser.parse_file(file_path)

        # Generate document ID
        doc_id = str(uuid.uuid4())

        # Create document chunks
        chunks = []
        chunk_texts = []
        chunk_ids = []
        chunk_metadatas = []

        for i, chunk_text in enumerate(parsed['chunks']):
            chunk_id = f"{doc_id}_chunk_{i}"
            chunk = DocumentChunk(
                id=chunk_id,
                document_id=doc_id,
                content=chunk_text,
                chunk_index=i,
                metadata={
                    'filename': parsed['filename'],
                    'extension': parsed['extension'],
                    'total_chunks': len(parsed['chunks'])
                }
            )
            chunks.append(chunk)
            chunk_texts.append(chunk_text)
            chunk_ids.append(chunk_id)
            chunk_metadatas.append({
                'document_id': doc_id,
                'filename': parsed['filename'],
                'chunk_index': i,
                'extension': parsed['extension']
            })

        # Generate embeddings
        logger.info(f"{self.name} generating embeddings for {len(chunks)} chunks")
        embeddings = self.embedding_model.embed_documents(chunk_texts)

        # Store in ChromaDB
        logger.info(f"{self.name} storing in ChromaDB")
        self.collection.add(
            ids=chunk_ids,
            documents=chunk_texts,
            embeddings=embeddings,
            metadatas=chunk_metadatas
        )

        # Create document object
        document = Document(
            id=doc_id,
            filename=parsed['filename'],
            filepath=parsed['filepath'],
            content=parsed['text'],
            chunks=chunks,
            metadata={
                'extension': parsed['extension'],
                'num_chunks': len(chunks),
                'total_chars': parsed['total_chars']
            },
            created_at=datetime.now()
        )

        logger.info(f"{self.name} successfully ingested: {document}")
        return document
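
    # Example call (illustrative; the file path is hypothetical):
    #
    #     document = agent.process("data/report.pdf")
    #     print(document.id, len(document.chunks))
    #
    # Re-ingesting the same file produces a fresh uuid4 document_id, so
    # existing chunks are not overwritten; call delete_document() first if a
    # replacement is intended.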

    def get_statistics(self) -> Dict:
        """Get statistics about stored documents"""
        try:
            count = self.collection.count()
            return {
                'total_chunks': count,
                'collection_name': self.collection.name
            }
        except Exception as e:
            logger.error(f"Error getting statistics: {e}")
            return {'total_chunks': 0, 'error': str(e)}
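
    # Illustrative return shapes (the values shown are made up):
    #     {'total_chunks': 128, 'collection_name': 'documents'}
    # or, if the count fails:
    #     {'total_chunks': 0, 'error': '<exception message>'}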

    def delete_document(self, document_id: str) -> bool:
        """
        Delete all chunks of a document

        Args:
            document_id: ID of document to delete

        Returns:
            True if successful
        """
        try:
            # Get all chunk IDs for this document
            results = self.collection.get(
                where={"document_id": document_id}
            )
            if results['ids']:
                self.collection.delete(ids=results['ids'])
                logger.info(f"{self.name} deleted document {document_id}")
                return True
            return False
        except Exception as e:
            logger.error(f"Error deleting document: {e}")
            return False
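

# Minimal usage sketch (not part of the original module's public surface).
# The ChromaDB setup uses the standard chromadb client API; the
# EmbeddingModel constructor call and the sample file path are assumptions
# made purely for illustration.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Assumed setup: an in-memory ChromaDB collection and a default
    # EmbeddingModel; adjust to match the project's actual configuration.
    client = chromadb.Client()
    collection = client.get_or_create_collection(name="documents")
    embedding_model = EmbeddingModel()  # hypothetical default constructor

    agent = IngestionAgent(collection=collection, embedding_model=embedding_model)

    # Hypothetical input file for illustration.
    doc = agent.process("sample_docs/example.txt")
    print(f"Ingested {doc.filename} into {len(doc.chunks)} chunks")
    print(agent.get_statistics())

    # Remove the document's chunks again.
    agent.delete_document(doc.id)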