sach91 bootcamp week8 exercise
"""
Ingestion Agent - Processes and stores documents in the vector database
"""
import logging
from typing import Dict, List
import uuid
from datetime import datetime

from agents.base_agent import BaseAgent
from models.document import Document, DocumentChunk
from utils.document_parser import DocumentParser
from utils.embeddings import EmbeddingModel
import chromadb

logger = logging.getLogger(__name__)


class IngestionAgent(BaseAgent):
    """Agent responsible for ingesting and storing documents"""

    def __init__(self, collection: chromadb.Collection,
                 embedding_model: EmbeddingModel,
                 llm_client=None, model: str = "llama3.2"):
        """
        Initialize ingestion agent

        Args:
            collection: ChromaDB collection for storage
            embedding_model: Model for generating embeddings
            llm_client: Optional shared LLM client
            model: Ollama model name
        """
        super().__init__(name="IngestionAgent", llm_client=llm_client, model=model)

        self.collection = collection
        self.embedding_model = embedding_model
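        # Overlapping chunks: the 200-unit overlap repeats trailing context so
        # text that straddles a chunk boundary remains retrievable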
|
||||
self.parser = DocumentParser(chunk_size=1000, chunk_overlap=200)
|
||||
|
||||
logger.info(f"{self.name} ready with ChromaDB collection")

    def process(self, file_path: str) -> Document:
        """
        Process and ingest a document

        Args:
            file_path: Path to the document file

        Returns:
            Document object with metadata
        """
        logger.info(f"{self.name} processing: {file_path}")

        # Parse the document
        parsed = self.parser.parse_file(file_path)

        # Generate document ID
        doc_id = str(uuid.uuid4())

        # Create document chunks
        chunks = []
        chunk_texts = []
        chunk_ids = []
        chunk_metadatas = []
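
        # These four lists stay index-aligned: the batch collection.add() call
        # below expects ids, documents, embeddings, and metadatas in matching order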
        for i, chunk_text in enumerate(parsed['chunks']):
            chunk_id = f"{doc_id}_chunk_{i}"

            chunk = DocumentChunk(
                id=chunk_id,
                document_id=doc_id,
                content=chunk_text,
                chunk_index=i,
                metadata={
                    'filename': parsed['filename'],
                    'extension': parsed['extension'],
                    'total_chunks': len(parsed['chunks'])
                }
            )

            chunks.append(chunk)
            chunk_texts.append(chunk_text)
            chunk_ids.append(chunk_id)
            chunk_metadatas.append({
                'document_id': doc_id,
                'filename': parsed['filename'],
                'chunk_index': i,
                'extension': parsed['extension']
            })

        # Generate embeddings
        logger.info(f"{self.name} generating embeddings for {len(chunks)} chunks")
        embeddings = self.embedding_model.embed_documents(chunk_texts)

        # Store in ChromaDB
        logger.info(f"{self.name} storing in ChromaDB")
        self.collection.add(
            ids=chunk_ids,
            documents=chunk_texts,
            embeddings=embeddings,
            metadatas=chunk_metadatas
        )

        # Create document object
        document = Document(
            id=doc_id,
            filename=parsed['filename'],
            filepath=parsed['filepath'],
            content=parsed['text'],
            chunks=chunks,
            metadata={
                'extension': parsed['extension'],
                'num_chunks': len(chunks),
                'total_chars': parsed['total_chars']
            },
            created_at=datetime.now()
        )

        logger.info(f"{self.name} successfully ingested: {document}")
        return document

    def get_statistics(self) -> Dict:
        """Get statistics about stored documents"""
        try:
            count = self.collection.count()
            return {
                'total_chunks': count,
                'collection_name': self.collection.name
            }
        except Exception as e:
            logger.error(f"Error getting statistics: {e}")
            return {'total_chunks': 0, 'error': str(e)}

    def delete_document(self, document_id: str) -> bool:
        """
        Delete all chunks of a document

        Args:
            document_id: ID of document to delete

        Returns:
            True if successful
        """
        try:
            # Get all chunk IDs for this document
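            # (the 'where' metadata filter matches the document_id recorded on
            # each chunk at ingestion time)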
|
||||
results = self.collection.get(
|
||||
where={"document_id": document_id}
|
||||
)
|
||||
|
||||
if results['ids']:
|
||||
self.collection.delete(ids=results['ids'])
|
||||
logger.info(f"{self.name} deleted document {document_id}")
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error deleting document: {e}")
|
||||
return False
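

# Usage sketch: wire the agent to a local persistent ChromaDB collection.
# The EmbeddingModel constructor arguments and the sample file path below are
# assumptions; adjust them to match the project's utils and data layout.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    client = chromadb.PersistentClient(path="./chroma_db")
    collection = client.get_or_create_collection(name="documents")

    agent = IngestionAgent(collection=collection, embedding_model=EmbeddingModel())

    doc = agent.process("data/sample.pdf")  # parse, chunk, embed, store
    print(agent.get_statistics())           # -> {'total_chunks': ..., 'collection_name': 'documents'}
    agent.delete_document(doc.id)           # remove the test document's chunks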