158 lines
4.9 KiB
Python
158 lines
4.9 KiB
Python
"""
|
|
Ingestion Agent - Processes and stores documents in the vector database
|
|
"""
|
|
import logging
|
|
from typing import Dict, List
|
|
import uuid
|
|
from datetime import datetime
|
|
|
|
from agents.base_agent import BaseAgent
|
|
from models.document import Document, DocumentChunk
|
|
from utils.document_parser import DocumentParser
|
|
from utils.embeddings import EmbeddingModel
|
|
import chromadb
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
class IngestionAgent(BaseAgent):
    """Agent responsible for ingesting and storing documents.

    Parses a file into text chunks with ``DocumentParser``, embeds the chunks
    with the supplied embedding model, and stores ids, texts, embeddings and
    metadata in the supplied ChromaDB collection. It also reports the number
    of stored chunks and deletes all chunks belonging to one document.
    """

    def __init__(self, collection: chromadb.Collection,
                 embedding_model: EmbeddingModel,
                 llm_client=None, model: str = "llama3.2",
                 *, chunk_size: int = 1000, chunk_overlap: int = 200):
        """
        Initialize ingestion agent

        Args:
            collection: ChromaDB collection for storage
            embedding_model: Model for generating embeddings
            llm_client: Optional shared LLM client
            model: Ollama model name
            chunk_size: Forwarded to DocumentParser as ``chunk_size``
                (keyword-only; defaults to the previously hard-coded 1000)
            chunk_overlap: Forwarded to DocumentParser as ``chunk_overlap``
                (keyword-only; defaults to the previously hard-coded 200)
        """
        super().__init__(name="IngestionAgent", llm_client=llm_client, model=model)

        self.collection = collection
        self.embedding_model = embedding_model
        self.parser = DocumentParser(chunk_size=chunk_size,
                                     chunk_overlap=chunk_overlap)

        logger.info(f"{self.name} ready with ChromaDB collection")

    def process(self, file_path: str) -> Document:
        """
        Process and ingest a document

        The file is parsed into chunks, every chunk gets an id of the form
        ``<doc_id>_chunk_<index>``, and the chunks are embedded and added to
        the collection in a single call. When the parser yields no chunks,
        embedding and storage are skipped and a Document without chunks is
        returned.

        Args:
            file_path: Path to the document file

        Returns:
            Document object with metadata
        """
        logger.info(f"{self.name} processing: {file_path}")

        # Parse the document
        parsed = self.parser.parse_file(file_path)

        # Generate document ID (random UUID, so re-ingesting a file creates
        # a new document rather than overwriting the previous one)
        doc_id = str(uuid.uuid4())

        filename = parsed['filename']
        extension = parsed['extension']
        chunk_texts = list(parsed['chunks'])
        total_chunks = len(chunk_texts)  # hoisted: invariant across the loop

        # Create document chunks plus the parallel lists ChromaDB expects
        chunks = []
        chunk_ids = []
        chunk_metadatas = []

        for i, chunk_text in enumerate(chunk_texts):
            chunk_id = f"{doc_id}_chunk_{i}"

            chunks.append(DocumentChunk(
                id=chunk_id,
                document_id=doc_id,
                content=chunk_text,
                chunk_index=i,
                metadata={
                    'filename': filename,
                    'extension': extension,
                    'total_chunks': total_chunks
                }
            ))
            chunk_ids.append(chunk_id)
            # 'document_id' is the key delete_document() filters on.
            chunk_metadatas.append({
                'document_id': doc_id,
                'filename': filename,
                'chunk_index': i,
                'extension': extension
            })

        if chunks:
            # Generate embeddings
            logger.info(f"{self.name} generating embeddings for {len(chunks)} chunks")
            embeddings = self.embedding_model.embed_documents(chunk_texts)

            # Store in ChromaDB
            logger.info(f"{self.name} storing in ChromaDB")
            self.collection.add(
                ids=chunk_ids,
                documents=chunk_texts,
                embeddings=embeddings,
                metadatas=chunk_metadatas
            )
        else:
            # Nothing to embed or store; adding an empty batch to the
            # collection would fail, so skip both steps.
            logger.warning(
                f"{self.name} found no chunks in {file_path}; nothing stored"
            )

        # Create document object
        document = Document(
            id=doc_id,
            filename=filename,
            filepath=parsed['filepath'],
            content=parsed['text'],
            chunks=chunks,
            metadata={
                'extension': extension,
                'num_chunks': len(chunks),
                'total_chars': parsed['total_chars']
            },
            created_at=datetime.now()
        )

        logger.info(f"{self.name} successfully ingested: {document}")
        return document

    def get_statistics(self) -> Dict:
        """
        Get statistics about stored documents

        Returns:
            Dict with 'total_chunks' and 'collection_name' on success; on
            failure, a dict with 'total_chunks' set to 0 and an 'error'
            message (the exception is logged, not raised).
        """
        try:
            count = self.collection.count()
            return {
                'total_chunks': count,
                'collection_name': self.collection.name
            }
        except Exception as e:
            # Best-effort API: report the failure in the result instead of
            # raising, but keep the traceback in the log.
            logger.exception(f"Error getting statistics: {e}")
            return {'total_chunks': 0, 'error': str(e)}

    def delete_document(self, document_id: str) -> bool:
        """
        Delete all chunks of a document

        Args:
            document_id: ID of document to delete

        Returns:
            True if matching chunks were found and deleted; False if no
            chunk matched or an error occurred (the error is logged).
        """
        try:
            # Get all chunk IDs for this document. Only the ids are needed,
            # so ask ChromaDB not to return documents or metadatas.
            results = self.collection.get(
                where={"document_id": document_id},
                include=[]
            )
            chunk_ids = results['ids']

            if not chunk_ids:
                return False

            self.collection.delete(ids=chunk_ids)
            logger.info(f"{self.name} deleted document {document_id}")
            return True

        except Exception as e:
            logger.exception(f"Error deleting document: {e}")
            return False
|