Files
LLM_Engineering_OLD/community-contributions/sach91-bootcamp/week8/models/document.py
2025-10-30 15:42:04 +05:30

83 lines
2.3 KiB
Python

"""
Document data models
"""
from dataclasses import dataclass, field
from typing import List, Dict, Optional
from datetime import datetime
@dataclass
class DocumentChunk:
    """One piece of a document's text, identified by its position.

    Attributes:
        id: Identifier of this chunk.
        document_id: Identifier of the document the chunk belongs to.
        content: Text held by the chunk.
        chunk_index: Index of the chunk, shown in the string form.
        metadata: Free-form dictionary; a new empty dict per instance
            when not supplied.
    """

    id: str
    document_id: str
    content: str
    chunk_index: int
    metadata: Dict = field(default_factory=dict)

    def __str__(self):
        """Return ``"Chunk <index>: <preview>"`` with a bounded preview.

        The preview is the content itself when it has at most 100
        characters; otherwise the first 100 characters followed by "...".
        """
        if len(self.content) <= 100:
            preview = self.content
        else:
            preview = self.content[:100] + "..."
        return f"Chunk {self.chunk_index}: {preview}"
@dataclass
class Document:
    """A complete document together with the chunks produced from it.

    Attributes:
        id: Identifier of the document.
        filename: File name; used in the string form.
        filepath: Path of the file, stored as given.
        content: Full text of the document.
        chunks: The ``DocumentChunk`` objects belonging to this document.
        metadata: Free-form dictionary; the ``'extension'`` key, when
            present, backs the ``extension`` property. A new empty dict
            per instance when not supplied.
        created_at: Defaults to ``datetime.now()`` at construction time
            (a naive datetime, since no timezone is passed).
    """

    id: str
    filename: str
    filepath: str
    content: str
    chunks: List[DocumentChunk]
    metadata: Dict = field(default_factory=dict)
    created_at: datetime = field(default_factory=datetime.now)

    @property
    def num_chunks(self) -> int:
        """Number of entries in ``chunks``."""
        return len(self.chunks)

    @property
    def total_chars(self) -> int:
        """Length of ``content`` in characters."""
        return len(self.content)

    @property
    def extension(self) -> str:
        """Value of ``metadata['extension']``, or ``''`` when the key is absent."""
        return self.metadata.get('extension', '')

    def __str__(self):
        """Return a one-line summary with file name, chunk and character counts."""
        return f"Document: {self.filename} ({self.num_chunks} chunks, {self.total_chars} chars)"

    def to_dict(self) -> Dict:
        """Convert to dictionary for storage.

        The chunks themselves are not included, only their count.
        ``content`` is reduced to its first 500 characters followed by
        ``'...'`` when it is longer than 500 characters.

        Returns:
            A new dict with the keys ``id``, ``filename``, ``filepath``,
            ``content``, ``num_chunks``, ``total_chars``, ``extension``,
            ``created_at`` (ISO 8601 string) and ``metadata``.
        """
        if len(self.content) > 500:
            content_preview = self.content[:500] + '...'
        else:
            content_preview = self.content

        return {
            'id': self.id,
            'filename': self.filename,
            'filepath': self.filepath,
            'content': content_preview,
            'num_chunks': self.num_chunks,
            'total_chars': self.total_chars,
            'extension': self.extension,
            'created_at': self.created_at.isoformat(),
            # Shallow copy: adding or removing keys on the stored dict must
            # not silently change this document's own metadata.
            'metadata': dict(self.metadata),
        }
@dataclass
class SearchResult:
    """A single hit returned by a vector-database search.

    Attributes:
        chunk: The matched ``DocumentChunk``.
        score: Score of the match; printed with two decimals.
        document_id: Identifier of the document containing the chunk.
        document_name: Name of that document; used in the string form.
    """

    chunk: DocumentChunk
    score: float
    document_id: str
    document_name: str

    def __str__(self):
        """Return ``"<document_name> (score: <score>)"`` with the score to 2 decimals."""
        return f"{self.document_name} (score: {self.score:.2f})"
@dataclass
class Summary:
    """A summary produced for one document.

    Attributes:
        document_id: Identifier of the summarized document.
        document_name: Name of the summarized document; used in the
            string form.
        summary_text: The summary itself.
        key_points: List of key-point strings; a new empty list per
            instance when not supplied.
        created_at: Defaults to ``datetime.now()`` at construction time
            (a naive datetime, since no timezone is passed).
    """

    document_id: str
    document_name: str
    summary_text: str
    key_points: List[str] = field(default_factory=list)
    created_at: datetime = field(default_factory=datetime.now)

    def __str__(self):
        """Return ``"Summary of <name>: <preview>"`` with a bounded preview.

        The preview is the whole summary when it has at most 100
        characters. Longer text is cut to its first 100 characters and
        marked with "..."; the ellipsis is added only when something was
        actually cut off.
        """
        if len(self.summary_text) > 100:
            preview = self.summary_text[:100] + "..."
        else:
            preview = self.summary_text
        return f"Summary of {self.document_name}: {preview}"