83 lines
2.3 KiB
Python
83 lines
2.3 KiB
Python
"""
Document data models
"""
|
|
from dataclasses import dataclass, field
|
|
from typing import List, Dict, Optional
|
|
from datetime import datetime
|
|
|
|
@dataclass
class DocumentChunk:
    """One chunk of a document's text.

    ``metadata`` is optional; each instance gets its own empty dict when
    none is supplied.
    """

    id: str
    document_id: str
    content: str
    chunk_index: int
    metadata: Dict = field(default_factory=dict)

    def __str__(self):
        """Return ``Chunk <index>: <preview>``.

        The preview is the content itself, cut to its first 100 characters
        and suffixed with ``...`` only when the content is longer than that.
        """
        text = self.content
        if len(text) > 100:
            text = text[:100] + "..."
        return f"Chunk {self.chunk_index}: {text}"
|
|
|
|
@dataclass
class Document:
    """Represents a complete document.

    ``metadata`` and ``created_at`` are optional: each instance gets its own
    empty dict and the value of ``datetime.now()`` at construction time
    when they are not supplied.
    """

    id: str
    filename: str
    filepath: str
    content: str
    # Quoted forward reference: the annotation is not evaluated when the
    # class body runs, so this class does not depend on definition order.
    chunks: List["DocumentChunk"]
    metadata: Dict = field(default_factory=dict)
    created_at: datetime = field(default_factory=datetime.now)

    @property
    def num_chunks(self) -> int:
        """Number of entries in ``chunks``."""
        return len(self.chunks)

    @property
    def total_chars(self) -> int:
        """Length of the full ``content`` string."""
        return len(self.content)

    @property
    def extension(self) -> str:
        """The ``'extension'`` entry of ``metadata``, or ``''`` if absent."""
        return self.metadata.get('extension', '')

    def __str__(self) -> str:
        """Return a one-line summary with filename, chunk and char counts."""
        return f"Document: {self.filename} ({self.num_chunks} chunks, {self.total_chars} chars)"

    def to_dict(self) -> Dict:
        """Convert to dictionary for storage.

        Returns:
            A dict with the keys ``id``, ``filename``, ``filepath``,
            ``content``, ``num_chunks``, ``total_chars``, ``extension``,
            ``created_at`` and ``metadata``. ``content`` is limited to the
            first 500 characters followed by ``...`` when the document is
            longer than that; ``total_chars`` still reports the full
            length. ``created_at`` is an ISO-8601 string. The chunks
            themselves are not included.
        """
        content_preview = self.content
        if len(content_preview) > 500:
            content_preview = content_preview[:500] + '...'

        return {
            'id': self.id,
            'filename': self.filename,
            'filepath': self.filepath,
            'content': content_preview,
            'num_chunks': self.num_chunks,
            'total_chars': self.total_chars,
            'extension': self.extension,
            'created_at': self.created_at.isoformat(),
            # Shallow copy so that editing the stored dict cannot silently
            # change this document's own metadata.
            'metadata': dict(self.metadata),
        }
|
|
|
|
@dataclass
class SearchResult:
    """Represents a search result from the vector database.

    Holds the matching chunk, its score, and the id and name of the
    document the result is attributed to.
    """

    chunk: DocumentChunk
    score: float
    document_id: str
    document_name: str

    def __str__(self):
        """Return ``<document_name> (score: <score>)``, score to 2 decimals."""
        formatted_score = format(self.score, ".2f")
        return f"{self.document_name} (score: {formatted_score})"
|
|
|
|
@dataclass
class Summary:
    """Represents a document summary.

    ``key_points`` and ``created_at`` are optional: each instance gets its
    own empty list and the value of ``datetime.now()`` at construction time
    when they are not supplied.
    """

    document_id: str
    document_name: str
    summary_text: str
    key_points: List[str] = field(default_factory=list)
    created_at: datetime = field(default_factory=datetime.now)

    def __str__(self):
        """Return ``Summary of <document_name>: <preview>``.

        The preview is the summary text, cut to its first 100 characters.
        The ``...`` suffix is added only when text was actually cut off, so
        a short summary is not shown as if it had been truncated.
        """
        preview = self.summary_text
        if len(preview) > 100:
            preview = preview[:100] + "..."
        return f"Summary of {self.document_name}: {preview}"
|