sach91 bootcamp week8 exercise

This commit is contained in:
sach91
2025-10-30 15:42:04 +05:30
parent 3fa7a3dad5
commit ef48ed539d
20 changed files with 3124 additions and 0 deletions


@@ -0,0 +1,12 @@
"""
models
"""
from .document_parser import DocumentParser
from .embeddings import EmbeddingModel
from .ollama_client import OllamaClient
__all__ = [
    'DocumentParser',
    'EmbeddingModel',
    'OllamaClient'
]
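
For orientation, here is a minimal sketch of how the three exports above might be wired together. It assumes the package directory is named `models` (as the module docstring suggests) and that the optional dependencies used by each component are installed.

# Hypothetical wiring of the three exported components; the `models` package
# name is an assumption taken from the module docstring.
from models import DocumentParser, EmbeddingModel, OllamaClient

parser = DocumentParser(chunk_size=1000, chunk_overlap=200)
embedder = EmbeddingModel()
llm = OllamaClient(model="llama3.2")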


@@ -0,0 +1,218 @@
"""
Document Parser - Extract text from various document formats
"""
import os
from typing import List, Dict, Optional
import logging
from pathlib import Path
logger = logging.getLogger(__name__)
class DocumentParser:
"""Parse various document formats into text chunks"""
SUPPORTED_FORMATS = ['.pdf', '.docx', '.txt', '.md', '.html', '.py']
def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
"""
Initialize document parser
Args:
chunk_size: Maximum characters per chunk
chunk_overlap: Overlap between chunks for context preservation
"""
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
def parse_file(self, file_path: str) -> Dict:
"""
Parse a file and return structured document data
Args:
file_path: Path to the file
Returns:
Dictionary with document metadata and chunks
"""
path = Path(file_path)
if not path.exists():
raise FileNotFoundError(f"File not found: {file_path}")
extension = path.suffix.lower()
if extension not in self.SUPPORTED_FORMATS:
raise ValueError(f"Unsupported format: {extension}")
# Extract text based on file type
if extension == '.pdf':
text = self._parse_pdf(file_path)
elif extension == '.docx':
text = self._parse_docx(file_path)
elif extension == '.txt' or extension == '.py':
text = self._parse_txt(file_path)
elif extension == '.md':
text = self._parse_markdown(file_path)
elif extension == '.html':
text = self._parse_html(file_path)
else:
text = ""
# Create chunks
chunks = self._create_chunks(text)
return {
'filename': path.name,
'filepath': str(path.absolute()),
'extension': extension,
'text': text,
'chunks': chunks,
'num_chunks': len(chunks),
'total_chars': len(text)
}
def _parse_pdf(self, file_path: str) -> str:
"""Extract text from PDF"""
try:
from pypdf import PdfReader
reader = PdfReader(file_path)
text = ""
for page in reader.pages:
text += page.extract_text() + "\n\n"
return text.strip()
except ImportError:
logger.error("pypdf not installed. Install with: pip install pypdf")
return ""
except Exception as e:
logger.error(f"Error parsing PDF: {e}")
return ""
def _parse_docx(self, file_path: str) -> str:
"""Extract text from DOCX"""
try:
from docx import Document
doc = Document(file_path)
text = "\n\n".join([para.text for para in doc.paragraphs if para.text.strip()])
return text.strip()
except ImportError:
logger.error("python-docx not installed. Install with: pip install python-docx")
return ""
except Exception as e:
logger.error(f"Error parsing DOCX: {e}")
return ""
def _parse_txt(self, file_path: str) -> str:
"""Extract text from TXT"""
try:
with open(file_path, 'r', encoding='utf-8') as f:
return f.read().strip()
except Exception as e:
logger.error(f"Error parsing TXT: {e}")
return ""
def _parse_markdown(self, file_path: str) -> str:
"""Extract text from Markdown"""
try:
import markdown
from bs4 import BeautifulSoup
with open(file_path, 'r', encoding='utf-8') as f:
md_text = f.read()
# Convert markdown to HTML then extract text
html = markdown.markdown(md_text)
soup = BeautifulSoup(html, 'html.parser')
text = soup.get_text()
return text.strip()
except ImportError:
# Fallback: just read as plain text
return self._parse_txt(file_path)
except Exception as e:
logger.error(f"Error parsing Markdown: {e}")
return ""
def _parse_html(self, file_path: str) -> str:
"""Extract text from HTML"""
try:
from bs4 import BeautifulSoup
with open(file_path, 'r', encoding='utf-8') as f:
html = f.read()
soup = BeautifulSoup(html, 'html.parser')
# Remove script and style elements
for script in soup(["script", "style"]):
script.decompose()
text = soup.get_text()
# Clean up whitespace
lines = (line.strip() for line in text.splitlines())
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
text = '\n'.join(chunk for chunk in chunks if chunk)
return text.strip()
except ImportError:
logger.error("beautifulsoup4 not installed. Install with: pip install beautifulsoup4")
return ""
except Exception as e:
logger.error(f"Error parsing HTML: {e}")
return ""
def _create_chunks(self, text: str) -> List[str]:
"""
Split text into overlapping chunks
Args:
text: Full text to chunk
Returns:
List of text chunks
"""
if not text:
return []
chunks = []
start = 0
text_length = len(text)
while start < text_length:
logger.info(f'Processing chunk at {start}, for len {text_length}.')
end = start + self.chunk_size
# If this isn't the last chunk, try to break at a sentence or paragraph
if end < text_length:
# Look for paragraph break first
break_pos = text.rfind('\n\n', start, end)
if break_pos == -1:
# Look for sentence break
break_pos = text.rfind('. ', start, end)
if break_pos == -1:
# Look for any space
break_pos = text.rfind(' ', start, end)
if break_pos != -1 and break_pos > start and break_pos > end - self.chunk_overlap:
end = break_pos + 1
chunk = text[start:end].strip()
if chunk:
chunks.append(chunk)
# Move start position with overlap
start = end - self.chunk_overlap
if start < 0:
start = 0
return chunks
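
A short usage sketch of the parser above: it only calls methods defined in this module, but the file name is a placeholder and the import path assumes the `models` package layout from the `__init__.py`.

# Sketch only: "report.pdf" is a placeholder path; the import path assumes
# this module lives in a package named `models`.
from models.document_parser import DocumentParser

parser = DocumentParser(chunk_size=1000, chunk_overlap=200)
doc = parser.parse_file("report.pdf")
print(doc['filename'], doc['num_chunks'], doc['total_chars'])
for chunk in doc['chunks'][:3]:
    print(chunk[:80], "...")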


@@ -0,0 +1,84 @@
"""
Embeddings utility using sentence-transformers
"""
from sentence_transformers import SentenceTransformer
import numpy as np
from typing import List, Union
import logging
logger = logging.getLogger(__name__)
class EmbeddingModel:
"""Wrapper for sentence transformer embeddings"""
def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
"""
Initialize embedding model
Args:
model_name: HuggingFace model name for embeddings
"""
self.model_name = model_name
logger.info(f"Loading embedding model: {model_name}")
self.model = SentenceTransformer(model_name)
self.dimension = self.model.get_sentence_embedding_dimension()
logger.info(f"Embedding dimension: {self.dimension}")
def embed(self, texts: Union[str, List[str]]) -> np.ndarray:
"""
Generate embeddings for text(s)
Args:
texts: Single text or list of texts
Returns:
Numpy array of embeddings
"""
if isinstance(texts, str):
texts = [texts]
embeddings = self.model.encode(texts, show_progress_bar=False)
return embeddings
def embed_query(self, query: str) -> List[float]:
"""
Embed a single query - returns as list for ChromaDB compatibility
Args:
query: Query text
Returns:
List of floats representing the embedding
"""
embedding = self.model.encode([query], show_progress_bar=False)[0]
return embedding.tolist()
def embed_documents(self, documents: List[str]) -> List[List[float]]:
"""
Embed multiple documents - returns as list of lists for ChromaDB
Args:
documents: List of document texts
Returns:
List of embeddings (each as list of floats)
"""
embeddings = self.model.encode(documents, show_progress_bar=False)
return embeddings.tolist()
def similarity(self, text1: str, text2: str) -> float:
"""
Calculate cosine similarity between two texts
Args:
text1: First text
text2: Second text
Returns:
Similarity score between 0 and 1
"""
emb1, emb2 = self.model.encode([text1, text2])
# Cosine similarity
similarity = np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))
return float(similarity)
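
A small sketch of how this wrapper might be exercised: the import path assumes the `models` package layout, the sentences are placeholders, and the 384-dimension figure applies to the default all-MiniLM-L6-v2 model.

# Sketch only: import path assumes a `models` package; texts are illustrative.
from models.embeddings import EmbeddingModel

embedder = EmbeddingModel()  # downloads all-MiniLM-L6-v2 on first use
docs = ["Ollama runs LLMs locally.", "ChromaDB stores vector embeddings."]
doc_vectors = embedder.embed_documents(docs)   # list of lists, ChromaDB-ready
query_vector = embedder.embed_query("local language models")
print(len(doc_vectors), len(query_vector))     # 2 documents, 384 dims for MiniLM-L6-v2
print(embedder.similarity(docs[0], docs[1]))   # cosine similarity in [-1, 1]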


@@ -0,0 +1,107 @@
"""
Ollama Client - Wrapper for local Ollama API
"""
import requests
import json
from typing import List, Dict, Optional
import logging
logger = logging.getLogger(__name__)
class OllamaClient:
"""Client for interacting with local Ollama models"""
def __init__(self, base_url: str = "http://localhost:11434", model: str = "llama3.2"):
self.base_url = base_url
self.model = model
self.api_url = f"{base_url}/api"
def generate(self, prompt: str, system: Optional[str] = None,
temperature: float = 0.7, max_tokens: int = 2048) -> str:
"""Generate text from a prompt"""
try:
payload = {
"model": self.model,
"prompt": prompt,
"stream": False,
"options": {
"temperature": temperature,
"num_predict": max_tokens
}
}
if system:
payload["system"] = system
response = requests.post(
f"{self.api_url}/generate",
json=payload,
timeout=1200
)
response.raise_for_status()
result = response.json()
return result.get("response", "").strip()
except requests.exceptions.RequestException as e:
logger.error(f"Ollama API error: {e}")
return f"Error: Unable to connect to Ollama. Is it running? ({str(e)})"
def chat(self, messages: List[Dict[str, str]],
temperature: float = 0.7, max_tokens: int = 2048) -> str:
"""Chat completion with message history"""
try:
payload = {
"model": self.model,
"messages": messages,
"stream": False,
"options": {
"temperature": temperature,
"num_predict": max_tokens
}
}
response = requests.post(
f"{self.api_url}/chat",
json=payload,
timeout=1200
)
response.raise_for_status()
result = response.json()
return result.get("message", {}).get("content", "").strip()
except requests.exceptions.RequestException as e:
logger.error(f"Ollama API error: {e}")
return f"Error: Unable to connect to Ollama. Is it running? ({str(e)})"
def check_connection(self) -> bool:
"""Check if Ollama is running and model is available"""
try:
response = requests.get(f"{self.base_url}/api/tags", timeout=5)
response.raise_for_status()
models = response.json().get("models", [])
model_names = [m["name"] for m in models]
if self.model not in model_names:
logger.warning(f"Model {self.model} not found. Available: {model_names}")
return False
return True
except requests.exceptions.RequestException as e:
logger.error(f"Cannot connect to Ollama: {e}")
return False
def list_models(self) -> List[str]:
"""List available Ollama models"""
try:
response = requests.get(f"{self.base_url}/api/tags", timeout=5)
response.raise_for_status()
models = response.json().get("models", [])
return [m["name"] for m in models]
except requests.exceptions.RequestException:
return []
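
To tie the three modules together, a hedged end-to-end sketch of a minimal retrieval-augmented flow. It assumes the `models` package layout, a local Ollama server with `llama3.2` pulled, a placeholder document "notes.txt", and an in-memory numpy ranking standing in for a real vector store.

# Sketch only: assumes the `models` package layout, a running Ollama server,
# and "notes.txt" as a placeholder document; ranking is done in memory with
# numpy rather than a vector database such as ChromaDB.
import numpy as np
from models import DocumentParser, EmbeddingModel, OllamaClient

llm = OllamaClient(model="llama3.2")
if not llm.check_connection():
    raise SystemExit("Ollama is not running or the model is not pulled")

doc = DocumentParser().parse_file("notes.txt")
embedder = EmbeddingModel()
chunk_vectors = np.array(embedder.embed_documents(doc['chunks']))

question = "What is this document about?"
query = np.array(embedder.embed_query(question))
scores = chunk_vectors @ query  # rough relevance ranking (unnormalized dot product)
context = "\n\n".join(doc['chunks'][i] for i in scores.argsort()[::-1][:3])

answer = llm.generate(
    prompt=f"Context:\n{context}\n\nQuestion: {question}",
    system="Answer using only the provided context.",
    temperature=0.2,
)
print(answer)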