sach91 bootcamp week8 exercise
models/__init__.py
@@ -0,0 +1,12 @@
"""
models
"""
from .document_parser import DocumentParser
from .embeddings import EmbeddingModel
from .ollama_client import OllamaClient

__all__ = [
    'DocumentParser',
    'EmbeddingModel',
    'OllamaClient'
]
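
With the package laid out as above, a minimal import sketch (assuming the package directory is named models, as the docstring suggests):

# Hypothetical usage of the package exports
from models import DocumentParser, EmbeddingModel, OllamaClient

parser = DocumentParser(chunk_size=1000, chunk_overlap=200)
embedder = EmbeddingModel()
llm = OllamaClient(model="llama3.2")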

models/document_parser.py
@@ -0,0 +1,218 @@
"""
Document Parser - Extract text from various document formats
"""
from typing import List, Dict
import logging
from pathlib import Path

logger = logging.getLogger(__name__)


class DocumentParser:
    """Parse various document formats into text chunks"""

    SUPPORTED_FORMATS = ['.pdf', '.docx', '.txt', '.md', '.html', '.py']

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """
        Initialize document parser

        Args:
            chunk_size: Maximum characters per chunk
            chunk_overlap: Overlap between chunks for context preservation
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def parse_file(self, file_path: str) -> Dict:
        """
        Parse a file and return structured document data

        Args:
            file_path: Path to the file

        Returns:
            Dictionary with document metadata and chunks
        """
        path = Path(file_path)

        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        extension = path.suffix.lower()

        if extension not in self.SUPPORTED_FORMATS:
            raise ValueError(f"Unsupported format: {extension}")

        # Extract text based on file type
        if extension == '.pdf':
            text = self._parse_pdf(file_path)
        elif extension == '.docx':
            text = self._parse_docx(file_path)
        elif extension in ('.txt', '.py'):
            text = self._parse_txt(file_path)
        elif extension == '.md':
            text = self._parse_markdown(file_path)
        elif extension == '.html':
            text = self._parse_html(file_path)
        else:
            text = ""

        # Create chunks
        chunks = self._create_chunks(text)

        return {
            'filename': path.name,
            'filepath': str(path.absolute()),
            'extension': extension,
            'text': text,
            'chunks': chunks,
            'num_chunks': len(chunks),
            'total_chars': len(text)
        }
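
    # Shape of the returned mapping (values are illustrative):
    #   {'filename': 'notes.txt', 'filepath': '/abs/path/notes.txt',
    #    'extension': '.txt', 'text': '...', 'chunks': ['...', '...'],
    #    'num_chunks': 2, 'total_chars': 1800}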

    def _parse_pdf(self, file_path: str) -> str:
        """Extract text from PDF"""
        try:
            from pypdf import PdfReader

            reader = PdfReader(file_path)
            text = ""

            for page in reader.pages:
                text += page.extract_text() + "\n\n"

            return text.strip()

        except ImportError:
            logger.error("pypdf not installed. Install with: pip install pypdf")
            return ""
        except Exception as e:
            logger.error(f"Error parsing PDF: {e}")
            return ""

    def _parse_docx(self, file_path: str) -> str:
        """Extract text from DOCX"""
        try:
            from docx import Document

            doc = Document(file_path)
            text = "\n\n".join([para.text for para in doc.paragraphs if para.text.strip()])

            return text.strip()

        except ImportError:
            logger.error("python-docx not installed. Install with: pip install python-docx")
            return ""
        except Exception as e:
            logger.error(f"Error parsing DOCX: {e}")
            return ""

    def _parse_txt(self, file_path: str) -> str:
        """Extract text from TXT"""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read().strip()
        except Exception as e:
            logger.error(f"Error parsing TXT: {e}")
            return ""

    def _parse_markdown(self, file_path: str) -> str:
        """Extract text from Markdown"""
        try:
            import markdown
            from bs4 import BeautifulSoup

            with open(file_path, 'r', encoding='utf-8') as f:
                md_text = f.read()

            # Convert markdown to HTML, then extract the plain text
            html = markdown.markdown(md_text)
            soup = BeautifulSoup(html, 'html.parser')
            text = soup.get_text()

            return text.strip()

        except ImportError:
            # Fallback: just read as plain text
            return self._parse_txt(file_path)
        except Exception as e:
            logger.error(f"Error parsing Markdown: {e}")
            return ""

    def _parse_html(self, file_path: str) -> str:
        """Extract text from HTML"""
        try:
            from bs4 import BeautifulSoup

            with open(file_path, 'r', encoding='utf-8') as f:
                html = f.read()

            soup = BeautifulSoup(html, 'html.parser')

            # Remove script and style elements
            for script in soup(["script", "style"]):
                script.decompose()

            text = soup.get_text()

            # Clean up whitespace: split on double spaces so words
            # within a phrase stay together
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            text = '\n'.join(chunk for chunk in chunks if chunk)

            return text.strip()

        except ImportError:
            logger.error("beautifulsoup4 not installed. Install with: pip install beautifulsoup4")
            return ""
        except Exception as e:
            logger.error(f"Error parsing HTML: {e}")
            return ""

    def _create_chunks(self, text: str) -> List[str]:
        """
        Split text into overlapping chunks

        Args:
            text: Full text to chunk

        Returns:
            List of text chunks
        """
        if not text:
            return []

        chunks = []
        start = 0
        text_length = len(text)

        while start < text_length:
            logger.debug(f"Chunking at offset {start} of {text_length} characters")

            end = start + self.chunk_size

            # If this isn't the last chunk, try to break at a paragraph,
            # sentence, or word boundary (in that order of preference)
            if end < text_length:
                break_pos = text.rfind('\n\n', start, end)
                if break_pos == -1:
                    break_pos = text.rfind('. ', start, end)
                if break_pos == -1:
                    break_pos = text.rfind(' ', start, end)

                # Only use the boundary if it doesn't shrink the chunk
                # by more than the overlap
                if break_pos != -1 and break_pos > start and break_pos > end - self.chunk_overlap:
                    end = break_pos + 1

            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)

            # Stop once the end of the text is reached; otherwise the
            # final overlap step would emit a duplicate tail chunk
            if end >= text_length:
                break

            # Step back by the overlap, but always advance by at least one
            # character so the loop cannot stall when overlap >= chunk size
            start = max(end - self.chunk_overlap, start + 1)

        return chunks
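
    # Worked example with the defaults (chunk_size=1000, chunk_overlap=200):
    # a 2,400-character text with no natural breaks yields chunks starting
    # at offsets 0, 800, and 1600, each consecutive pair sharing 200 chars.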
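
A usage sketch for the parser (the file path below is hypothetical):

parser = DocumentParser()
doc = parser.parse_file("notes/week8.md")  # hypothetical path
print(f"{doc['filename']}: {doc['num_chunks']} chunks, {doc['total_chars']} chars")
for i, chunk in enumerate(doc['chunks'][:3]):
    print(i, len(chunk), chunk[:60].replace('\n', ' '))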

models/embeddings.py
@@ -0,0 +1,84 @@
"""
Embeddings utility using sentence-transformers
"""
from sentence_transformers import SentenceTransformer
import numpy as np
from typing import List, Union
import logging

logger = logging.getLogger(__name__)


class EmbeddingModel:
    """Wrapper for sentence transformer embeddings"""

    def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
        """
        Initialize embedding model

        Args:
            model_name: HuggingFace model name for embeddings
        """
        self.model_name = model_name
        logger.info(f"Loading embedding model: {model_name}")
        self.model = SentenceTransformer(model_name)
        self.dimension = self.model.get_sentence_embedding_dimension()
        logger.info(f"Embedding dimension: {self.dimension}")

    def embed(self, texts: Union[str, List[str]]) -> np.ndarray:
        """
        Generate embeddings for text(s)

        Args:
            texts: Single text or list of texts

        Returns:
            Numpy array of embeddings
        """
        if isinstance(texts, str):
            texts = [texts]

        embeddings = self.model.encode(texts, show_progress_bar=False)
        return embeddings

    def embed_query(self, query: str) -> List[float]:
        """
        Embed a single query - returns a list for ChromaDB compatibility

        Args:
            query: Query text

        Returns:
            List of floats representing the embedding
        """
        embedding = self.model.encode([query], show_progress_bar=False)[0]
        return embedding.tolist()

    def embed_documents(self, documents: List[str]) -> List[List[float]]:
        """
        Embed multiple documents - returns a list of lists for ChromaDB

        Args:
            documents: List of document texts

        Returns:
            List of embeddings (each as a list of floats)
        """
        embeddings = self.model.encode(documents, show_progress_bar=False)
        return embeddings.tolist()

    def similarity(self, text1: str, text2: str) -> float:
        """
        Calculate cosine similarity between two texts

        Args:
            text1: First text
            text2: Second text

        Returns:
            Cosine similarity in [-1, 1]; for natural-language pairs from
            this model family the score is typically between 0 and 1
        """
        emb1, emb2 = self.model.encode([text1, text2])

        # Cosine similarity
        similarity = np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))
        return float(similarity)
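
A sketch of how these helpers could feed a ChromaDB collection, matching the list-of-floats contract noted in the docstrings (assumes chromadb is installed; the collection name and documents are hypothetical):

import chromadb

embedder = EmbeddingModel()
client = chromadb.Client()  # in-memory; use chromadb.PersistentClient(path=...) to persist
collection = client.create_collection("docs")  # hypothetical collection name

docs = ["Chunking splits long text.", "Ollama serves local LLMs."]
collection.add(
    ids=[str(i) for i in range(len(docs))],
    documents=docs,
    embeddings=embedder.embed_documents(docs),
)

results = collection.query(
    query_embeddings=[embedder.embed_query("local models")],
    n_results=1,
)
print(results["documents"])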

models/ollama_client.py
@@ -0,0 +1,107 @@
"""
Ollama Client - Wrapper for local Ollama API
"""
import requests
from typing import List, Dict, Optional
import logging

logger = logging.getLogger(__name__)


class OllamaClient:
    """Client for interacting with local Ollama models"""

    def __init__(self, base_url: str = "http://localhost:11434", model: str = "llama3.2"):
        self.base_url = base_url
        self.model = model
        self.api_url = f"{base_url}/api"

    def generate(self, prompt: str, system: Optional[str] = None,
                 temperature: float = 0.7, max_tokens: int = 2048) -> str:
        """Generate text from a prompt"""
        try:
            payload = {
                "model": self.model,
                "prompt": prompt,
                "stream": False,
                "options": {
                    "temperature": temperature,
                    "num_predict": max_tokens
                }
            }

            if system:
                payload["system"] = system

            response = requests.post(
                f"{self.api_url}/generate",
                json=payload,
                timeout=1200
            )
            response.raise_for_status()

            result = response.json()
            return result.get("response", "").strip()

        except requests.exceptions.RequestException as e:
            logger.error(f"Ollama API error: {e}")
            return f"Error: Unable to connect to Ollama. Is it running? ({str(e)})"

    def chat(self, messages: List[Dict[str, str]],
             temperature: float = 0.7, max_tokens: int = 2048) -> str:
        """Chat completion with message history"""
        try:
            payload = {
                "model": self.model,
                "messages": messages,
                "stream": False,
                "options": {
                    "temperature": temperature,
                    "num_predict": max_tokens
                }
            }

            response = requests.post(
                f"{self.api_url}/chat",
                json=payload,
                timeout=1200
            )
            response.raise_for_status()

            result = response.json()
            return result.get("message", {}).get("content", "").strip()

        except requests.exceptions.RequestException as e:
            logger.error(f"Ollama API error: {e}")
            return f"Error: Unable to connect to Ollama. Is it running? ({str(e)})"

    def check_connection(self) -> bool:
        """Check if Ollama is running and the configured model is available"""
        try:
            response = requests.get(f"{self.base_url}/api/tags", timeout=5)
            response.raise_for_status()

            models = response.json().get("models", [])
            model_names = [m["name"] for m in models]

            # Tags returned by Ollama usually carry a suffix such as
            # ":latest", so also match on the bare model name
            if self.model not in model_names and \
                    f"{self.model}:latest" not in model_names:
                logger.warning(f"Model {self.model} not found. Available: {model_names}")
                return False

            return True

        except requests.exceptions.RequestException as e:
            logger.error(f"Cannot connect to Ollama: {e}")
            return False

    def list_models(self) -> List[str]:
        """List available Ollama models"""
        try:
            response = requests.get(f"{self.base_url}/api/tags", timeout=5)
            response.raise_for_status()

            models = response.json().get("models", [])
            return [m["name"] for m in models]

        except requests.exceptions.RequestException:
            return []
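
A minimal end-to-end sketch for the client (assumes an Ollama server on localhost with the llama3.2 model already pulled):

client = OllamaClient(model="llama3.2")
if client.check_connection():
    print(client.generate(
        "Summarize retrieval-augmented generation in one sentence.",
        system="You are a concise assistant.",
        temperature=0.2,
    ))
else:
    print("Available models:", client.list_models())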