sach91 bootcamp week8 exercise

This commit is contained in:
sach91
2025-10-30 15:42:04 +05:30
parent 3fa7a3dad5
commit ef48ed539d
20 changed files with 3124 additions and 0 deletions


@@ -0,0 +1,12 @@
"""
models
"""
from .document_parser import DocumentParser
from .embeddings import EmbeddingModel
from .ollama_client import OllamaClient
__all__ = [
    'DocumentParser',
    'EmbeddingModel',
    'OllamaClient'
]
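
For orientation, here is a minimal sketch of how the three exports above might be wired together. It assumes the package directory is named `models` (as the module docstring suggests) and that the optional dependencies used by each component are installed.

# Hypothetical wiring of the three exported components; the `models` package
# name is an assumption taken from the module docstring.
from models import DocumentParser, EmbeddingModel, OllamaClient

parser = DocumentParser(chunk_size=1000, chunk_overlap=200)
embedder = EmbeddingModel()
llm = OllamaClient(model="llama3.2")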


@@ -0,0 +1,218 @@
"""
Document Parser - Extract text from various document formats
"""
import os
from typing import List, Dict, Optional
import logging
from pathlib import Path
logger = logging.getLogger(__name__)
class DocumentParser:
"""Parse various document formats into text chunks"""
SUPPORTED_FORMATS = ['.pdf', '.docx', '.txt', '.md', '.html', '.py']
def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
"""
Initialize document parser
Args:
chunk_size: Maximum characters per chunk
chunk_overlap: Overlap between chunks for context preservation
"""
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
def parse_file(self, file_path: str) -> Dict:
"""
Parse a file and return structured document data
Args:
file_path: Path to the file
Returns:
Dictionary with document metadata and chunks
"""
path = Path(file_path)
if not path.exists():
raise FileNotFoundError(f"File not found: {file_path}")
extension = path.suffix.lower()
if extension not in self.SUPPORTED_FORMATS:
raise ValueError(f"Unsupported format: {extension}")
# Extract text based on file type
if extension == '.pdf':
text = self._parse_pdf(file_path)
elif extension == '.docx':
text = self._parse_docx(file_path)
elif extension == '.txt' or extension == '.py':
text = self._parse_txt(file_path)
elif extension == '.md':
text = self._parse_markdown(file_path)
elif extension == '.html':
text = self._parse_html(file_path)
else:
text = ""
# Create chunks
chunks = self._create_chunks(text)
return {
'filename': path.name,
'filepath': str(path.absolute()),
'extension': extension,
'text': text,
'chunks': chunks,
'num_chunks': len(chunks),
'total_chars': len(text)
}
def _parse_pdf(self, file_path: str) -> str:
"""Extract text from PDF"""
try:
from pypdf import PdfReader
reader = PdfReader(file_path)
text = ""
for page in reader.pages:
text += page.extract_text() + "\n\n"
return text.strip()
except ImportError:
logger.error("pypdf not installed. Install with: pip install pypdf")
return ""
except Exception as e:
logger.error(f"Error parsing PDF: {e}")
return ""
def _parse_docx(self, file_path: str) -> str:
"""Extract text from DOCX"""
try:
from docx import Document
doc = Document(file_path)
text = "\n\n".join([para.text for para in doc.paragraphs if para.text.strip()])
return text.strip()
except ImportError:
logger.error("python-docx not installed. Install with: pip install python-docx")
return ""
except Exception as e:
logger.error(f"Error parsing DOCX: {e}")
return ""
def _parse_txt(self, file_path: str) -> str:
"""Extract text from TXT"""
try:
with open(file_path, 'r', encoding='utf-8') as f:
return f.read().strip()
except Exception as e:
logger.error(f"Error parsing TXT: {e}")
return ""
def _parse_markdown(self, file_path: str) -> str:
"""Extract text from Markdown"""
try:
import markdown
from bs4 import BeautifulSoup
with open(file_path, 'r', encoding='utf-8') as f:
md_text = f.read()
# Convert markdown to HTML then extract text
html = markdown.markdown(md_text)
soup = BeautifulSoup(html, 'html.parser')
text = soup.get_text()
return text.strip()
except ImportError:
# Fallback: just read as plain text
return self._parse_txt(file_path)
except Exception as e:
logger.error(f"Error parsing Markdown: {e}")
return ""
def _parse_html(self, file_path: str) -> str:
"""Extract text from HTML"""
try:
from bs4 import BeautifulSoup
with open(file_path, 'r', encoding='utf-8') as f:
html = f.read()
soup = BeautifulSoup(html, 'html.parser')
# Remove script and style elements
for script in soup(["script", "style"]):
script.decompose()
text = soup.get_text()
# Clean up whitespace
lines = (line.strip() for line in text.splitlines())
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
text = '\n'.join(chunk for chunk in chunks if chunk)
return text.strip()
except ImportError:
logger.error("beautifulsoup4 not installed. Install with: pip install beautifulsoup4")
return ""
except Exception as e:
logger.error(f"Error parsing HTML: {e}")
return ""
def _create_chunks(self, text: str) -> List[str]:
"""
Split text into overlapping chunks
Args:
text: Full text to chunk
Returns:
List of text chunks
"""
if not text:
return []
chunks = []
start = 0
text_length = len(text)
while start < text_length:
logger.info(f'Processing chunk at {start}, for len {text_length}.')
end = start + self.chunk_size
# If this isn't the last chunk, try to break at a sentence or paragraph
if end < text_length:
# Look for paragraph break first
break_pos = text.rfind('\n\n', start, end)
if break_pos == -1:
# Look for sentence break
break_pos = text.rfind('. ', start, end)
if break_pos == -1:
# Look for any space
break_pos = text.rfind(' ', start, end)
if break_pos != -1 and break_pos > start and break_pos > end - self.chunk_overlap:
end = break_pos + 1
chunk = text[start:end].strip()
if chunk:
chunks.append(chunk)
# Move start position with overlap
start = end - self.chunk_overlap
if start < 0:
start = 0
return chunks
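
A short usage sketch of the parser above: it only calls methods defined in this module, but the file name is a placeholder and the import path assumes the `models` package layout from the `__init__.py`.

# Sketch only: "report.pdf" is a placeholder path; the import path assumes
# this module lives in a package named `models`.
from models.document_parser import DocumentParser

parser = DocumentParser(chunk_size=1000, chunk_overlap=200)
doc = parser.parse_file("report.pdf")
print(doc['filename'], doc['num_chunks'], doc['total_chars'])
for chunk in doc['chunks'][:3]:
    print(chunk[:80], "...")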


@@ -0,0 +1,84 @@
"""
Embeddings utility using sentence-transformers
"""
from sentence_transformers import SentenceTransformer
import numpy as np
from typing import List, Union
import logging
logger = logging.getLogger(__name__)
class EmbeddingModel:
"""Wrapper for sentence transformer embeddings"""
def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
"""
Initialize embedding model
Args:
model_name: HuggingFace model name for embeddings
"""
self.model_name = model_name
logger.info(f"Loading embedding model: {model_name}")
self.model = SentenceTransformer(model_name)
self.dimension = self.model.get_sentence_embedding_dimension()
logger.info(f"Embedding dimension: {self.dimension}")
def embed(self, texts: Union[str, List[str]]) -> np.ndarray:
"""
Generate embeddings for text(s)
Args:
texts: Single text or list of texts
Returns:
Numpy array of embeddings
"""
if isinstance(texts, str):
texts = [texts]
embeddings = self.model.encode(texts, show_progress_bar=False)
return embeddings
def embed_query(self, query: str) -> List[float]:
"""
Embed a single query - returns as list for ChromaDB compatibility
Args:
query: Query text
Returns:
List of floats representing the embedding
"""
embedding = self.model.encode([query], show_progress_bar=False)[0]
return embedding.tolist()
def embed_documents(self, documents: List[str]) -> List[List[float]]:
"""
Embed multiple documents - returns as list of lists for ChromaDB
Args:
documents: List of document texts
Returns:
List of embeddings (each as list of floats)
"""
embeddings = self.model.encode(documents, show_progress_bar=False)
return embeddings.tolist()
def similarity(self, text1: str, text2: str) -> float:
"""
Calculate cosine similarity between two texts
Args:
text1: First text
text2: Second text
Returns:
Similarity score between 0 and 1
"""
emb1, emb2 = self.model.encode([text1, text2])
# Cosine similarity
similarity = np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))
return float(similarity)
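
A small sketch of how this wrapper might be exercised: the import path assumes the `models` package layout, the sentences are placeholders, and the 384-dimension figure applies to the default all-MiniLM-L6-v2 model.

# Sketch only: import path assumes a `models` package; texts are illustrative.
from models.embeddings import EmbeddingModel

embedder = EmbeddingModel()  # downloads all-MiniLM-L6-v2 on first use
docs = ["Ollama runs LLMs locally.", "ChromaDB stores vector embeddings."]
doc_vectors = embedder.embed_documents(docs)   # list of lists, ChromaDB-ready
query_vector = embedder.embed_query("local language models")
print(len(doc_vectors), len(query_vector))     # 2 documents, 384 dims for MiniLM-L6-v2
print(embedder.similarity(docs[0], docs[1]))   # cosine similarity in [-1, 1]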


@@ -0,0 +1,107 @@
"""
Ollama Client - Wrapper for local Ollama API
"""
import requests
import json
from typing import List, Dict, Optional
import logging
logger = logging.getLogger(__name__)
class OllamaClient:
"""Client for interacting with local Ollama models"""
def __init__(self, base_url: str = "http://localhost:11434", model: str = "llama3.2"):
self.base_url = base_url
self.model = model
self.api_url = f"{base_url}/api"
def generate(self, prompt: str, system: Optional[str] = None,
temperature: float = 0.7, max_tokens: int = 2048) -> str:
"""Generate text from a prompt"""
try:
payload = {
"model": self.model,
"prompt": prompt,
"stream": False,
"options": {
"temperature": temperature,
"num_predict": max_tokens
}
}
if system:
payload["system"] = system
response = requests.post(
f"{self.api_url}/generate",
json=payload,
timeout=1200
)
response.raise_for_status()
result = response.json()
return result.get("response", "").strip()
except requests.exceptions.RequestException as e:
logger.error(f"Ollama API error: {e}")
return f"Error: Unable to connect to Ollama. Is it running? ({str(e)})"
def chat(self, messages: List[Dict[str, str]],
temperature: float = 0.7, max_tokens: int = 2048) -> str:
"""Chat completion with message history"""
try:
payload = {
"model": self.model,
"messages": messages,
"stream": False,
"options": {
"temperature": temperature,
"num_predict": max_tokens
}
}
response = requests.post(
f"{self.api_url}/chat",
json=payload,
timeout=1200
)
response.raise_for_status()
result = response.json()
return result.get("message", {}).get("content", "").strip()
except requests.exceptions.RequestException as e:
logger.error(f"Ollama API error: {e}")
return f"Error: Unable to connect to Ollama. Is it running? ({str(e)})"
def check_connection(self) -> bool:
"""Check if Ollama is running and model is available"""
try:
response = requests.get(f"{self.base_url}/api/tags", timeout=5)
response.raise_for_status()
models = response.json().get("models", [])
model_names = [m["name"] for m in models]
if self.model not in model_names:
logger.warning(f"Model {self.model} not found. Available: {model_names}")
return False
return True
except requests.exceptions.RequestException as e:
logger.error(f"Cannot connect to Ollama: {e}")
return False
def list_models(self) -> List[str]:
"""List available Ollama models"""
try:
response = requests.get(f"{self.base_url}/api/tags", timeout=5)
response.raise_for_status()
models = response.json().get("models", [])
return [m["name"] for m in models]
except requests.exceptions.RequestException:
return []
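
To tie the three modules together, a hedged end-to-end sketch of a minimal retrieval-augmented flow. It assumes the `models` package layout, a local Ollama server with `llama3.2` pulled, a placeholder document "notes.txt", and an in-memory numpy ranking standing in for a real vector store.

# Sketch only: assumes the `models` package layout, a running Ollama server,
# and "notes.txt" as a placeholder document; ranking is done in memory with
# numpy rather than a vector database such as ChromaDB.
import numpy as np
from models import DocumentParser, EmbeddingModel, OllamaClient

llm = OllamaClient(model="llama3.2")
if not llm.check_connection():
    raise SystemExit("Ollama is not running or the model is not pulled")

doc = DocumentParser().parse_file("notes.txt")
embedder = EmbeddingModel()
chunk_vectors = np.array(embedder.embed_documents(doc['chunks']))

question = "What is this document about?"
query = np.array(embedder.embed_query(question))
scores = chunk_vectors @ query  # rough relevance ranking (unnormalized dot product)
context = "\n\n".join(doc['chunks'][i] for i in scores.argsort()[::-1][:3])

answer = llm.generate(
    prompt=f"Context:\n{context}\n\nQuestion: {question}",
    system="Answer using only the provided context.",
    temperature=0.2,
)
print(answer)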