sach91 bootcamp week8 exercise
This commit is contained in:
@@ -0,0 +1,218 @@
|
||||
"""
|
||||
Document Parser - Extract text from various document formats
|
||||
"""
|
||||
import os
|
||||
from typing import List, Dict, Optional
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)


class DocumentParser:
    """Parse supported document formats into text plus overlapping chunks.

    Third-party parsers (pypdf, python-docx, markdown, beautifulsoup4) are
    imported lazily inside each ``_parse_*`` method, so the class works even
    when only some of them are installed; a missing parser logs an error and
    yields empty text rather than raising.
    """

    # Extensions parse_file() accepts; .py is read as plain text.
    SUPPORTED_FORMATS = ['.pdf', '.docx', '.txt', '.md', '.html', '.py']

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """
        Initialize document parser.

        Args:
            chunk_size: Maximum characters per chunk (must be > 0)
            chunk_overlap: Overlap between chunks for context preservation
                (must satisfy 0 <= chunk_overlap < chunk_size)

        Raises:
            ValueError: If the chunking parameters are inconsistent. An
                overlap >= chunk_size would prevent the chunker from making
                forward progress (previously an infinite loop).
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if not 0 <= chunk_overlap < chunk_size:
            raise ValueError(
                f"chunk_overlap must be in [0, chunk_size), got {chunk_overlap}"
            )
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def parse_file(self, file_path: str) -> Dict:
        """
        Parse a file and return structured document data.

        Args:
            file_path: Path to the file

        Returns:
            Dictionary with document metadata, extracted text and chunks

        Raises:
            FileNotFoundError: If file_path does not exist
            ValueError: If the file extension is not in SUPPORTED_FORMATS
        """
        path = Path(file_path)

        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        extension = path.suffix.lower()

        if extension not in self.SUPPORTED_FORMATS:
            raise ValueError(f"Unsupported format: {extension}")

        # Dispatch on extension; .txt and .py are both plain text.
        parsers = {
            '.pdf': self._parse_pdf,
            '.docx': self._parse_docx,
            '.txt': self._parse_txt,
            '.py': self._parse_txt,
            '.md': self._parse_markdown,
            '.html': self._parse_html,
        }
        text = parsers[extension](file_path)

        chunks = self._create_chunks(text)

        return {
            'filename': path.name,
            'filepath': str(path.absolute()),
            'extension': extension,
            'text': text,
            'chunks': chunks,
            'num_chunks': len(chunks),
            'total_chars': len(text)
        }

    def _parse_pdf(self, file_path: str) -> str:
        """Extract text from a PDF; returns "" if pypdf is missing or parsing fails."""
        try:
            from pypdf import PdfReader

            reader = PdfReader(file_path)
            # Join pages with blank lines so _create_chunks can prefer
            # paragraph breaks at page boundaries. `or ""` guards against a
            # page with no extractable text.
            text = "\n\n".join(page.extract_text() or "" for page in reader.pages)
            return text.strip()

        except ImportError:
            logger.error("pypdf not installed. Install with: pip install pypdf")
            return ""
        except Exception as e:
            logger.error("Error parsing PDF: %s", e)
            return ""

    def _parse_docx(self, file_path: str) -> str:
        """Extract text from a DOCX; returns "" if python-docx is missing or parsing fails."""
        try:
            from docx import Document

            doc = Document(file_path)
            # Skip empty paragraphs; separate the rest with blank lines.
            text = "\n\n".join(
                para.text for para in doc.paragraphs if para.text.strip()
            )
            return text.strip()

        except ImportError:
            logger.error("python-docx not installed. Install with: pip install python-docx")
            return ""
        except Exception as e:
            logger.error("Error parsing DOCX: %s", e)
            return ""

    def _parse_txt(self, file_path: str) -> str:
        """Read a plain-text file as UTF-8; returns "" on any error."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read().strip()
        except Exception as e:
            logger.error("Error parsing TXT: %s", e)
            return ""

    def _parse_markdown(self, file_path: str) -> str:
        """Extract plain text from Markdown; falls back to raw text if
        markdown/bs4 are not installed."""
        try:
            import markdown
            from bs4 import BeautifulSoup

            with open(file_path, 'r', encoding='utf-8') as f:
                md_text = f.read()

            # Convert markdown to HTML then extract text
            html = markdown.markdown(md_text)
            soup = BeautifulSoup(html, 'html.parser')
            return soup.get_text().strip()

        except ImportError:
            # Fallback: just read as plain text
            return self._parse_txt(file_path)
        except Exception as e:
            logger.error("Error parsing Markdown: %s", e)
            return ""

    def _parse_html(self, file_path: str) -> str:
        """Extract visible text from HTML; returns "" if bs4 is missing or parsing fails."""
        try:
            from bs4 import BeautifulSoup

            with open(file_path, 'r', encoding='utf-8') as f:
                html = f.read()

            soup = BeautifulSoup(html, 'html.parser')

            # Remove script and style elements
            for tag in soup(["script", "style"]):
                tag.decompose()

            text = soup.get_text()

            # Clean up whitespace (the standard BeautifulSoup recipe): split
            # on runs of two spaces so phrases — not individual words — end
            # up on their own lines, preserving '. ' sentence boundaries that
            # _create_chunks relies on.
            lines = (line.strip() for line in text.splitlines())
            phrases = (phrase.strip() for line in lines for phrase in line.split("  "))
            return '\n'.join(phrase for phrase in phrases if phrase)

        except ImportError:
            logger.error("beautifulsoup4 not installed. Install with: pip install beautifulsoup4")
            return ""
        except Exception as e:
            logger.error("Error parsing HTML: %s", e)
            return ""

    def _create_chunks(self, text: str) -> List[str]:
        """
        Split text into overlapping chunks.

        Chunks are at most chunk_size characters and, where possible, end at
        a paragraph, sentence, or word boundary that lies within
        chunk_overlap characters of the nominal cut point.

        Args:
            text: Full text to chunk

        Returns:
            List of non-empty, stripped text chunks
        """
        if not text:
            return []

        chunks: List[str] = []
        start = 0
        text_length = len(text)

        while start < text_length:
            logger.debug('Processing chunk at %d, for len %d.', start, text_length)

            end = start + self.chunk_size

            if end >= text_length:
                # Final chunk: take the remainder and stop. (Previously the
                # loop still stepped back by chunk_overlap here, which could
                # re-emit the tail of the text as a duplicate chunk, or loop
                # forever when chunk_overlap >= chunk_size.)
                tail = text[start:].strip()
                if tail:
                    chunks.append(tail)
                break

            # Prefer breaking at a paragraph, then a sentence, then any
            # word boundary.
            break_pos = text.rfind('\n\n', start, end)
            if break_pos == -1:
                break_pos = text.rfind('. ', start, end)
            if break_pos == -1:
                break_pos = text.rfind(' ', start, end)

            # Only accept a boundary that falls inside the overlap window
            # near the chunk end, so chunks stay close to chunk_size.
            if break_pos != -1 and break_pos > start and break_pos > end - self.chunk_overlap:
                end = break_pos + 1

            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)

            # Step forward with overlap; max() guarantees forward progress
            # so the loop always terminates.
            start = max(end - self.chunk_overlap, start + 1)

        return chunks
|
||||
Reference in New Issue
Block a user