"""
Document Parser - Extract text from various document formats
"""
import os
from typing import List, Dict, Optional
import logging
from pathlib import Path

# Module-level logger, named after this module; the parser methods below use it
# to report missing optional libraries and extraction failures.
logger = logging.getLogger(__name__)

class DocumentParser:
    """Parse various document formats into text chunks

    Text extraction is best-effort: when an optional third-party library is
    missing or a file cannot be read/decoded, the problem is logged and the
    document text is treated as empty instead of raising.
    """

    SUPPORTED_FORMATS = ['.pdf', '.docx', '.txt', '.md', '.html', '.py']

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """
        Initialize document parser

        Args:
            chunk_size: Maximum characters per chunk (must be positive)
            chunk_overlap: Overlap between chunks for context preservation
                (must satisfy 0 <= chunk_overlap < chunk_size)

        Raises:
            ValueError: If the chunk settings could never make progress
                through a text (see the constraints above)
        """
        self._validate_chunk_settings(chunk_size, chunk_overlap)
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    @staticmethod
    def _validate_chunk_settings(chunk_size: int, chunk_overlap: int) -> None:
        """Reject chunk settings with which chunking could not advance."""
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if chunk_overlap < 0 or chunk_overlap >= chunk_size:
            raise ValueError(
                "chunk_overlap must be >= 0 and smaller than chunk_size, "
                f"got chunk_overlap={chunk_overlap}, chunk_size={chunk_size}"
            )

    @staticmethod
    def _read_text(file_path: str) -> str:
        """Read a file as UTF-8 text, dropping a leading byte-order mark if present."""
        # 'utf-8-sig' decodes plain UTF-8 unchanged but removes a leading BOM,
        # which str.strip() would otherwise leave in the extracted text.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            return f.read()

    def parse_file(self, file_path: str) -> Dict:
        """
        Parse a file and return structured document data

        Args:
            file_path: Path to the file

        Returns:
            Dictionary with the keys 'filename', 'filepath' (absolute),
            'extension' (lower-cased suffix), 'text', 'chunks',
            'num_chunks' and 'total_chars'

        Raises:
            FileNotFoundError: If file_path does not exist
            ValueError: If the file extension is not in SUPPORTED_FORMATS
        """
        path = Path(file_path)

        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        extension = path.suffix.lower()

        if extension not in self.SUPPORTED_FORMATS:
            raise ValueError(f"Unsupported format: {extension}")

        # Extract text based on file type
        extractors = {
            '.pdf': self._parse_pdf,
            '.docx': self._parse_docx,
            '.txt': self._parse_txt,
            '.py': self._parse_txt,
            '.md': self._parse_markdown,
            '.html': self._parse_html,
        }
        extractor = extractors.get(extension)
        # A format listed in SUPPORTED_FORMATS without an extractor (e.g. added
        # by a subclass) yields empty text, as before.
        text = extractor(file_path) if extractor is not None else ""

        # Create chunks
        chunks = self._create_chunks(text)

        return {
            'filename': path.name,
            'filepath': str(path.absolute()),
            'extension': extension,
            'text': text,
            'chunks': chunks,
            'num_chunks': len(chunks),
            'total_chars': len(text)
        }

    def _parse_pdf(self, file_path: str) -> str:
        """Extract text from PDF; returns "" (and logs) on any failure"""
        # Import separately so that only a genuinely missing package is
        # reported as "not installed".
        try:
            from pypdf import PdfReader
        except ImportError:
            logger.error("pypdf not installed. Install with: pip install pypdf")
            return ""

        try:
            reader = PdfReader(file_path)
            # `or ""` guards against a page yielding no text object at all.
            pages = [(page.extract_text() or "") for page in reader.pages]
        except Exception as e:
            logger.error("Error parsing PDF: %s", e)
            return ""

        # Pages are separated by a blank line.
        return "\n\n".join(pages).strip()

    def _parse_docx(self, file_path: str) -> str:
        """Extract text from DOCX; returns "" (and logs) on any failure"""
        try:
            from docx import Document
        except ImportError:
            logger.error("python-docx not installed. Install with: pip install python-docx")
            return ""

        try:
            doc = Document(file_path)
            # Skip paragraphs that contain only whitespace.
            paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
        except Exception as e:
            logger.error("Error parsing DOCX: %s", e)
            return ""

        return "\n\n".join(paragraphs).strip()

    def _parse_txt(self, file_path: str) -> str:
        """Extract text from TXT; returns "" (and logs) on any failure"""
        try:
            return self._read_text(file_path).strip()
        except Exception as e:
            logger.error("Error parsing TXT: %s", e)
            return ""

    def _parse_markdown(self, file_path: str) -> str:
        """Extract text from Markdown

        Falls back to reading the file as plain text when `markdown` or
        `bs4` is not installed; returns "" (and logs) on other failures.
        """
        try:
            import markdown
            from bs4 import BeautifulSoup
        except ImportError:
            # Fallback: just read as plain text
            return self._parse_txt(file_path)

        try:
            md_text = self._read_text(file_path)

            # Convert markdown to HTML then extract text
            html = markdown.markdown(md_text)
            soup = BeautifulSoup(html, 'html.parser')
            return soup.get_text().strip()
        except Exception as e:
            logger.error("Error parsing Markdown: %s", e)
            return ""

    def _parse_html(self, file_path: str) -> str:
        """Extract text from HTML; returns "" (and logs) on any failure"""
        try:
            from bs4 import BeautifulSoup
        except ImportError:
            logger.error("beautifulsoup4 not installed. Install with: pip install beautifulsoup4")
            return ""

        try:
            soup = BeautifulSoup(self._read_text(file_path), 'html.parser')

            # Remove script and style elements
            for element in soup(["script", "style"]):
                element.decompose()

            text = soup.get_text()

            # Clean up whitespace: trim each line, split on double spaces and
            # keep one non-empty phrase per output line.
            lines = (line.strip() for line in text.splitlines())
            phrases = (phrase.strip() for line in lines for phrase in line.split("  "))
            return '\n'.join(phrase for phrase in phrases if phrase).strip()
        except Exception as e:
            logger.error("Error parsing HTML: %s", e)
            return ""

    def _create_chunks(self, text: str) -> List[str]:
        """
        Split text into overlapping chunks

        Each chunk spans at most `chunk_size` characters. A chunk that does
        not reach the end of the text is shortened to end at a paragraph
        break, else a sentence break, else a space, provided that break lies
        within the last `chunk_overlap` characters of the chunk. The next
        chunk starts `chunk_overlap` characters before the previous end.
        Chunks are whitespace-stripped and empty ones are dropped.

        Args:
            text: Full text to chunk

        Returns:
            List of text chunks

        Raises:
            ValueError: If `chunk_size` / `chunk_overlap` were changed to
                values with which chunking could not advance
        """
        if not text:
            return []

        size = self.chunk_size
        overlap = self.chunk_overlap
        # Attributes may have been reassigned after construction.
        self._validate_chunk_settings(size, overlap)

        chunks: List[str] = []
        start = 0
        text_length = len(text)

        while start < text_length:
            logger.info('Processing chunk at %s, for len %s.', start, text_length)

            end = start + size

            if end >= text_length:
                # This chunk reaches the end of the text. Stop here: stepping
                # back by the overlap again would only re-emit a tail that is
                # already fully contained in this chunk.
                tail = text[start:].strip()
                if tail:
                    chunks.append(tail)
                break

            # A break is only usable if it is near the end of the chunk
            # (after `end - overlap`) and late enough that the next start
            # (`break + 1 - overlap`) lies strictly after the current start.
            earliest_break = max(end - overlap + 1, start + overlap, start + 1)

            # Prefer a paragraph break, then a sentence break, then any space.
            # Every separator is searched inside the usable window, so an
            # unusable early paragraph break no longer hides a later space.
            for separator in ('\n\n', '. ', ' '):
                break_pos = text.rfind(separator, earliest_break, end)
                if break_pos != -1:
                    # Keep the first separator character (e.g. the period).
                    end = break_pos + 1
                    break

            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)

            # Move start position with overlap; the constraints above
            # guarantee this is strictly greater than the current start.
            start = end - overlap

        return chunks