#!/usr/bin/env python3
"""
Simple All-in-One RAG System for Personal Data

Loads text and PDF files from the ``Personal`` directory, creates a sample CV
when none exists, indexes everything in a persistent Chroma vector store and
provides an interactive search interface.
"""

import hashlib
import os
import subprocess
import sys
from collections import Counter
from pathlib import Path

# Packages needed by the third-party imports below.
REQUIRED_PACKAGES = (
    "langchain",
    "langchain-community",
    "langchain-huggingface",
    "chromadb",
    "pypdf",
    "sentence-transformers",
)

# Install required packages if not already installed
try:
    from langchain_community.vectorstores import Chroma
    from langchain.docstore.document import Document
    from langchain_huggingface import HuggingFaceEmbeddings
    from langchain_community.document_loaders import PyPDFLoader
    from langchain.text_splitter import CharacterTextSplitter
except ImportError:
    print("Installing required packages...")
    # Run pip through the current interpreter so packages land in the
    # environment this script is actually running in. The exit status is
    # deliberately not checked: a failed install surfaces as ImportError from
    # the retry below.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", *REQUIRED_PACKAGES],
        check=False,
    )
    from langchain_community.vectorstores import Chroma
    from langchain.docstore.document import Document
    from langchain_huggingface import HuggingFaceEmbeddings
    from langchain_community.document_loaders import PyPDFLoader
    from langchain.text_splitter import CharacterTextSplitter


# Directory scanned for personal documents.
PERSONAL_DIR = Path("Personal")
# File name of the generated sample CV inside PERSONAL_DIR.
SAMPLE_CV_NAME = "CV_Muawiya.txt"
# Directory where the Chroma collection is persisted.
DB_PATH = "chroma_failures_ds"
# Sentence-transformers model used to embed chunks and queries.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Supported file extensions
TEXT_EXTENSIONS = frozenset({".txt", ".md", ".log", ".csv", ".json"})
PDF_EXTENSIONS = frozenset({".pdf"})

# Documents/pages whose stripped text is not longer than this are skipped.
MIN_CONTENT_CHARS = 10
# Results must score above this to be listed as relevant.
RELEVANCE_THRESHOLD = 0.3
# Limits for the content preview printed per result.
MAX_PREVIEW_CHARS = 500
MAX_PREVIEW_LINES = 12
# Number of low-relevance results listed by name.
MAX_LOW_RELEVANCE_SHOWN = 2

SAMPLE_CV = """
CURRICULUM VITAE - MUAWIYA

PERSONAL INFORMATION
Name: Muawiya
Email: muawiya@example.com
Phone: +1234567890
Location: [Your Location]

PROFESSIONAL SUMMARY
Enthusiastic developer and student with a passion for technology and programming.
Currently learning Django framework and web development. Active participant in the
LLM engineering community and working on personal projects.

EDUCATION
- Currently pursuing studies in Computer Science/Programming
- Learning Django web framework
- Studying web development and programming concepts

TECHNICAL SKILLS
- Python Programming
- Django Web Framework
- Virtual Environment Management
- Git and GitHub
- Database Management with Django
- Basic Web Development

CURRENT PROJECTS
- Learning Django through practical exercises
- Building web applications
- Working on LLM engineering projects
- Contributing to community projects
- Personal data management and RAG systems

LEARNING GOALS
- Master Django framework
- Build full-stack web applications
- Learn machine learning and AI
- Contribute to open source projects
- Develop expertise in modern web technologies

INTERESTS
- Web Development
- Artificial Intelligence
- Machine Learning
- Open Source Software
- Technology and Programming

LANGUAGES
- English
- [Add other languages if applicable]

CERTIFICATIONS
- [Add any relevant certifications]

REFERENCES
Available upon request
"""


def create_sample_cv() -> Path:
    """Create a sample CV text file.

    Creates the ``Personal`` directory when it is missing and (over)writes
    the sample CV inside it.

    Returns:
        Path of the written CV file.
    """
    # Create Personal directory if it doesn't exist
    PERSONAL_DIR.mkdir(exist_ok=True)

    # Create the sample CV file
    cv_file = PERSONAL_DIR / SAMPLE_CV_NAME
    cv_file.write_text(SAMPLE_CV.strip(), encoding="utf-8")

    print(f"✅ Created sample CV: {cv_file}")
    return cv_file


def _load_text_file(file_path, source):
    """Read one text file and return it as a list of zero or one Document."""
    # errors='ignore' drops undecodable bytes instead of failing the file.
    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
        content = f.read().strip()

    if len(content) <= MIN_CONTENT_CHARS:
        return []

    print(f" ✅ Loaded: {file_path.name} ({len(content)} chars)")
    return [Document(page_content=content, metadata={"source": source, "type": "text"})]


def _load_pdf_file(file_path, source):
    """Load one PDF and return the Documents of its pages that contain text."""
    pages = []
    for page in PyPDFLoader(str(file_path)).load():
        if len(page.page_content.strip()) > MIN_CONTENT_CHARS:
            page.metadata["source"] = source
            page.metadata["type"] = "pdf"
            pages.append(page)

    if pages:
        print(f" ✅ Loaded PDF: {file_path.name} ({len(pages)} pages with content)")
    return pages


def load_documents() -> list:
    """Load all documents from Personal directory.

    Walks the directory recursively and loads every supported text or PDF
    file. Files that cannot be read are reported and skipped, so one bad file
    never aborts the scan.

    Returns:
        List of ``Document`` objects whose metadata carries ``source`` (path
        relative to the Personal directory) and ``type`` (``"text"`` or
        ``"pdf"``). Empty when nothing usable was found.
    """
    documents = []
    input_path = PERSONAL_DIR

    print(f"🔍 Scanning directory: {input_path}")

    if not input_path.is_dir():
        return documents

    # Sorted traversal keeps the document order (and therefore the chunk ids
    # derived from it) stable between runs.
    for file_path in sorted(input_path.rglob("*")):
        if not file_path.is_file():
            continue

        file_ext = file_path.suffix.lower()
        source = str(file_path.relative_to(input_path))

        if file_ext in TEXT_EXTENSIONS:
            try:
                documents.extend(_load_text_file(file_path, source))
            except OSError as e:
                print(f" ❌ Error processing {file_path.name}: {e}")
        elif file_ext in PDF_EXTENSIONS:
            try:
                documents.extend(_load_pdf_file(file_path, source))
            except Exception as e:
                # PDF parsing can fail in many library-specific ways; skipping
                # the file is the intended best-effort behaviour.
                print(f" ⚠️ Skipped PDF: {file_path.name} (error: {e})")

    return documents


def _chunk_ids(chunks):
    """Return one deterministic id per chunk, derived from source and position.

    The id depends only on the chunk's ``source`` metadata and its ordinal
    among the chunks of that source, so the same chunk slot receives the same
    id on every run.
    """
    seen_per_source = Counter()
    ids = []
    for chunk in chunks:
        source = str(chunk.metadata.get("source", ""))
        position = seen_per_source[source]
        seen_per_source[source] += 1
        key = f"{source}\x00{position}".encode("utf-8")
        ids.append(hashlib.sha256(key).hexdigest())
    return ids


def create_rag_system():
    """Create the RAG system with all documents.

    Ensures the sample CV exists, loads every document, splits them into
    chunks, embeds the chunks and stores them in the Chroma collection
    persisted under ``DB_PATH``.

    Returns:
        The populated ``Chroma`` vector store.
    """
    print("🚀 Creating RAG System")
    print("=" * 50)

    # Step 1: Create sample CV if it doesn't exist
    if not (PERSONAL_DIR / SAMPLE_CV_NAME).exists():
        print("📝 Creating sample CV...")
        create_sample_cv()

    # Step 2: Load all documents
    documents = load_documents()
    print(f"\n📊 Loaded {len(documents)} documents")

    if not documents:
        print("❌ No documents found! Creating sample document...")
        sample_content = "This is a sample document for testing the RAG system."
        documents.append(Document(
            page_content=sample_content,
            metadata={"source": "sample.txt", "type": "sample"},
        ))

    # Step 3: Load embedding model
    print("\n🤖 Loading embedding model...")
    embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

    # Step 4: Split documents into chunks
    print("✂️ Splitting documents into chunks...")
    text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = text_splitter.split_documents(documents)
    print(f"📝 Created {len(chunks)} chunks")

    # Step 5: Create vectorstore
    print("🗄️ Creating vector database...")
    # The collection is persisted on disk. Without explicit ids every run
    # would add a fresh copy of every chunk and searches would return
    # duplicates; stable ids make a re-run overwrite the existing entries.
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embedding_model,
        ids=_chunk_ids(chunks),
        persist_directory=DB_PATH,
    )

    print(f"✅ Vectorstore created with {vectorstore._collection.count()} documents")

    return vectorstore


def search_documents(vectorstore, query, k=5):
    """Search documents with similarity scores - get more results for better filtering.

    Args:
        vectorstore: Vector store returned by ``create_rag_system``.
        query: Free-text question.
        k: Maximum number of chunks to retrieve.

    Returns:
        List of ``(document, distance)`` tuples; an empty list when the search
        fails (the error is printed instead of raised).
    """
    try:
        return vectorstore.similarity_search_with_score(query, k=k)
    except Exception as e:
        print(f"❌ Error searching: {e}")
        return []


def _relevance_from_distance(score):
    """Map a Chroma distance to a relevance value in [0, 1] (1 = best match).

    Chroma's default metric is squared L2 distance, not cosine distance.
    NOTE(review): assuming the embedding model returns unit-length vectors,
    squared L2 equals ``2 * (1 - cosine_similarity)``, so ``1 - score / 2`` is
    the cosine similarity. Confirm if the model or metric is changed. The
    value is clamped so dissimilar results are not shown with a negative
    relevance.
    """
    return max(0.0, min(1.0, 1 - (score / 2)))


def _print_result(index, doc, relevance):
    """Print one result: header, source, type and a bounded content preview."""
    print(f"\n📄 Result {index} (Relevance: {relevance:.2f})")
    print(f"📁 Source: {doc.metadata.get('source', 'Unknown')}")
    print(f"📝 Type: {doc.metadata.get('type', 'Unknown')}")
    print("-" * 40)

    # Display content - bounded in both characters and lines
    content = doc.page_content.strip()
    if len(content) > MAX_PREVIEW_CHARS:
        content = content[:MAX_PREVIEW_CHARS] + "..."

    lines = content.split("\n")
    for line in lines[:MAX_PREVIEW_LINES]:
        if line.strip():
            print(f" {line.strip()}")

    if len(lines) > MAX_PREVIEW_LINES:
        print(f" ... ({len(lines) - MAX_PREVIEW_LINES} more lines)")


def _print_search_hints(query):
    """Print suggestions shown when no result passed the relevance filter."""
    print(f"\n❌ No relevant results found for '{query}'")
    print("💡 Your documents contain:")
    print(" • Personal CV information")
    print(" • Django commands and setup instructions")
    print(" • GitHub recovery codes")
    print(" • Various PDF documents")
    print("\n🔍 Try asking about:")
    print(" • Muawiya's personal information")
    print(" • Muawiya's skills and experience")
    print(" • Django project creation")
    print(" • Django commands")
    print(" • Virtual environment setup")


def display_results(results, query):
    """Display search results with relevance filtering.

    Results above ``RELEVANCE_THRESHOLD`` are printed in full, at most one per
    source file (the first, i.e. best-ranked, chunk of each source). The rest
    are only summarised.

    Args:
        results: ``(document, distance)`` tuples from ``search_documents``.
        query: The question that produced ``results``; used in messages only.
    """
    print(f"\n🔍 Results for: '{query}'")
    print("=" * 60)

    if not results:
        print("❌ No results found.")
        return

    # Filter results by relevance (only show relevant ones)
    relevant_results = []
    irrelevant_results = []
    for doc, score in results:
        relevance = _relevance_from_distance(score)
        if relevance > RELEVANCE_THRESHOLD:
            relevant_results.append((doc, score, relevance))
        else:
            irrelevant_results.append((doc, score, relevance))

    # Show relevant results
    if relevant_results:
        print(f"\n✅ Relevant Results ({len(relevant_results)} found):")
        print("-" * 50)

        # Keep only the first result of every source to avoid duplicates
        seen_sources = set()
        unique_results = []
        for doc, score, relevance in relevant_results:
            source = doc.metadata.get("source", "Unknown")
            if source not in seen_sources:
                seen_sources.add(source)
                unique_results.append((doc, score, relevance))

        for i, (doc, _score, relevance) in enumerate(unique_results, 1):
            _print_result(i, doc, relevance)

        # Show summary if there were duplicates
        hidden = len(relevant_results) - len(unique_results)
        if hidden > 0:
            print(f"\n💡 Note: {hidden} duplicate results from same sources were combined.")

    # Show summary of irrelevant results
    if irrelevant_results:
        print(f"\n⚠️ Low Relevance Results ({len(irrelevant_results)} filtered out):")
        print("-" * 50)
        print("These results had low similarity to your query and were filtered out.")
        shown = irrelevant_results[:MAX_LOW_RELEVANCE_SHOWN]
        for i, (doc, _score, relevance) in enumerate(shown, 1):
            source = doc.metadata.get("source", "Unknown")
            print(f" {i}. {source} (Relevance: {relevance:.2f})")
        if len(irrelevant_results) > MAX_LOW_RELEVANCE_SHOWN:
            print(f" ... and {len(irrelevant_results) - MAX_LOW_RELEVANCE_SHOWN} more")

    # If no relevant results found
    if not relevant_results:
        _print_search_hints(query)


def interactive_query(vectorstore):
    """Interactive query interface.

    Reads questions from standard input until the user types ``quit``,
    ``exit`` or ``q``, presses Ctrl-C, or the input stream ends.

    Args:
        vectorstore: Vector store returned by ``create_rag_system``.
    """
    print("\n🎯 Interactive Query Interface")
    print("=" * 50)
    print("💡 Example questions:")
    print(" • 'Who is Muawiya?'")
    print(" • 'What are Muawiya's skills?'")
    print(" • 'What is Muawiya's education?'")
    print(" • 'How do I create a Django project?'")
    print(" • 'What are the Django commands?'")
    print(" • 'quit' to exit")
    print("=" * 50)

    while True:
        try:
            query = input("\n❓ Ask a question: ").strip()

            if query.lower() in ("quit", "exit", "q"):
                print("👋 Goodbye!")
                break

            if not query:
                print("⚠️ Please enter a question.")
                continue

            print(f"\n🔍 Searching for: '{query}'")
            results = search_documents(vectorstore, query, k=5)
            display_results(results, query)

        except (KeyboardInterrupt, EOFError):
            # EOFError must end the loop: once stdin is exhausted (Ctrl-D or
            # piped input) every further input() call raises it again, so
            # retrying would spin forever.
            print("\n\n👋 Goodbye!")
            break
        except Exception as e:
            # Keep the session alive on unexpected per-query failures.
            print(f"❌ Error: {e}")


def main():
    """Main function - everything in one place"""
    print("🚀 Simple All-in-One RAG System")
    print("=" * 60)

    # Create the RAG system
    vectorstore = create_rag_system()

    print("\n🎉 RAG system is ready!")
    print(f"📁 Database location: {DB_PATH}")

    # Start interactive interface
    interactive_query(vectorstore)


if __name__ == "__main__":
    main()