#!/usr/bin/env python3
"""
Simple All-in-One RAG System for Personal Data
Handles text, PDF, and .docx files, creates a sample CV, and provides an
interactive query interface.
"""
import os
import sys
from pathlib import Path

# Install required packages if not already installed
try:
    from langchain_community.vectorstores import Chroma
    from langchain.docstore.document import Document
    from langchain_huggingface import HuggingFaceEmbeddings
    from langchain_community.document_loaders import PyPDFLoader
    from langchain.text_splitter import CharacterTextSplitter
except ImportError:
    print("Installing required packages...")
    # Install everything the try-block above imports, using the current
    # interpreter so the packages land in the right environment
    os.system(f"{sys.executable} -m pip install langchain langchain-community langchain-huggingface chromadb pypdf")
    from langchain_community.vectorstores import Chroma
    from langchain.docstore.document import Document
    from langchain_huggingface import HuggingFaceEmbeddings
    from langchain_community.document_loaders import PyPDFLoader
    from langchain.text_splitter import CharacterTextSplitter


def create_sample_cv():
    """Create a sample CV text file"""
    sample_cv = """
CURRICULUM VITAE - MUAWIYA

PERSONAL INFORMATION
Name: Muawiya
Email: muawiya@example.com
Phone: +1234567890
Location: [Your Location]

PROFESSIONAL SUMMARY
Enthusiastic developer and student with a passion for technology and programming.
Currently learning the Django framework and web development. Active participant in
the LLM engineering community and working on personal projects.

EDUCATION
- Currently pursuing studies in Computer Science/Programming
- Learning the Django web framework
- Studying web development and programming concepts

TECHNICAL SKILLS
- Python Programming
- Django Web Framework
- Virtual Environment Management
- Git and GitHub
- Database Management with Django
- Basic Web Development

CURRENT PROJECTS
- Learning Django through practical exercises
- Building web applications
- Working on LLM engineering projects
- Contributing to community projects
- Personal data management and RAG systems

LEARNING GOALS
- Master the Django framework
- Build full-stack web applications
- Learn machine learning and AI
- Contribute to open source projects
- Develop expertise in modern web technologies

INTERESTS
- Web Development
- Artificial Intelligence
- Machine Learning
- Open Source Software
- Technology and Programming

LANGUAGES
- English
- [Add other languages if applicable]

CERTIFICATIONS
- [Add any relevant certifications]

REFERENCES
Available upon request
"""
    # Create the Personal directory if it doesn't exist
    personal_dir = Path("Personal")
    personal_dir.mkdir(exist_ok=True)

    # Create the sample CV file
    cv_file = personal_dir / "CV_Muawiya.txt"
    with open(cv_file, 'w', encoding='utf-8') as f:
        f.write(sample_cv.strip())
    print(f"✅ Created sample CV: {cv_file}")
    return cv_file


def load_documents():
    """Load all documents from the Personal directory"""
    documents = []
    input_path = Path("Personal")

    # Supported file extensions
    text_extensions = {'.txt', '.md', '.log', '.csv', '.json'}
    pdf_extensions = {'.pdf'}

    print(f"🔍 Scanning directory: {input_path}")
    for file_path in input_path.rglob("*"):
        if file_path.is_file():
            file_ext = file_path.suffix.lower()
            try:
                if file_ext in text_extensions:
                    # Handle text files
                    with open(file_path, "r", encoding="utf-8", errors='ignore') as f:
                        content = f.read().strip()
                    if content and len(content) > 10:
                        documents.append(Document(
                            page_content=content,
                            metadata={"source": str(file_path.relative_to(input_path)), "type": "text"}
                        ))
                        print(f" ✅ Loaded: {file_path.name} ({len(content)} chars)")
                elif file_ext in pdf_extensions:
                    # Handle PDF files page by page
                    try:
                        loader = PyPDFLoader(str(file_path))
                        pdf_docs = loader.load()
                        valid_docs = 0
                        for doc in pdf_docs:
                            if doc.page_content.strip() and len(doc.page_content.strip()) > 10:
                                doc.metadata["source"] = str(file_path.relative_to(input_path))
                                doc.metadata["type"] = "pdf"
                                documents.append(doc)
                                valid_docs += 1
                        if valid_docs > 0:
                            print(f" ✅ Loaded PDF: {file_path.name} ({valid_docs} pages with content)")
                    except Exception as e:
                        print(f" ⚠️ Skipped PDF: {file_path.name} (error: {e})")
            except Exception as e:
                print(f" ❌ Error processing {file_path.name}: {e}")
    return documents


def create_rag_system():
    """Create the RAG system with all documents"""
    print("🚀 Creating RAG System")
    print("=" * 50)

    # Step 1: Create a sample CV if it doesn't exist
    cv_file = Path("Personal/CV_Muawiya.txt")
    if not cv_file.exists():
        print("📝 Creating sample CV...")
        create_sample_cv()

    # Step 2: Load all documents
    documents = load_documents()
    print(f"\n📊 Loaded {len(documents)} documents")
    if len(documents) == 0:
        print("❌ No documents found! Creating sample document...")
        sample_content = "This is a sample document for testing the RAG system."
        documents.append(Document(
            page_content=sample_content,
            metadata={"source": "sample.txt", "type": "sample"}
        ))

    # Step 3: Load the embedding model
    print("\n🤖 Loading embedding model...")
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # Step 4: Split documents into chunks.
    # Note: CharacterTextSplitter splits on "\n\n" by default, so a chunk can
    # exceed chunk_size when a single paragraph is longer than 500 characters.
    print("✂️ Splitting documents into chunks...")
    text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = text_splitter.split_documents(documents)
    print(f"📝 Created {len(chunks)} chunks")

    # Step 5: Create the vectorstore
    print("🗄️ Creating vector database...")
    db_path = "chroma_failures_ds"
    vectorstore = Chroma.from_documents(documents=chunks, embedding=embedding_model, persist_directory=db_path)
    # _collection is a private Chroma attribute; fine for a quick count here
    print(f"✅ Vectorstore created with {vectorstore._collection.count()} documents")
    return vectorstore


def search_documents(vectorstore, query, k=5):
    """Search documents with similarity scores; fetch extra results so they can be filtered for relevance"""
    try:
        results = vectorstore.similarity_search_with_score(query, k=k)
        return results
    except Exception as e:
        print(f"❌ Error searching: {e}")
        return []
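
# Note (an aside, depending on the installed langchain version): vectorstores
# also expose similarity_search_with_relevance_scores(query, k=k), which aims
# to return scores already normalized to [0, 1]; the manual distance-to-
# relevance mapping in display_results below is a cruder stand-in for that.
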
def display_results(results, query):
    """Display search results with relevance filtering"""
    print(f"\n🔍 Results for: '{query}'")
    print("=" * 60)
    if not results:
        print("❌ No results found.")
        return

    # Split results into relevant and irrelevant buckets
    relevant_results = []
    irrelevant_results = []
    for doc, score in results:
        # Chroma returns a distance (lower = more similar). With Chroma's
        # default L2 metric on the unit-normalized MiniLM embeddings, the
        # distance for related text falls roughly in [0, 2], so map it to a
        # rough 0-1 relevance score (1 = most relevant).
        relevance = 1 - (score / 2)
        if relevance > 0.3:  # Keep results with >30% relevance
            relevant_results.append((doc, score, relevance))
        else:
            irrelevant_results.append((doc, score, relevance))

    # Show relevant results
    if relevant_results:
        print(f"\n✅ Relevant Results ({len(relevant_results)} found):")
        print("-" * 50)

        # Group results by source to avoid duplicates
        seen_sources = set()
        unique_results = []
        for doc, score, relevance in relevant_results:
            source = doc.metadata.get('source', 'Unknown')
            if source not in seen_sources:
                seen_sources.add(source)
                unique_results.append((doc, score, relevance))

        for i, (doc, score, relevance) in enumerate(unique_results, 1):
            print(f"\n📄 Result {i} (Relevance: {relevance:.2f})")
            print(f"📁 Source: {doc.metadata.get('source', 'Unknown')}")
            print(f"📝 Type: {doc.metadata.get('type', 'Unknown')}")
            print("-" * 40)

            # Display a truncated preview, enough lines for useful context
            content = doc.page_content.strip()
            if len(content) > 500:
                content = content[:500] + "..."
            lines = content.split('\n')
            for line in lines[:12]:
                if line.strip():
                    print(f" {line.strip()}")
            if len(lines) > 12:
                print(f" ... ({len(lines) - 12} more lines)")

        # Note how many duplicate results were combined
        if len(relevant_results) > len(unique_results):
            print(f"\n💡 Note: {len(relevant_results) - len(unique_results)} duplicate results from same sources were combined.")

    # Summarize the filtered-out, low-relevance results
    if irrelevant_results:
        print(f"\n⚠️ Low Relevance Results ({len(irrelevant_results)} filtered out):")
        print("-" * 50)
        print("These results had low similarity to your query and were filtered out.")
        for i, (doc, score, relevance) in enumerate(irrelevant_results[:2], 1):  # Show the first 2
            source = doc.metadata.get('source', 'Unknown')
            print(f" {i}. {source} (Relevance: {relevance:.2f})")
        if len(irrelevant_results) > 2:
            print(f" ... and {len(irrelevant_results) - 2} more")

    # Fall back to hints when nothing relevant was found
    if not relevant_results:
        print(f"\n❌ No relevant results found for '{query}'")
        print("💡 Your documents contain:")
        print(" • Personal CV information")
        print(" • Django commands and setup instructions")
        print(" • GitHub recovery codes")
        print(" • Various PDF documents")
        print("\n🔍 Try asking about:")
        print(" • Muawiya's personal information")
        print(" • Muawiya's skills and experience")
        print(" • Django project creation")
        print(" • Django commands")
        print(" • Virtual environment setup")


def interactive_query(vectorstore):
    """Interactive query interface"""
    print("\n🎯 Interactive Query Interface")
    print("=" * 50)
    print("💡 Example questions:")
    print("'Who is Muawiya?'")
    print("'What are Muawiya's skills?'")
    print("'What is Muawiya's education?'")
    print("'How do I create a Django project?'")
    print("'What are the Django commands?'")
    print("'quit' to exit")
    print("=" * 50)
    while True:
        try:
            query = input("\n❓ Ask a question: ").strip()
            if query.lower() in ['quit', 'exit', 'q']:
                print("👋 Goodbye!")
                break
            if not query:
                print("⚠️ Please enter a question.")
                continue
            print(f"\n🔍 Searching for: '{query}'")
            results = search_documents(vectorstore, query, k=5)
            display_results(results, query)
        except KeyboardInterrupt:
            print("\n\n👋 Goodbye!")
            break
        except Exception as e:
            print(f"❌ Error: {e}")


def main():
    """Main function - everything in one place"""
    print("🚀 Simple All-in-One RAG System")
    print("=" * 60)

    # Create the RAG system
    vectorstore = create_rag_system()
    print("\n🎉 RAG system is ready!")
    print("📁 Database location: chroma_failures_ds")

    # Start the interactive interface
    interactive_query(vectorstore)


if __name__ == "__main__":
    main()
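
# Example (a sketch, not part of the script): the Chroma database persists to
# disk, so another process can reopen it without re-indexing, assuming the
# same embedding model and the "chroma_failures_ds" directory created above:
#
#   from langchain_community.vectorstores import Chroma
#   from langchain_huggingface import HuggingFaceEmbeddings
#
#   emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
#   store = Chroma(persist_directory="chroma_failures_ds", embedding_function=emb)
#   print(store.similarity_search("Who is Muawiya?", k=3))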