week5/community-contributions/w5d5_worker.py (new file)
@@ -0,0 +1,445 @@
#!/usr/bin/env python3
"""
Knowledge Worker with Document Upload

This script creates a knowledge worker that:
1. Allows users to upload documents through a Gradio UI
2. Uses a Chroma vector database for efficient document retrieval
3. Implements RAG (Retrieval Augmented Generation) for accurate responses

The system updates its context dynamically when new documents are uploaded.
Google Drive integration has been removed from this version.
"""

import os
import glob
import tempfile
from pathlib import Path
from dotenv import load_dotenv
import gradio as gr

# LangChain imports
from langchain_community.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma

# Visualization imports
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go

# Removed Google Drive API imports

# Additional document loaders
try:
    from langchain_community.document_loaders import Docx2txtLoader, UnstructuredExcelLoader
except ImportError:
    print("Warning: some document loaders are not available. PDF and text files will still work.")
    Docx2txtLoader = None
    UnstructuredExcelLoader = None

# Configuration
MODEL = "gpt-4o-mini"  # Using a cost-effective model
DB_NAME = "knowledge_worker_db"
UPLOAD_FOLDER = "uploaded_documents"

# Create the upload folder if it doesn't exist
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# Load environment variables
load_dotenv(override=True)
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')

# Removed Google Drive credentials configuration

# Use a simple text splitter approach
class SimpleTextSplitter:
    def __init__(self, chunk_size=1000, chunk_overlap=200):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def split_documents(self, documents):
        chunks = []
        for doc in documents:
            text = doc.page_content
            start = 0
            while start < len(text):
                end = start + self.chunk_size
                chunk_text = text[start:end]
                chunk_doc = Document(page_content=chunk_text, metadata=doc.metadata.copy())
                chunks.append(chunk_doc)
                start = end - self.chunk_overlap
        return chunks

CharacterTextSplitter = SimpleTextSplitter
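
# Illustrative example (not executed): with the defaults chunk_size=1000 and
# chunk_overlap=200, the window advances 800 characters per chunk, so a
# 2,600-character document yields four chunks:
#
#   splitter = SimpleTextSplitter()
#   chunks = splitter.split_documents([Document(page_content="x" * 2600)])
#   # chunk spans: [0:1000], [800:1800], [1600:2600], [2400:2600]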

# Try different import paths for memory and chains
try:
    from langchain.memory import ConversationBufferMemory
    from langchain.chains import ConversationalRetrievalChain
except ImportError:
    try:
        from langchain_core.memory import ConversationBufferMemory
        from langchain_core.chains import ConversationalRetrievalChain
    except ImportError:
        try:
            from langchain_community.memory import ConversationBufferMemory
            from langchain_community.chains import ConversationalRetrievalChain
        except ImportError:
            print("Warning: Memory and chains modules not found. Creating simple alternatives.")
            # Create simple alternatives
            class ConversationBufferMemory:
                def __init__(self, memory_key='chat_history', return_messages=True):
                    self.memory_key = memory_key
                    self.return_messages = return_messages
                    self.chat_memory = []

                def save_context(self, inputs, outputs):
                    self.chat_memory.append((inputs, outputs))

                def load_memory_variables(self, inputs):
                    return {self.memory_key: self.chat_memory}

            class ConversationalRetrievalChain:
                def __init__(self, llm, retriever, memory):
                    self.llm = llm
                    self.retriever = retriever
                    self.memory = memory

                def invoke(self, inputs):
                    question = inputs.get("question", "")
                    # Simple implementation - just return a basic response
                    return {"answer": f"I received your question: {question}. This is a simplified response."}

# Removed Google Drive Integration Functions

# Document Processing Functions
def get_loader_for_file(file_path):
    """
    Get the appropriate document loader based on file extension
    """
    file_extension = os.path.splitext(file_path)[1].lower()

    if file_extension == '.pdf':
        return PyPDFLoader(file_path)
    elif file_extension in ['.docx', '.doc'] and Docx2txtLoader:
        return Docx2txtLoader(file_path)
    elif file_extension in ['.xlsx', '.xls'] and UnstructuredExcelLoader:
        return UnstructuredExcelLoader(file_path)
    elif file_extension in ['.txt', '.md']:
        return TextLoader(file_path, encoding='utf-8')
    else:
        # Default to the text loader for unknown types
        try:
            return TextLoader(file_path, encoding='utf-8')
        except Exception:
            return None

def load_document(file_path):
    """
    Load a document using the appropriate loader
    """
    loader = get_loader_for_file(file_path)
    if loader:
        try:
            return loader.load()
        except Exception as e:
            print(f"Error loading document {file_path}: {e}")
    return []

def process_documents(documents):
    """
    Split documents into chunks for embedding
    """
    text_splitter = CharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200
    )
    chunks = text_splitter.split_documents(documents)
    return chunks

# Knowledge Base Class
class KnowledgeBase:
    def __init__(self, db_name=DB_NAME):
        self.db_name = db_name
        self.embeddings = OpenAIEmbeddings()
        self.vectorstore = None
        self.initialize_vectorstore()

    def initialize_vectorstore(self):
        """
        Initialize the vector store, loading from disk if it exists
        """
        if os.path.exists(self.db_name):
            self.vectorstore = Chroma(persist_directory=self.db_name, embedding_function=self.embeddings)
            print(f"Loaded existing vector store with {self.vectorstore._collection.count()} documents")
        else:
            # Create an empty vector store
            self.vectorstore = Chroma(persist_directory=self.db_name, embedding_function=self.embeddings)
            print("Created new vector store")

    def add_documents(self, documents):
        """
        Process and add documents to the vector store
        """
        if not documents:
            return False

        chunks = process_documents(documents)
        if not chunks:
            return False

        # Add to the existing vector store
        self.vectorstore.add_documents(chunks)
        print(f"Added {len(chunks)} chunks to vector store")
        return True

    def get_retriever(self, k=4):
        """
        Get a retriever for the vector store
        """
        return self.vectorstore.as_retriever(search_kwargs={"k": k})
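
    # Illustrative usage (assuming documents have been added):
    #   retriever = kb.get_retriever(k=4)
    #   top_chunks = retriever.invoke("some question")  # the 4 most similar chunks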

    def visualize_vectors(self):
        """
        Create a 3D visualization of the vector store
        """
        try:
            collection = self.vectorstore._collection
            result = collection.get(include=['embeddings', 'documents', 'metadatas'])

            if result['embeddings'] is None or len(result['embeddings']) == 0:
                print("No embeddings found in vector store")
                return None

            vectors = np.array(result['embeddings'])
            documents = result['documents']
            metadatas = result['metadatas']

            if len(vectors) < 2:
                print("Not enough vectors for visualization (need at least 2)")
                return None

            # Get source info for coloring
            sources = [metadata.get('source', 'unknown') for metadata in metadatas]
            unique_sources = list(set(sources))
            colors = [['blue', 'green', 'red', 'orange', 'purple', 'cyan'][unique_sources.index(s) % 6] for s in sources]

            # Reduce dimensions for visualization,
            # adjusting perplexity to the number of samples
            n_samples = len(vectors)
            perplexity = min(30, max(1, n_samples - 1))
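            # scikit-learn requires perplexity < n_samples, hence the cap at n_samples - 1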

            tsne = TSNE(n_components=3, random_state=42, perplexity=perplexity)
            reduced_vectors = tsne.fit_transform(vectors)

            # Create the 3D scatter plot
            fig = go.Figure(data=[go.Scatter3d(
                x=reduced_vectors[:, 0],
                y=reduced_vectors[:, 1],
                z=reduced_vectors[:, 2],
                mode='markers',
                marker=dict(size=5, color=colors, opacity=0.8),
                text=[f"Source: {s}<br>Text: {d[:100]}..." for s, d in zip(sources, documents)],
                hoverinfo='text'
            )])

            fig.update_layout(
                title='3D Vector Store Visualization',
                scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),
                width=900,
                height=700,
                margin=dict(r=20, b=10, l=10, t=40)
            )

            return fig

        except Exception as e:
            print(f"Error creating visualization: {e}")
            return None

# Simple fallback chain implementation
class SimpleConversationalChain:
    def __init__(self, llm, retriever, memory):
        self.llm = llm
        self.retriever = retriever
        self.memory = memory

    def invoke(self, inputs):
        question = inputs.get("question", "")
        # Get relevant documents - try different retriever APIs
        try:
            docs = self.retriever.get_relevant_documents(question)
        except AttributeError:
            try:
                docs = self.retriever.invoke(question)
            except Exception:
                docs = []

        context = "\n".join([doc.page_content for doc in docs[:3]]) if docs else "No relevant context found."

        # Create a simple prompt
        prompt = f"""Based on the following context, answer the question:

Context: {context}

Question: {question}

Answer:"""

        # Get the response from the LLM
        response = self.llm.invoke(prompt)
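        # ChatOpenAI returns an AIMessage whose text is in .content; the str()
        # fallback covers LLM objects that return a plain string instead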
        return {"answer": response.content if hasattr(response, 'content') else str(response)}

# Chat System Class
class ChatSystem:
    def __init__(self, knowledge_base, model_name=MODEL):
        self.knowledge_base = knowledge_base
        self.model_name = model_name
        self.llm = ChatOpenAI(temperature=0.7, model_name=self.model_name)
        self.memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
        self.conversation_chain = self._create_conversation_chain()

    def _create_conversation_chain(self):
        """
        Create a new conversation chain with the current retriever
        """
        retriever = self.knowledge_base.get_retriever()
        # Skip the problematic ConversationalRetrievalChain and use the simple implementation
        print("Using simple conversational chain implementation")
        return SimpleConversationalChain(self.llm, retriever, self.memory)

    def reset_conversation(self):
        """
        Reset the conversation memory and chain
        """
        self.memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
        self.conversation_chain = self._create_conversation_chain()
        return "Conversation has been reset."

    def chat(self, question, history):
        """
        Process a question and return the answer
        """
        if not question.strip():
            return "Please ask a question."

        result = self.conversation_chain.invoke({"question": question})
        return result["answer"]

    def update_knowledge_base(self):
        """
        Update the conversation chain with the latest knowledge base
        """
        self.conversation_chain = self._create_conversation_chain()

# UI Functions
def handle_file_upload(files):
    """
    Process uploaded files and add them to the knowledge base
    """
    if not files:
        return "No files uploaded."

    documents = []
    for file in files:
        try:
            docs = load_document(file.name)
            if docs:
                # Add upload source metadata
                for doc in docs:
                    doc.metadata['source'] = 'upload'
                    doc.metadata['filename'] = os.path.basename(file.name)
                documents.extend(docs)
        except Exception as e:
            print(f"Error processing file {file.name}: {e}")

    if documents:
        success = kb.add_documents(documents)
        if success:
            # Update the chat system with the new knowledge
            chat_system.update_knowledge_base()
            return f"Successfully processed {len(documents)} documents."

    return "No documents could be processed. Please check the file formats."

def create_ui():
    """
    Create the Gradio UI
    """
    with gr.Blocks(theme=gr.themes.Soft()) as app:
        gr.Markdown("""
        # Knowledge Worker
        Upload documents or ask questions about your knowledge base.
        """)

        with gr.Tabs():
            with gr.TabItem("Chat"):
                chatbot = gr.ChatInterface(
                    chat_system.chat,
                    chatbot=gr.Chatbot(height=500, type="messages"),
                    textbox=gr.Textbox(placeholder="Ask a question about your documents...", container=False),
                    title="Knowledge Worker Chat",
                    type="messages"
                )
                reset_btn = gr.Button("Reset Conversation")
                reset_status = gr.Textbox(label="Status", interactive=False)
                reset_btn.click(chat_system.reset_conversation, inputs=None, outputs=reset_status)

            with gr.TabItem("Upload Documents"):
                with gr.Column():
                    file_output = gr.Textbox(label="Upload Status")
                    upload_button = gr.UploadButton(
                        "Click to Upload Files",
                        file_types=[".pdf", ".docx", ".txt", ".md", ".xlsx"],
                        file_count="multiple"
                    )
                    upload_button.upload(handle_file_upload, upload_button, file_output)

            with gr.TabItem("Visualize Knowledge"):
                visualize_btn = gr.Button("Generate Vector Visualization")
                plot_output = gr.Plot(label="Vector Space Visualization")
                visualize_btn.click(kb.visualize_vectors, inputs=None, outputs=plot_output)

    return app

def main():
    """
    Main function to initialize and run the knowledge worker
    """
    global kb, chat_system

    print("=" * 60)
    print("Initializing Knowledge Worker...")
    print("=" * 60)

    try:
        # Initialize the knowledge base
        print("Setting up vector database...")
        kb = KnowledgeBase(DB_NAME)
        print("Vector database initialized successfully")

        # Google Drive integration removed

        # Initialize the chat system
        print("\nSetting up chat system...")
        chat_system = ChatSystem(kb)
        print("Chat system initialized successfully")

        # Launch the Gradio app
        print("\nLaunching Gradio interface...")
        print("=" * 60)
        print("The web interface will open in your browser")
        print("You can also access it at the URL shown below")
        print("=" * 60)

        app = create_ui()
        app.launch(inbrowser=True)

    except Exception as e:
        print(f"Error initializing Knowledge Worker: {e}")
        print("Please check your configuration and try again.")
        return

if __name__ == "__main__":
    main()