From 1f3618d4446a1cabcf66fb3df8f2180c9f495680 Mon Sep 17 00:00:00 2001 From: muthama Date: Tue, 28 Oct 2025 15:53:51 +0300 Subject: [PATCH] Add week 5 exercise --- .../week5_exercise_solution-Stephen.ipynb | 243 ++++++++++++++++++ 1 file changed, 243 insertions(+) create mode 100644 week5/community-contributions/week5_exercise_solution-Stephen.ipynb diff --git a/week5/community-contributions/week5_exercise_solution-Stephen.ipynb b/week5/community-contributions/week5_exercise_solution-Stephen.ipynb new file mode 100644 index 0000000..8ee935e --- /dev/null +++ b/week5/community-contributions/week5_exercise_solution-Stephen.ipynb @@ -0,0 +1,243 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "dfe37963-1af6-44fc-a841-8e462443f5e6", + "metadata": {}, + "source": [ + "## Expert Knowledge Worker\n", + "\n", + "### A question answering agent that is an expert knowledge worker\n", + "### To be used by anyone on their own LinkedIn data\n", + "The easiest and fastest way to obtain a copy of your LinkedIn data is to initiate a data download from your Settings & Privacy page:\n", + "\n", + "1. Click the Me icon at the top of your LinkedIn homepage.\n", + "2. Select Settings & Privacy from the dropdown.\n", + "3. Click Data Privacy in the left rail.\n", + "4. Under the How LinkedIn uses your data section, click Get a copy of your data.\n", + "5. Select the data that you’re looking for and click Request archive.\n", + "\n", + "This project will use RAG (Retrieval Augmented Generation) to ensure our question-answering assistant has high accuracy." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "ba2779af-84ef-4227-9e9e-6eaf0df87e77", + "metadata": {}, + "outputs": [], + "source": [ + "# imports\n", + "\n", + "import os\n", + "import glob\n", + "from dotenv import load_dotenv\n", + "import gradio as gr\n", + "\n", + "from langchain.document_loaders import DirectoryLoader, TextLoader\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain.schema import Document\n", + "from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n", + "from langchain_chroma import Chroma\n", + "import plotly.graph_objects as go\n", + "from langchain.memory import ConversationBufferMemory\n", + "from langchain.chains import ConversationalRetrievalChain\n", + "from langchain.embeddings import HuggingFaceEmbeddings\n", + "\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.manifold import TSNE\n", + "import numpy as np\n", + "\n", + "MODEL = \"gpt-4o-mini\"\n", + "db_name = \"linkedin_db\"\n", + "\n", + "load_dotenv(override=True)\n", + "os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "730711a9-6ffe-4eee-8f48-d6cfb7314905", + "metadata": {}, + "outputs": [], + "source": [ + "# Read in documents using LangChain's loaders\n", + "# Put the chunks of data into a Vector Store (Chroma) that associates a Vector Embedding with each chunk\n", + "\n", + "folders = glob.glob(\"linkedin-base/*\")\n", + "\n", + "def add_metadata(doc, doc_type):\n", + " doc.metadata[\"doc_type\"] = doc_type\n", + " return doc\n", + "\n", + "text_loader_kwargs = {'encoding': 'utf-8'}\n", + "\n", + "documents = []\n", + "for folder in folders:\n", + " doc_type = os.path.basename(folder)\n", + " loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n", + " folder_docs = loader.load()\n", + " documents.extend([add_metadata(doc, doc_type) for doc 
in folder_docs])\n", + "\n", + "text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100)\n", + "chunks = text_splitter.split_documents(documents)\n", + "\n", + "embeddings = OpenAIEmbeddings()\n", + "\n", + "if os.path.exists(db_name):\n", + " Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()\n", + "\n", + "vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)\n", + "\n", + "collection = vectorstore._collection\n", + "count = collection.count()\n", + "\n", + "sample_embedding = collection.get(limit=1, include=[\"embeddings\"])[\"embeddings\"][0]\n", + "dimensions = len(sample_embedding)\n", + "\n", + "\n", + "print(f\"Total number of chunks: {len(chunks)}\")\n", + "print(f\"Document types found: {set(doc.metadata['doc_type'] for doc in documents)}\")\n", + "print(f\"Vectorstore created with {vectorstore._collection.count()} documents\")\n", + "print(f\"There are {count:,} vectors with {dimensions:,} dimensions in the vector store\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b98adf5e-d464-4bd2-9bdf-bc5b6770263b", + "metadata": {}, + "outputs": [], + "source": [ + "# 2D scatter plot\n", + "\n", + "result = collection.get(include=['embeddings', 'documents', 'metadatas'])\n", + "vectors = np.array(result['embeddings'])\n", + "documents = result['documents']\n", + "metadatas = result['metadatas']\n", + "doc_types = [metadata['doc_type'] for metadata in metadatas]\n", + "colors = [['blue', 'green', 'red'][['connections', 'recommendations', 'profiles'].index(t)] for t in doc_types]\n", + "\n", + "n = vectors.shape[0]\n", + "if n < 3:\n", + " raise ValueError(f\"t-SNE needs at least 3 samples, got {n}\")\n", + "\n", + "perp = max(5.0, min(30.0, (n - 1) / 3.0)) # clamped to [5, 30]; NOTE(review): the 5.0 floor means this is NOT < n when n <= 5 - TSNE expects perplexity < n_samples, confirm and lower the floor\n", + "\n", + "tsne = TSNE(n_components=2, random_state=42, perplexity=perp)\n", + "reduced_vectors = tsne.fit_transform(vectors)\n", + "\n", + "fig = 
go.Figure(data=[go.Scatter(\n", + " x=reduced_vectors[:, 0],\n", + " y=reduced_vectors[:, 1],\n", + " mode='markers',\n", + " marker=dict(size=5, color=colors, opacity=0.8),\n", + " text=[f\"Type: {t}
Text: {d[:100]}...\" for t, d in zip(doc_types, documents)],\n", + " hoverinfo='text'\n", + ")])\n", + "\n", + "fig.update_layout(\n", + " title='2D Chroma Vector Store Visualization',\n", + " scene=dict(xaxis_title='x',yaxis_title='y'),\n", + " width=800,\n", + " height=600,\n", + " margin=dict(r=20, b=10, l=10, t=40)\n", + ")\n", + "\n", + "fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1418e88-acd5-460a-bf2b-4e6efc88e3dd", + "metadata": {}, + "outputs": [], + "source": [ + "# 3D scatter plot!\n", + "\n", + "n = vectors.shape[0]\n", + "if n < 3:\n", + " raise ValueError(f\"t-SNE needs at least 3 samples, got {n}\")\n", + "\n", + "perp = max(5.0, min(30.0, (n - 1) / 3.0))\n", + "\n", + "tsne = TSNE(n_components=3, random_state=42, perplexity=perp)\n", + "reduced_vectors = tsne.fit_transform(vectors)\n", + "\n", + "fig = go.Figure(data=[go.Scatter3d(\n", + " x=reduced_vectors[:, 0],\n", + " y=reduced_vectors[:, 1],\n", + " z=reduced_vectors[:, 2],\n", + " mode='markers',\n", + " marker=dict(size=5, color=colors, opacity=0.8),\n", + " text=[f\"Type: {t}
Text: {d[:100]}...\" for t, d in zip(doc_types, documents)],\n", + " hoverinfo='text'\n", + ")])\n", + "\n", + "fig.update_layout(\n", + " title='3D Chroma Vector Store Visualization',\n", + " scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),\n", + " width=900,\n", + " height=700,\n", + " margin=dict(r=20, b=10, l=10, t=40)\n", + ")\n", + "\n", + "fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2136153b-d2f6-4c58-a0e3-78c3a932cf55", + "metadata": {}, + "outputs": [], + "source": [ + "# The main LangChain abstractions are: Memory, LLM, and Retriever\n", + "llm = ChatOpenAI(temperature=0.7, model_name=MODEL)\n", + "\n", + "memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)\n", + "retriever = vectorstore.as_retriever(search_kwargs={\"k\": 25})\n", + "conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)\n", + "\n", + "def chat(question, history):\n", + " result = conversation_chain.invoke({\"question\": question})\n", + " return result[\"answer\"]\n", + "\n", + "with gr.Blocks(theme=\"gradio/monochrome\") as ui:\n", + " gr.Markdown(\n", + " \"\"\"\n", + "

Linkedin Knowledge Worker

\n", + "

Chat with your auto-generated Linkedin knowledge base

\n", + " \"\"\",\n", + " elem_id=\"title\"\n", + " )\n", + " gr.ChatInterface(chat, type=\"messages\")\n", + "\n", + "ui.launch(inbrowser=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}