{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "dfe37963-1af6-44fc-a841-8e462443f5e6",
   "metadata": {},
   "source": [
    "## Expert Knowledge Worker\n",
    "\n",
    "### A question answering agent that is an expert knowledge worker\n",
    "### To be used by anyone on their LinkedIn data\n",
    "The easiest and fastest way to obtain a copy of your LinkedIn data is to initiate a data download from your Settings & Privacy page:\n",
    "\n",
    "1. Click the Me icon at the top of your LinkedIn homepage.\n",
    "2. Select Settings & Privacy from the dropdown.\n",
    "3. Click Data privacy on the left rail.\n",
    "4. Under the How LinkedIn uses your data section, click Get a copy of your data.\n",
    "5. Select the data that you’re looking for and Request archive.\n",
    "\n",
    "The notebook reads Markdown files (`*.md`) from the sub-folders of `linkedin-base/`; each sub-folder name is used as the document type of the files inside it.\n",
    "\n",
    "This project will use RAG (Retrieval Augmented Generation) to ensure our question/answering assistant has high accuracy."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ba2779af-84ef-4227-9e9e-6eaf0df87e77",
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports\n",
    "\n",
    "import os\n",
    "import glob\n",
    "from dotenv import load_dotenv\n",
    "import gradio as gr\n",
    "\n",
    "from langchain.document_loaders import DirectoryLoader, TextLoader\n",
    "from langchain.text_splitter import CharacterTextSplitter\n",
    "from langchain.schema import Document\n",
    "from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n",
    "from langchain_chroma import Chroma\n",
    "import plotly.graph_objects as go\n",
    "from langchain.memory import ConversationBufferMemory\n",
    "from langchain.chains import ConversationalRetrievalChain\n",
    "from langchain.embeddings import HuggingFaceEmbeddings\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.manifold import TSNE\n",
    "import numpy as np\n",
    "\n",
    "# Chat model used for answering, and the directory the Chroma store is persisted to\n",
    "MODEL = \"gpt-4o-mini\"\n",
    "db_name = \"linkedin_db\"\n",
    "\n",
    "# Values from .env override variables already set in the environment\n",
    "load_dotenv(override=True)\n",
    "# Falls back to a placeholder string when no key is configured; replace it or set OPENAI_API_KEY\n",
    "os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "730711a9-6ffe-4eee-8f48-d6cfb7314905",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read in documents using LangChain's loaders\n",
    "# Put the chunks of data into a Vector Store (Chroma) that associates a Vector Embedding with each chunk\n",
    "\n",
    "# Only sub-folders count as document types: a stray file next to them cannot be\n",
    "# loaded as a directory. Sorting keeps the load order the same from run to run.\n",
    "folders = sorted(path for path in glob.glob(\"linkedin-base/*\") if os.path.isdir(path))\n",
    "\n",
    "def add_metadata(doc, doc_type):\n",
    "    \"\"\"Set doc.metadata['doc_type'] to `doc_type` (mutates `doc`) and return the same object.\"\"\"\n",
    "    doc.metadata[\"doc_type\"] = doc_type\n",
    "    return doc\n",
    "\n",
    "text_loader_kwargs = {'encoding': 'utf-8'}\n",
    "\n",
    "documents = []\n",
    "for folder in folders:\n",
    "    # The folder name doubles as the document type\n",
    "    doc_type = os.path.basename(folder)\n",
    "    loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n",
    "    folder_docs = loader.load()\n",
    "    documents.extend([add_metadata(doc, doc_type) for doc in folder_docs])\n",
    "\n",
    "text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100)\n",
    "chunks = text_splitter.split_documents(documents)\n",
    "\n",
    "# Stop here with a clear message: with nothing to index, the sample-embedding\n",
    "# lookup further down has no first element to read.\n",
    "if not chunks:\n",
    "    raise ValueError(\"No Markdown documents (*.md) found in the sub-folders of linkedin-base/\")\n",
    "\n",
    "embeddings = OpenAIEmbeddings()\n",
    "\n",
    "# Start from an empty collection so re-running this cell does not add the chunks twice\n",
    "if os.path.exists(db_name):\n",
    "    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()\n",
    "\n",
    "vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)\n",
    "\n",
    "collection = vectorstore._collection\n",
    "count = collection.count()\n",
    "\n",
    "# One stored vector is enough to read off the embedding dimensionality\n",
    "sample_embedding = collection.get(limit=1, include=[\"embeddings\"])[\"embeddings\"][0]\n",
    "dimensions = len(sample_embedding)\n",
    "\n",
    "\n",
    "print(f\"Total number of chunks: {len(chunks)}\")\n",
    "print(f\"Document types found: {set(doc.metadata['doc_type'] for doc in documents)}\")\n",
    "print(f\"Vectorstore created with {count} documents\")\n",
    "print(f\"There are {count:,} vectors with {dimensions:,} dimensions in the vector store\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b98adf5e-d464-4bd2-9bdf-bc5b6770263b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2D scatter plot\n",
    "\n",
    "result = collection.get(include=['embeddings', 'documents', 'metadatas'])\n",
    "vectors = np.array(result['embeddings'])\n",
    "# Chunk texts; kept under their own name so the `documents` list from the loading cell is not rebound\n",
    "doc_texts = result['documents']\n",
    "metadatas = result['metadatas']\n",
    "doc_types = [metadata['doc_type'] for metadata in metadatas]\n",
    "\n",
    "# Fixed colours for the three expected folders; any other folder name gets a\n",
    "# fallback colour instead of making the lookup raise.\n",
    "known_colors = {'connections': 'blue', 'recommendations': 'green', 'profiles': 'red'}\n",
    "fallback_colors = ['orange', 'purple', 'brown', 'teal', 'magenta', 'gray']\n",
    "color_by_type = dict(known_colors)\n",
    "for position, other_type in enumerate(sorted(set(doc_types) - set(known_colors))):\n",
    "    color_by_type[other_type] = fallback_colors[position % len(fallback_colors)]\n",
    "colors = [color_by_type[t] for t in doc_types]\n",
    "\n",
    "def tsne_perplexity(n_samples):\n",
    "    \"\"\"Pick a t-SNE perplexity for `n_samples` points.\n",
    "\n",
    "    The value is (n_samples - 1) / 3 clamped to [5, 30], then capped at\n",
    "    n_samples - 1 because scikit-learn requires perplexity < n_samples.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if fewer than 3 samples are given.\n",
    "    \"\"\"\n",
    "    if n_samples < 3:\n",
    "        raise ValueError(f\"t-SNE needs at least 3 samples, got {n_samples}\")\n",
    "    clamped = max(5.0, min(30.0, (n_samples - 1) / 3.0))\n",
    "    return min(clamped, float(n_samples - 1))\n",
    "\n",
    "n = vectors.shape[0]\n",
    "perp = tsne_perplexity(n)\n",
    "\n",
    "tsne = TSNE(n_components=2, random_state=42, perplexity=perp)\n",
    "reduced_vectors = tsne.fit_transform(vectors)\n",
    "\n",
    "fig = go.Figure(data=[go.Scatter(\n",
    "    x=reduced_vectors[:, 0],\n",
    "    y=reduced_vectors[:, 1],\n",
    "    mode='markers',\n",
    "    marker=dict(size=5, color=colors, opacity=0.8),\n",
    "    text=[f\"Type: {t}<br>Text: {d[:100]}...\" for t, d in zip(doc_types, doc_texts)],\n",
    "    hoverinfo='text'\n",
    ")])\n",
    "\n",
    "# A 2D figure takes its axis titles from the layout's x/y axes; `scene` is for 3D plots only\n",
    "fig.update_layout(\n",
    "    title='2D Chroma Vector Store Visualization',\n",
    "    xaxis_title='x',\n",
    "    yaxis_title='y',\n",
    "    width=800,\n",
    "    height=600,\n",
    "    margin=dict(r=20, b=10, l=10, t=40)\n",
    ")\n",
    "\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1418e88-acd5-460a-bf2b-4e6efc88e3dd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3D scatter plot!\n",
    "\n",
    "# Same vectors, colours and hover texts as the 2D plot, projected to three components\n",
    "n = vectors.shape[0]\n",
    "perp = tsne_perplexity(n)\n",
    "\n",
    "tsne = TSNE(n_components=3, random_state=42, perplexity=perp)\n",
    "reduced_vectors = tsne.fit_transform(vectors)\n",
    "\n",
    "fig = go.Figure(data=[go.Scatter3d(\n",
    "    x=reduced_vectors[:, 0],\n",
    "    y=reduced_vectors[:, 1],\n",
    "    z=reduced_vectors[:, 2],\n",
    "    mode='markers',\n",
    "    marker=dict(size=5, color=colors, opacity=0.8),\n",
    "    text=[f\"Type: {t}<br>Text: {d[:100]}...\" for t, d in zip(doc_types, doc_texts)],\n",
    "    hoverinfo='text'\n",
    ")])\n",
    "\n",
    "fig.update_layout(\n",
    "    title='3D Chroma Vector Store Visualization',\n",
    "    scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),\n",
    "    width=900,\n",
    "    height=700,\n",
    "    margin=dict(r=20, b=10, l=10, t=40)\n",
    ")\n",
    "\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2136153b-d2f6-4c58-a0e3-78c3a932cf55",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The main LangChain abstractions are: Memory, LLM, and Retriever\n",
    "llm = ChatOpenAI(temperature=0.7, model_name=MODEL)\n",
    "\n",
    "memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)\n",
    "# Ask the vector store for 25 chunks per question\n",
    "retriever = vectorstore.as_retriever(search_kwargs={\"k\": 25})\n",
    "conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)\n",
    "\n",
    "def chat(question, history):\n",
    "    \"\"\"Run `question` through the conversation chain and return its answer text.\n",
    "\n",
    "    `history` is passed in by gr.ChatInterface but is not read here; the chain\n",
    "    was built with its own `memory` object.\n",
    "    \"\"\"\n",
    "    result = conversation_chain.invoke({\"question\": question})\n",
    "    return result[\"answer\"]\n",
    "\n",
    "with gr.Blocks(theme=\"gradio/monochrome\") as ui:\n",
    "    # NOTE(review): the heading tags were missing from the saved notebook; the markup\n",
    "    # below is a minimal reconstruction around the original text - confirm the styling.\n",
    "    gr.Markdown(\n",
    "        \"<div><h1>Linkedin Knowledge Worker</h1></div>\\n\"\n",
    "        \"<div><p>Chat with your auto-generated Linkedin knowledge base</p></div>\\n\",\n",
    "        elem_id=\"title\"\n",
    "    )\n",
    "    gr.ChatInterface(chat, type=\"messages\")\n",
    "\n",
    "ui.launch(inbrowser=True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}