Files
LLM_Engineering_OLD/week5/community-contributions/week5_exercise_solution-Stephen.ipynb
2025-10-28 15:53:51 +03:00

244 lines
8.5 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "markdown",
"id": "dfe37963-1af6-44fc-a841-8e462443f5e6",
"metadata": {},
"source": [
"## Expert Knowledge Worker\n",
"\n",
"### A question answering agent that is an expert knowledge worker\n",
"### To be used by Anyone on their LinkedIn data\n",
"The easiest and fastest way to obtain a copy of your LinkedIn data is to initiate a data download from your Settings & Privacy page:\n",
"\n",
"1. Click the Me icon at the top of your LinkedIn homepage.\n",
"2. Select Settings & Privacy from the dropdown.\n",
"3. Click Data privacy in the left rail.\n",
"4. Under the How LinkedIn uses your data section, click Get a copy of your data.\n",
"5. Select the data that you're looking for and click Request archive.\n",
"\n",
"This project will use RAG (Retrieval Augmented Generation) to ensure our question/answering assistant has high accuracy."
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "ba2779af-84ef-4227-9e9e-6eaf0df87e77",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import glob\n",
"from dotenv import load_dotenv\n",
"import gradio as gr\n",
"\n",
"from langchain.document_loaders import DirectoryLoader, TextLoader\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.schema import Document\n",
"from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n",
"from langchain_chroma import Chroma\n",
"import plotly.graph_objects as go\n",
"from langchain.memory import ConversationBufferMemory\n",
"from langchain.chains import ConversationalRetrievalChain\n",
"from langchain.embeddings import HuggingFaceEmbeddings\n",
"\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.manifold import TSNE\n",
"import numpy as np\n",
"\n",
"# Chat model name and the on-disk directory used for the Chroma vector store\n",
"MODEL = \"gpt-4o-mini\"\n",
"db_name = \"linkedin_db\"\n",
"\n",
"# Load variables from a local .env file (override=True lets it replace values already set)\n",
"load_dotenv(override=True)\n",
"\n",
"# The placeholder fallback below is kept for compatibility, but a missing key is\n",
"# reported here rather than surfacing later as an authentication error.\n",
"if not os.getenv('OPENAI_API_KEY'):\n",
"    print(\"Warning: OPENAI_API_KEY is not set; add it to your .env file before running the cells below.\")\n",
"os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "730711a9-6ffe-4eee-8f48-d6cfb7314905",
"metadata": {},
"outputs": [],
"source": [
"# Read in documents using LangChain's loaders\n",
"# Put the chunks of data into a Vector Store (Chroma) that associates a Vector Embedding with each chunk\n",
"\n",
"# Each sub-folder of linkedin-base/ is one document type. Keep only directories\n",
"# (DirectoryLoader rejects plain files) and sort them so the load order is the\n",
"# same on every machine.\n",
"folders = sorted(path for path in glob.glob(\"linkedin-base/*\") if os.path.isdir(path))\n",
"\n",
"def add_metadata(doc, doc_type):\n",
"    \"\"\"Store `doc_type` under doc.metadata['doc_type'] and return the same document.\"\"\"\n",
"    doc.metadata[\"doc_type\"] = doc_type\n",
"    return doc\n",
"\n",
"text_loader_kwargs = {'encoding': 'utf-8'}\n",
"\n",
"documents = []\n",
"for folder in folders:\n",
"    doc_type = os.path.basename(folder)  # the folder name doubles as the document type\n",
"    loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n",
"    documents.extend(add_metadata(doc, doc_type) for doc in loader.load())\n",
"\n",
"# Split the loaded documents into overlapping chunks; each chunk gets its own embedding\n",
"text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100)\n",
"chunks = text_splitter.split_documents(documents)\n",
"\n",
"# Fail early with a clear message: with no chunks, reading the sample embedding\n",
"# further down cannot succeed.\n",
"if not chunks:\n",
"    raise ValueError(\"No markdown documents found under linkedin-base/ - nothing to index\")\n",
"\n",
"embeddings = OpenAIEmbeddings()\n",
"\n",
"# Drop any collection persisted by an earlier run before rebuilding it\n",
"if os.path.exists(db_name):\n",
"    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()\n",
"\n",
"vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)\n",
"\n",
"# Reach into the underlying Chroma collection (private attribute) to report its size\n",
"collection = vectorstore._collection\n",
"count = collection.count()\n",
"\n",
"# One stored vector is enough to learn the embedding dimensionality\n",
"sample_embedding = collection.get(limit=1, include=[\"embeddings\"])[\"embeddings\"][0]\n",
"dimensions = len(sample_embedding)\n",
"\n",
"\n",
"print(f\"Total number of chunks: {len(chunks)}\")\n",
"print(f\"Document types found: {set(doc.metadata['doc_type'] for doc in documents)}\")\n",
"print(f\"Vectorstore created with {count} documents\")\n",
"print(f\"There are {count:,} vectors with {dimensions:,} dimensions in the vector store\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b98adf5e-d464-4bd2-9bdf-bc5b6770263b",
"metadata": {},
"outputs": [],
"source": [
"# 2D scatter plot of the stored embeddings, reduced with t-SNE\n",
"\n",
"result = collection.get(include=['embeddings', 'documents', 'metadatas'])\n",
"vectors = np.array(result['embeddings'])\n",
"documents = result['documents']  # NOTE: from here on this is the list of chunk texts\n",
"metadatas = result['metadatas']\n",
"doc_types = [metadata['doc_type'] for metadata in metadatas]\n",
"\n",
"# Known document types keep their fixed color; any other folder name falls back\n",
"# to gray instead of raising ValueError from list.index().\n",
"color_by_type = {'connections': 'blue', 'recommendations': 'green', 'profiles': 'red'}\n",
"colors = [color_by_type.get(t, 'gray') for t in doc_types]\n",
"\n",
"n = vectors.shape[0]\n",
"if n < 3:\n",
"    raise ValueError(f\"t-SNE needs at least 3 samples, got {n}\")\n",
"\n",
"# Aim for a perplexity in [5, 30], but cap it at n - 1: scikit-learn requires\n",
"# perplexity < n_samples, which the lower bound of 5 alone violates for n <= 5.\n",
"perp = min(max(5.0, min(30.0, (n - 1) / 3.0)), float(n - 1))\n",
"\n",
"tsne = TSNE(n_components=2, random_state=42, perplexity=perp)\n",
"reduced_vectors = tsne.fit_transform(vectors)\n",
"\n",
"fig = go.Figure(data=[go.Scatter(\n",
"    x=reduced_vectors[:, 0],\n",
"    y=reduced_vectors[:, 1],\n",
"    mode='markers',\n",
"    marker=dict(size=5, color=colors, opacity=0.8),\n",
"    # Hover text: document type plus the first 100 characters of the chunk\n",
"    text=[f\"Type: {t}<br>Text: {d[:100]}...\" for t, d in zip(doc_types, documents)],\n",
"    hoverinfo='text'\n",
")])\n",
"\n",
"fig.update_layout(\n",
"    title='2D Chroma Vector Store Visualization',\n",
"    # `scene` only configures 3D plots; 2D axis titles are set on the layout itself\n",
"    xaxis_title='x',\n",
"    yaxis_title='y',\n",
"    width=800,\n",
"    height=600,\n",
"    margin=dict(r=20, b=10, l=10, t=40)\n",
")\n",
"\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e1418e88-acd5-460a-bf2b-4e6efc88e3dd",
"metadata": {},
"outputs": [],
"source": [
"# 3D scatter plot!\n",
"# Reuses `vectors`, `colors`, `doc_types` and `documents` from the 2D plot cell.\n",
"\n",
"n = vectors.shape[0]\n",
"if n < 3:\n",
"    raise ValueError(f\"t-SNE needs at least 3 samples, got {n}\")\n",
"\n",
"# Aim for a perplexity in [5, 30], but cap it at n - 1: scikit-learn requires\n",
"# perplexity < n_samples, which the lower bound of 5 alone violates for n <= 5.\n",
"perp = min(max(5.0, min(30.0, (n - 1) / 3.0)), float(n - 1))\n",
"\n",
"tsne = TSNE(n_components=3, random_state=42, perplexity=perp)\n",
"reduced_vectors = tsne.fit_transform(vectors)\n",
"\n",
"fig = go.Figure(data=[go.Scatter3d(\n",
"    x=reduced_vectors[:, 0],\n",
"    y=reduced_vectors[:, 1],\n",
"    z=reduced_vectors[:, 2],\n",
"    mode='markers',\n",
"    marker=dict(size=5, color=colors, opacity=0.8),\n",
"    # Hover text: document type plus the first 100 characters of the chunk\n",
"    text=[f\"Type: {t}<br>Text: {d[:100]}...\" for t, d in zip(doc_types, documents)],\n",
"    hoverinfo='text'\n",
")])\n",
"\n",
"fig.update_layout(\n",
"    title='3D Chroma Vector Store Visualization',\n",
"    scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),\n",
"    width=900,\n",
"    height=700,\n",
"    margin=dict(r=20, b=10, l=10, t=40)\n",
")\n",
"\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2136153b-d2f6-4c58-a0e3-78c3a932cf55",
"metadata": {},
"outputs": [],
"source": [
"# Wire together the three LangChain pieces: the LLM, the conversation memory and the retriever\n",
"llm = ChatOpenAI(model_name=MODEL, temperature=0.7)\n",
"\n",
"memory = ConversationBufferMemory(return_messages=True, memory_key='chat_history')\n",
"retriever = vectorstore.as_retriever(search_kwargs={'k': 25})\n",
"conversation_chain = ConversationalRetrievalChain.from_llm(\n",
"    llm=llm,\n",
"    retriever=retriever,\n",
"    memory=memory,\n",
")\n",
"\n",
"def chat(question, history):\n",
"    \"\"\"Send `question` through the retrieval chain and return its answer text.\n",
"\n",
"    `history` is passed in by Gradio but is not used here; the chain was built\n",
"    with its own `memory` object.\n",
"    \"\"\"\n",
"    response = conversation_chain.invoke({'question': question})\n",
"    return response['answer']\n",
"\n",
"with gr.Blocks(theme='gradio/monochrome') as ui:\n",
"    # The HTML lines stay flush-left so the string passed to Markdown is unchanged\n",
"    gr.Markdown(\n",
"        \"\"\"\n",
" <h2 style=\"color: #f5f5f5;\">Linkedin Knowledge Worker</h2>\n",
" <p style=\"color: #f5f5f5;\">Chat with your auto-generated Linkedin knowledge base </p>\n",
" \"\"\",\n",
"        elem_id='title',\n",
"    )\n",
"    gr.ChatInterface(chat, type='messages')\n",
"\n",
"ui.launch(inbrowser=True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}