182 lines
5.1 KiB
Plaintext
182 lines
5.1 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "b577c1be-f7a4-4549-8d27-30cb35407225",
|
|
"metadata": {},
|
|
"source": [
|
|
"# The Price is Right\n",
|
|
"\n",
|
|
"Today we build a more complex solution for estimating prices of goods.\n",
|
|
"\n",
|
|
"1. Day 2.0 notebook: create a RAG database from our 400,000 training data points\n",
|
|
"2. Day 2.1 notebook: visualize in 2D\n",
|
|
"3. Day 2.2 notebook: visualize in 3D\n",
|
|
"4. Day 2.3 notebook: build and test a RAG pipeline with GPT-4o-mini\n",
|
|
"5. Day 2.4 notebook: (a) bring back our Random Forest pricer (b) create an Ensemble pricer that allows contributions from all the pricers\n",
|
|
"\n",
|
|
"Phew! That's a lot to get through in one day!\n",
|
|
"\n",
|
|
"## PLEASE NOTE:\n",
|
|
"\n",
|
|
"We already have a very powerful product estimator with our proprietary, fine-tuned LLM. Most people would be very satisfied with that! The main reason we're adding these extra steps is to deepen your expertise with RAG and with Agentic workflows.\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "993a2a24-1a58-42be-8034-6d116fb8d786",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# imports\n",
|
|
"\n",
|
|
"import os\n",
|
|
"import re\n",
|
|
"import math\n",
|
|
"import json\n",
|
|
"from tqdm import tqdm\n",
|
|
"import random\n",
|
|
"from dotenv import load_dotenv\n",
|
|
"from huggingface_hub import login\n",
|
|
"import numpy as np\n",
|
|
"import pickle\n",
|
|
"from sentence_transformers import SentenceTransformer\n",
|
|
"from datasets import load_dataset\n",
|
|
"import chromadb\n",
|
|
"from sklearn.manifold import TSNE\n",
|
|
"import plotly.graph_objects as go"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "1cc1fe53-612f-4228-aa02-8758f4c2098f",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# It is very fun turning this up to 400_000 and seeing the full dataset visualized,\n",
|
|
"# but it almost crashes my box every time so do that at your own risk!! 10_000 is safe!\n",
|
|
"\n",
|
|
"MAXIMUM_DATAPOINTS = 30_000"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "f4aab95e-d719-4476-b6e7-e248120df25a",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"DB = \"products_vectorstore\"\n",
|
|
"client = chromadb.PersistentClient(path=DB)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "5f95dafd-ab80-464e-ba8a-dec7a2424780",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"collection = client.get_or_create_collection('products')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "525fc313-8a16-4ac0-8c42-6a6d1ba1c9b8",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"CATEGORIES = ['Appliances', 'Automotive', 'Cell_Phones_and_Accessories', 'Electronics','Musical_Instruments', 'Office_Products', 'Tools_and_Home_Improvement', 'Toys_and_Games']\n",
|
|
"COLORS = ['red', 'blue', 'brown', 'orange', 'yellow', 'green' , 'purple', 'cyan']"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "a4cf1c9a-1ced-48d4-974c-3c850905034e",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Prework\n",
|
|
"result = collection.get(include=['embeddings', 'documents', 'metadatas'], limit=MAXIMUM_DATAPOINTS)\n",
|
|
"vectors = np.array(result['embeddings'])\n",
|
|
"documents = result['documents']\n",
|
|
"categories = [metadata['category'] for metadata in result['metadatas']]\n",
|
|
"colors = [COLORS[CATEGORIES.index(c)] for c in categories]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "c54df150-c8d8-4bc3-8877-6759691eeb42",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Let's try a 2D chart\n",
|
|
"\n",
|
|
"tsne = TSNE(n_components=2, random_state=42, n_jobs=-1)\n",
|
|
"reduced_vectors = tsne.fit_transform(vectors)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "e8fb2a63-24c5-4dce-9e63-aa208272f82d",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"\n",
|
|
"# Create the 2D scatter plot\n",
|
|
"fig = go.Figure(data=[go.Scatter(\n",
|
|
" x=reduced_vectors[:, 0],\n",
|
|
" y=reduced_vectors[:, 1],\n",
|
|
" mode='markers',\n",
|
|
" marker=dict(size=3, color=colors, opacity=0.7),\n",
|
|
")])\n",
|
|
"\n",
|
|
"fig.update_layout(\n",
|
|
" title='2D Chroma Vectorstore Visualization',\n",
|
|
" # A 2D go.Scatter trace is drawn on layout.xaxis / layout.yaxis;\n",
|
|
" # layout.scene only configures 3D traces, so axis titles set there are ignored.\n",
|
|
" xaxis_title='x',\n",
|
|
" yaxis_title='y',\n",
|
|
" width=1200,\n",
|
|
" height=800,\n",
|
|
" margin=dict(r=20, b=10, l=10, t=40)\n",
|
|
")\n",
|
|
"\n",
|
|
"fig.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "5e4ae088-3d29-45d3-87a2-fea805fe2c65",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.11.12"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|