Add Week 3 exercise (Cosmus)

This commit is contained in:
Cosmus Mutuku
2025-10-21 14:45:41 +03:00
parent 749bdb9a17
commit 181fa8b8a0

View File

@@ -0,0 +1,282 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "d15d8294-3328-4e07-ad16-8a03e9bbfdb9",
"metadata": {},
"source": []
},
{
"cell_type": "markdown",
"id": "83f28feb",
"metadata": {},
"source": [
"###Synthetic Dataset Generator with LLMs (Anthropic API)Everything runs with your Anthropic API key — no model downloads"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7510bec6",
"metadata": {},
"outputs": [],
"source": [
"# Imports and API setup\n",
"\n",
"import os\n",
"import json\n",
"import requests\n",
"import gradio as gr\n",
"from dotenv import load_dotenv"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5abc2ed3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"API key loaded successfully!\n"
]
}
],
"source": [
"# Load variables from .env file\n",
"load_dotenv()\n",
"\n",
"# Get your Anthropic API key\n",
"API_KEY = os.getenv(\"API_KEY\")\n",
"\n",
"if not API_KEY:\n",
" raise ValueError(\" API_KEY not found. Check your .env file\")\n",
"\n",
"print(\"API key loaded successfully!\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e49ec675",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'data': [{'type': 'model', 'id': 'claude-haiku-4-5-20251001', 'display_name': 'Claude Haiku 4.5', 'created_at': '2025-10-15T00:00:00Z'}, {'type': 'model', 'id': 'claude-sonnet-4-5-20250929', 'display_name': 'Claude Sonnet 4.5', 'created_at': '2025-09-29T00:00:00Z'}, {'type': 'model', 'id': 'claude-opus-4-1-20250805', 'display_name': 'Claude Opus 4.1', 'created_at': '2025-08-05T00:00:00Z'}, {'type': 'model', 'id': 'claude-opus-4-20250514', 'display_name': 'Claude Opus 4', 'created_at': '2025-05-22T00:00:00Z'}, {'type': 'model', 'id': 'claude-sonnet-4-20250514', 'display_name': 'Claude Sonnet 4', 'created_at': '2025-05-22T00:00:00Z'}, {'type': 'model', 'id': 'claude-3-7-sonnet-20250219', 'display_name': 'Claude Sonnet 3.7', 'created_at': '2025-02-24T00:00:00Z'}, {'type': 'model', 'id': 'claude-3-5-haiku-20241022', 'display_name': 'Claude Haiku 3.5', 'created_at': '2024-10-22T00:00:00Z'}, {'type': 'model', 'id': 'claude-3-haiku-20240307', 'display_name': 'Claude Haiku 3', 'created_at': '2024-03-07T00:00:00Z'}], 'has_more': False, 'first_id': 'claude-haiku-4-5-20251001', 'last_id': 'claude-3-haiku-20240307'}\n"
]
}
],
"source": [
"# Anthropic endpoint\n",
"API_URL = \"https://api.anthropic.com/v1/messages\"\n",
"\n",
"#see the models i can have access to\n",
"r = requests.get(\n",
" \"https://api.anthropic.com/v1/models\",\n",
" headers={\n",
" \"x-api-key\": API_KEY,\n",
" \"anthropic-version\": \"2023-06-01\"\n",
" },\n",
")\n",
"print(r.json() if r.ok else r.text)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1b886ff2",
"metadata": {},
"outputs": [],
"source": [
"# Models to compare (variety)\n",
"MODELS = {\n",
" \"Claude 3 Haiku\": \"claude-3-haiku-20240307\", # fast & cheap\n",
" \"Claude Haiku 4.5\": \"claude-haiku-4-5-20251001\",\n",
" \"Claude Sonnet 4.5\": \"claude-sonnet-4-5-20250929\", # fast & cheap\n",
" \"Claude Opus 4.1\": \"claude-opus-4-1-20250805\",\n",
" \"Claude Opus 4\": \"claude-opus-4-20250514\", # fast & cheap\n",
" \"Claude Sonnet 4\": \"claude-sonnet-4-20250514\", # balanced\n",
" \"Claude Sonnet 3.7\": \"claude-3-7-sonnet-20250219\" # powerful (slowest)\n",
"}\n"
]
},
{
"cell_type": "markdown",
"id": "464ddf4c",
"metadata": {},
"source": [
"Synthetic Dataset Generation Function"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7d64bca8",
"metadata": {},
"outputs": [],
"source": [
"# Dataset generator\n",
"\n",
"def generate_dataset(topic, n_records, model_choice):\n",
" prompt = f\"\"\"\n",
"You are a data generator creating synthetic datasets.\n",
"Generate {n_records} records about {topic}.\n",
"Output only a valid JSON array (no explanations or markdown).\n",
"Each record should have 46 fields and look realistic but fake.\n",
"\"\"\"\n",
"\n",
" headers = {\n",
" \"x-api-key\": API_KEY,\n",
" \"content-type\": \"application/json\",\n",
" \"anthropic-version\": \"2023-06-01\",\n",
" }\n",
"\n",
" payload = {\n",
" \"model\": model_choice,\n",
" \"max_tokens\": 500,\n",
" \"temperature\": 0.7,\n",
" \"messages\": [{\"role\": \"user\", \"content\": prompt}],\n",
" }\n",
"\n",
" response = requests.post(API_URL, headers=headers, data=json.dumps(payload))\n",
" result = response.json()\n",
"\n",
" if \"content\" in result and len(result[\"content\"]) > 0:\n",
" return result[\"content\"][0][\"text\"]\n",
" else:\n",
" return f\"Error: {result}\"\n"
]
},
{
"cell_type": "markdown",
"id": "bac01702",
"metadata": {},
"source": [
"Gradio UI"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "857d078d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"* Running on local URL: http://127.0.0.1:7864\n",
"* To create a public link, set `share=True` in `launch()`.\n"
]
},
{
"data": {
"text/html": [
"<div><iframe src=\"http://127.0.0.1:7864/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": []
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# simple Gradio UI for dataset generation\n",
"\n",
"def ui_generate(topic, n_records, model_label):\n",
" model_id = MODELS[model_label]\n",
" n_records = min(int(n_records), 5) # limit for demo purposes\n",
" return generate_dataset(topic, n_records, model_id)\n",
"\n",
"# gradio block\n",
"with gr.Blocks(css=\".gradio-container {max-width: 600px !important; margin: auto;}\") as demo:\n",
" gr.Markdown(\"## Synthetic Dataset Generator using LLM APIs (Claude)\")\n",
"\n",
" with gr.Row():\n",
" topic = gr.Textbox(label=\"Dataset Topic\", value=\"Employee Records\")\n",
" n_records = gr.Number(label=\"Number of Records (Max 5 for demo purposes)\", value=3)\n",
"\n",
" model_choice = gr.Dropdown(\n",
" label=\"Choose Model\",\n",
" choices=list(MODELS.keys()),\n",
" value=\"Claude 3 Haiku\"\n",
" )\n",
"\n",
" btn = gr.Button(\"🚀 Generate\")\n",
"\n",
" # Scrollable, compact output area\n",
" output = gr.Code(label=\"Generated JSON Dataset\", language=\"json\", lines=15, interactive=False)\n",
"\n",
" btn.click(ui_generate, inputs=[topic, n_records, model_choice], outputs=[output])\n",
"\n",
"demo.launch()\n"
]
},
{
"cell_type": "markdown",
"id": "d50f64e1",
"metadata": {},
"source": [
"Save Output to File"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "93f73602",
"metadata": {},
"outputs": [],
"source": [
"def save_dataset_to_file(data, filename=\"synthetic_dataset.json\"):\n",
" try:\n",
" parsed = json.loads(data)\n",
" except:\n",
" print(\"Not valid JSON, saving as plain text instead.\")\n",
" with open(filename, \"w\", encoding=\"utf-8\") as f:\n",
" f.write(data)\n",
" return\n",
"\n",
" with open(filename, \"w\", encoding=\"utf-8\") as f:\n",
" json.dump(parsed, f, indent=2)\n",
" print(f\"Dataset saved as {filename}\")\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}