Merge pull request #768 from CosmusMutuku/main
Week 3 exercise (Cosmus)
This commit is contained in:
282
community-contributions/Cosmus_Week3_exercise.ipynb
Normal file
282
community-contributions/Cosmus_Week3_exercise.ipynb
Normal file
@@ -0,0 +1,282 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "d15d8294-3328-4e07-ad16-8a03e9bbfdb9",
|
||||||
|
"metadata": {},
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "83f28feb",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"###Synthetic Dataset Generator with LLMs (Anthropic API)Everything runs with your Anthropic API key — no model downloads"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "7510bec6",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Imports and API setup\n",
|
||||||
|
"\n",
|
||||||
|
"import os\n",
|
||||||
|
"import json\n",
|
||||||
|
"import requests\n",
|
||||||
|
"import gradio as gr\n",
|
||||||
|
"from dotenv import load_dotenv"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "5abc2ed3",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"API key loaded successfully!\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Load variables from .env file\n",
|
||||||
|
"load_dotenv()\n",
|
||||||
|
"\n",
|
||||||
|
"# Get your Anthropic API key\n",
|
||||||
|
"API_KEY = os.getenv(\"API_KEY\")\n",
|
||||||
|
"\n",
|
||||||
|
"if not API_KEY:\n",
|
||||||
|
" raise ValueError(\" API_KEY not found. Check your .env file\")\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"API key loaded successfully!\")\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "e49ec675",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"{'data': [{'type': 'model', 'id': 'claude-haiku-4-5-20251001', 'display_name': 'Claude Haiku 4.5', 'created_at': '2025-10-15T00:00:00Z'}, {'type': 'model', 'id': 'claude-sonnet-4-5-20250929', 'display_name': 'Claude Sonnet 4.5', 'created_at': '2025-09-29T00:00:00Z'}, {'type': 'model', 'id': 'claude-opus-4-1-20250805', 'display_name': 'Claude Opus 4.1', 'created_at': '2025-08-05T00:00:00Z'}, {'type': 'model', 'id': 'claude-opus-4-20250514', 'display_name': 'Claude Opus 4', 'created_at': '2025-05-22T00:00:00Z'}, {'type': 'model', 'id': 'claude-sonnet-4-20250514', 'display_name': 'Claude Sonnet 4', 'created_at': '2025-05-22T00:00:00Z'}, {'type': 'model', 'id': 'claude-3-7-sonnet-20250219', 'display_name': 'Claude Sonnet 3.7', 'created_at': '2025-02-24T00:00:00Z'}, {'type': 'model', 'id': 'claude-3-5-haiku-20241022', 'display_name': 'Claude Haiku 3.5', 'created_at': '2024-10-22T00:00:00Z'}, {'type': 'model', 'id': 'claude-3-haiku-20240307', 'display_name': 'Claude Haiku 3', 'created_at': '2024-03-07T00:00:00Z'}], 'has_more': False, 'first_id': 'claude-haiku-4-5-20251001', 'last_id': 'claude-3-haiku-20240307'}\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Anthropic endpoint\n",
|
||||||
|
"API_URL = \"https://api.anthropic.com/v1/messages\"\n",
|
||||||
|
"\n",
|
||||||
|
"#see the models i can have access to\n",
|
||||||
|
"r = requests.get(\n",
|
||||||
|
" \"https://api.anthropic.com/v1/models\",\n",
|
||||||
|
" headers={\n",
|
||||||
|
" \"x-api-key\": API_KEY,\n",
|
||||||
|
" \"anthropic-version\": \"2023-06-01\"\n",
|
||||||
|
" },\n",
|
||||||
|
")\n",
|
||||||
|
"print(r.json() if r.ok else r.text)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "1b886ff2",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Models to compare (variety)\n",
|
||||||
|
"MODELS = {\n",
|
||||||
|
" \"Claude 3 Haiku\": \"claude-3-haiku-20240307\", # fast & cheap\n",
|
||||||
|
" \"Claude Haiku 4.5\": \"claude-haiku-4-5-20251001\",\n",
|
||||||
|
" \"Claude Sonnet 4.5\": \"claude-sonnet-4-5-20250929\", # fast & cheap\n",
|
||||||
|
" \"Claude Opus 4.1\": \"claude-opus-4-1-20250805\",\n",
|
||||||
|
" \"Claude Opus 4\": \"claude-opus-4-20250514\", # fast & cheap\n",
|
||||||
|
" \"Claude Sonnet 4\": \"claude-sonnet-4-20250514\", # balanced\n",
|
||||||
|
" \"Claude Sonnet 3.7\": \"claude-3-7-sonnet-20250219\" # powerful (slowest)\n",
|
||||||
|
"}\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "464ddf4c",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Synthetic Dataset Generation Function"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "7d64bca8",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Dataset generator\n",
|
||||||
|
"\n",
|
||||||
|
"def generate_dataset(topic, n_records, model_choice):\n",
|
||||||
|
" prompt = f\"\"\"\n",
|
||||||
|
"You are a data generator creating synthetic datasets.\n",
|
||||||
|
"Generate {n_records} records about {topic}.\n",
|
||||||
|
"Output only a valid JSON array (no explanations or markdown).\n",
|
||||||
|
"Each record should have 4–6 fields and look realistic but fake.\n",
|
||||||
|
"\"\"\"\n",
|
||||||
|
"\n",
|
||||||
|
" headers = {\n",
|
||||||
|
" \"x-api-key\": API_KEY,\n",
|
||||||
|
" \"content-type\": \"application/json\",\n",
|
||||||
|
" \"anthropic-version\": \"2023-06-01\",\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" payload = {\n",
|
||||||
|
" \"model\": model_choice,\n",
|
||||||
|
" \"max_tokens\": 500,\n",
|
||||||
|
" \"temperature\": 0.7,\n",
|
||||||
|
" \"messages\": [{\"role\": \"user\", \"content\": prompt}],\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" response = requests.post(API_URL, headers=headers, data=json.dumps(payload))\n",
|
||||||
|
" result = response.json()\n",
|
||||||
|
"\n",
|
||||||
|
" if \"content\" in result and len(result[\"content\"]) > 0:\n",
|
||||||
|
" return result[\"content\"][0][\"text\"]\n",
|
||||||
|
" else:\n",
|
||||||
|
" return f\"Error: {result}\"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "bac01702",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Gradio UI"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "857d078d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"* Running on local URL: http://127.0.0.1:7864\n",
|
||||||
|
"* To create a public link, set `share=True` in `launch()`.\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div><iframe src=\"http://127.0.0.1:7864/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.HTML object>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": []
|
||||||
|
},
|
||||||
|
"execution_count": 33,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# simple Gradio UI for dataset generation\n",
|
||||||
|
"\n",
|
||||||
|
"def ui_generate(topic, n_records, model_label):\n",
|
||||||
|
" model_id = MODELS[model_label]\n",
|
||||||
|
" n_records = min(int(n_records), 5) # limit for demo purposes\n",
|
||||||
|
" return generate_dataset(topic, n_records, model_id)\n",
|
||||||
|
"\n",
|
||||||
|
"# gradio block\n",
|
||||||
|
"with gr.Blocks(css=\".gradio-container {max-width: 600px !important; margin: auto;}\") as demo:\n",
|
||||||
|
" gr.Markdown(\"## Synthetic Dataset Generator using LLM APIs (Claude)\")\n",
|
||||||
|
"\n",
|
||||||
|
" with gr.Row():\n",
|
||||||
|
" topic = gr.Textbox(label=\"Dataset Topic\", value=\"Employee Records\")\n",
|
||||||
|
" n_records = gr.Number(label=\"Number of Records (Max 5 for demo purposes)\", value=3)\n",
|
||||||
|
"\n",
|
||||||
|
" model_choice = gr.Dropdown(\n",
|
||||||
|
" label=\"Choose Model\",\n",
|
||||||
|
" choices=list(MODELS.keys()),\n",
|
||||||
|
" value=\"Claude 3 Haiku\"\n",
|
||||||
|
" )\n",
|
||||||
|
"\n",
|
||||||
|
" btn = gr.Button(\"🚀 Generate\")\n",
|
||||||
|
"\n",
|
||||||
|
" # Scrollable, compact output area\n",
|
||||||
|
" output = gr.Code(label=\"Generated JSON Dataset\", language=\"json\", lines=15, interactive=False)\n",
|
||||||
|
"\n",
|
||||||
|
" btn.click(ui_generate, inputs=[topic, n_records, model_choice], outputs=[output])\n",
|
||||||
|
"\n",
|
||||||
|
"demo.launch()\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "d50f64e1",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Save Output to File"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "93f73602",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def save_dataset_to_file(data, filename=\"synthetic_dataset.json\"):\n",
|
||||||
|
" try:\n",
|
||||||
|
" parsed = json.loads(data)\n",
|
||||||
|
" except:\n",
|
||||||
|
" print(\"Not valid JSON, saving as plain text instead.\")\n",
|
||||||
|
" with open(filename, \"w\", encoding=\"utf-8\") as f:\n",
|
||||||
|
" f.write(data)\n",
|
||||||
|
" return\n",
|
||||||
|
"\n",
|
||||||
|
" with open(filename, \"w\", encoding=\"utf-8\") as f:\n",
|
||||||
|
" json.dump(parsed, f, indent=2)\n",
|
||||||
|
" print(f\"Dataset saved as {filename}\")\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": ".venv",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.13.5"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user