Files
LLM_Engineering_OLD/community-contributions/Cosmus_Week3_exercise.ipynb
2025-10-21 14:45:41 +03:00

283 lines
8.6 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "markdown",
"id": "d15d8294-3328-4e07-ad16-8a03e9bbfdb9",
"metadata": {},
"source": []
},
{
"cell_type": "markdown",
"id": "83f28feb",
"metadata": {},
"source": [
"### Synthetic Dataset Generator with LLMs (Anthropic API)\n",
"\n",
"Everything runs with your Anthropic API key — no model downloads."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7510bec6",
"metadata": {},
"outputs": [],
"source": [
"# Imports and API setup\n",
"\n",
"import os\n",
"import json\n",
"import requests\n",
"import gradio as gr\n",
"from dotenv import load_dotenv"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5abc2ed3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"API key loaded successfully!\n"
]
}
],
"source": [
"# Pull settings from the local .env file into the process environment\n",
"load_dotenv()\n",
"\n",
"# The Anthropic key is read from the API_KEY environment variable\n",
"API_KEY = os.getenv(\"API_KEY\")\n",
"\n",
"# Stop here when the variable is missing or empty; later cells need the key\n",
"if API_KEY is None or API_KEY == \"\":\n",
"    raise ValueError(\" API_KEY not found. Check your .env file\")\n",
"\n",
"print(\"API key loaded successfully!\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e49ec675",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'data': [{'type': 'model', 'id': 'claude-haiku-4-5-20251001', 'display_name': 'Claude Haiku 4.5', 'created_at': '2025-10-15T00:00:00Z'}, {'type': 'model', 'id': 'claude-sonnet-4-5-20250929', 'display_name': 'Claude Sonnet 4.5', 'created_at': '2025-09-29T00:00:00Z'}, {'type': 'model', 'id': 'claude-opus-4-1-20250805', 'display_name': 'Claude Opus 4.1', 'created_at': '2025-08-05T00:00:00Z'}, {'type': 'model', 'id': 'claude-opus-4-20250514', 'display_name': 'Claude Opus 4', 'created_at': '2025-05-22T00:00:00Z'}, {'type': 'model', 'id': 'claude-sonnet-4-20250514', 'display_name': 'Claude Sonnet 4', 'created_at': '2025-05-22T00:00:00Z'}, {'type': 'model', 'id': 'claude-3-7-sonnet-20250219', 'display_name': 'Claude Sonnet 3.7', 'created_at': '2025-02-24T00:00:00Z'}, {'type': 'model', 'id': 'claude-3-5-haiku-20241022', 'display_name': 'Claude Haiku 3.5', 'created_at': '2024-10-22T00:00:00Z'}, {'type': 'model', 'id': 'claude-3-haiku-20240307', 'display_name': 'Claude Haiku 3', 'created_at': '2024-03-07T00:00:00Z'}], 'has_more': False, 'first_id': 'claude-haiku-4-5-20251001', 'last_id': 'claude-3-haiku-20240307'}\n"
]
}
],
"source": [
"# Anthropic Messages endpoint used by the dataset generator\n",
"API_URL = \"https://api.anthropic.com/v1/messages\"\n",
"\n",
"# List the models this API key can access\n",
"r = requests.get(\n",
"    \"https://api.anthropic.com/v1/models\",\n",
"    headers={\n",
"        \"x-api-key\": API_KEY,\n",
"        \"anthropic-version\": \"2023-06-01\",\n",
"    },\n",
"    # requests has no default timeout; without one a stalled connection hangs the cell\n",
"    timeout=30,\n",
")\n",
"# Show the parsed listing on success, otherwise the raw error body\n",
"print(r.json() if r.ok else r.text)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1b886ff2",
"metadata": {},
"outputs": [],
"source": [
"# Models to compare: UI label -> Anthropic model id.\n",
"# The dropdown lists the labels in this insertion order.\n",
"# Each id appears in the saved /v1/models output shown above.\n",
"MODELS = {\n",
"    \"Claude 3 Haiku\": \"claude-3-haiku-20240307\",  # default selection in the UI\n",
"    \"Claude Haiku 4.5\": \"claude-haiku-4-5-20251001\",\n",
"    \"Claude Sonnet 4.5\": \"claude-sonnet-4-5-20250929\",\n",
"    \"Claude Opus 4.1\": \"claude-opus-4-1-20250805\",\n",
"    \"Claude Opus 4\": \"claude-opus-4-20250514\",\n",
"    \"Claude Sonnet 4\": \"claude-sonnet-4-20250514\",\n",
"    \"Claude Sonnet 3.7\": \"claude-3-7-sonnet-20250219\",\n",
"}\n"
]
},
{
"cell_type": "markdown",
"id": "464ddf4c",
"metadata": {},
"source": [
"Synthetic Dataset Generation Function"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7d64bca8",
"metadata": {},
"outputs": [],
"source": [
"# Dataset generator\n",
"\n",
"def generate_dataset(topic, n_records, model_choice, max_tokens=500, temperature=0.7, timeout=60):\n",
"    \"\"\"Ask an Anthropic model for synthetic records and return its text reply.\n",
"\n",
"    Args:\n",
"        topic: Subject of the dataset, interpolated into the prompt.\n",
"        n_records: Number of records requested in the prompt.\n",
"        model_choice: Anthropic model id sent as the \"model\" field.\n",
"        max_tokens: Value of the \"max_tokens\" request field.\n",
"        temperature: Value of the \"temperature\" request field.\n",
"        timeout: Seconds to wait for the HTTP request before giving up.\n",
"\n",
"    Returns:\n",
"        The \"text\" of the first content block on success; otherwise a string\n",
"        starting with \"Error: \" that describes the failure. Failures are\n",
"        returned rather than raised so the UI can display them.\n",
"    \"\"\"\n",
"    # Ask for a small field count (4-6) so the reply can fit inside max_tokens\n",
"    prompt = f\"\"\"\n",
"You are a data generator creating synthetic datasets.\n",
"Generate {n_records} records about {topic}.\n",
"Output only a valid JSON array (no explanations or markdown).\n",
"Each record should have 4-6 fields and look realistic but fake.\n",
"\"\"\"\n",
"\n",
"    headers = {\n",
"        \"x-api-key\": API_KEY,\n",
"        \"content-type\": \"application/json\",\n",
"        \"anthropic-version\": \"2023-06-01\",\n",
"    }\n",
"\n",
"    payload = {\n",
"        \"model\": model_choice,\n",
"        \"max_tokens\": max_tokens,\n",
"        \"temperature\": temperature,\n",
"        \"messages\": [{\"role\": \"user\", \"content\": prompt}],\n",
"    }\n",
"\n",
"    try:\n",
"        # json= lets requests serialize the payload; timeout prevents an indefinite hang\n",
"        response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)\n",
"    except requests.RequestException as exc:\n",
"        return f\"Error: request failed: {exc}\"\n",
"\n",
"    try:\n",
"        result = response.json()\n",
"    except ValueError:\n",
"        # The body was not JSON (for example an HTML error page from a proxy)\n",
"        return f\"Error: HTTP {response.status_code}: {response.text}\"\n",
"\n",
"    content = result.get(\"content\") if isinstance(result, dict) else None\n",
"    if content and isinstance(content[0], dict) and \"text\" in content[0]:\n",
"        return content[0][\"text\"]\n",
"    return f\"Error: {result}\"\n"
]
},
{
"cell_type": "markdown",
"id": "bac01702",
"metadata": {},
"source": [
"Gradio UI"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "857d078d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"* Running on local URL: http://127.0.0.1:7864\n",
"* To create a public link, set `share=True` in `launch()`.\n"
]
},
{
"data": {
"text/html": [
"<div><iframe src=\"http://127.0.0.1:7864/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": []
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# simple Gradio UI for dataset generation\n",
"\n",
"def ui_generate(topic, n_records, model_label):\n",
"    \"\"\"Gradio callback: resolve the model label and request a small dataset.\n",
"\n",
"    Args:\n",
"        topic: Dataset topic typed by the user.\n",
"        n_records: Requested record count; clamped to the range 1-5.\n",
"        model_label: A key of MODELS chosen in the dropdown.\n",
"\n",
"    Returns:\n",
"        Whatever generate_dataset returns, or an \"Error: ...\" string when the\n",
"        inputs cannot be used.\n",
"    \"\"\"\n",
"    if model_label not in MODELS:\n",
"        return f\"Error: unknown model {model_label!r}\"\n",
"\n",
"    try:\n",
"        requested = int(n_records)\n",
"    except (TypeError, ValueError, OverflowError):\n",
"        # int() rejects None, NaN and infinity; an emptied number field may arrive as None\n",
"        return \"Error: number of records must be a whole number\"\n",
"\n",
"    n_records = max(1, min(requested, 5))  # limit for demo purposes, never below 1\n",
"    return generate_dataset(topic, n_records, MODELS[model_label])\n",
"\n",
"# Gradio layout: inputs on top, generated JSON underneath\n",
"PAGE_CSS = \".gradio-container {max-width: 600px !important; margin: auto;}\"\n",
"\n",
"with gr.Blocks(css=PAGE_CSS) as demo:\n",
"    gr.Markdown(\"## Synthetic Dataset Generator using LLM APIs (Claude)\")\n",
"\n",
"    # Topic and record count share one row\n",
"    with gr.Row():\n",
"        topic = gr.Textbox(value=\"Employee Records\", label=\"Dataset Topic\")\n",
"        n_records = gr.Number(value=3, label=\"Number of Records (Max 5 for demo purposes)\")\n",
"\n",
"    # Labels come from MODELS, in its insertion order\n",
"    model_choice = gr.Dropdown(\n",
"        choices=list(MODELS),\n",
"        value=\"Claude 3 Haiku\",\n",
"        label=\"Choose Model\",\n",
"    )\n",
"\n",
"    btn = gr.Button(\"🚀 Generate\")\n",
"\n",
"    # Read-only code viewer for the model's reply\n",
"    output = gr.Code(\n",
"        label=\"Generated JSON Dataset\",\n",
"        language=\"json\",\n",
"        lines=15,\n",
"        interactive=False,\n",
"    )\n",
"\n",
"    # Clicking the button sends the three inputs to ui_generate and shows its return value\n",
"    btn.click(fn=ui_generate, inputs=[topic, n_records, model_choice], outputs=[output])\n",
"\n",
"demo.launch()\n"
]
},
{
"cell_type": "markdown",
"id": "d50f64e1",
"metadata": {},
"source": [
"Save Output to File"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "93f73602",
"metadata": {},
"outputs": [],
"source": [
"def save_dataset_to_file(data, filename=\"synthetic_dataset.json\"):\n",
"    \"\"\"Write a generated dataset to `filename`, overwriting any existing file.\n",
"\n",
"    Args:\n",
"        data: JSON text to save. An already-parsed list or dict is accepted too.\n",
"        filename: Destination path.\n",
"\n",
"    Valid JSON is re-serialized with a 2-space indent. Anything that does not\n",
"    parse as JSON is written out as plain text instead.\n",
"    \"\"\"\n",
"    if isinstance(data, (list, dict)):\n",
"        # Already parsed; dump it directly\n",
"        parsed = data\n",
"    else:\n",
"        try:\n",
"            parsed = json.loads(data)\n",
"        except (TypeError, ValueError):\n",
"            # json.JSONDecodeError is a ValueError; TypeError covers non-text input.\n",
"            # A bare except would also swallow KeyboardInterrupt and SystemExit.\n",
"            print(\"Not valid JSON, saving as plain text instead.\")\n",
"            text = data if isinstance(data, str) else str(data)\n",
"            with open(filename, \"w\", encoding=\"utf-8\") as f:\n",
"                f.write(text)\n",
"            return\n",
"\n",
"    with open(filename, \"w\", encoding=\"utf-8\") as f:\n",
"        json.dump(parsed, f, indent=2)\n",
"    print(f\"Dataset saved as {filename}\")\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}