{ "cells": [ { "cell_type": "markdown", "id": "d15d8294-3328-4e07-ad16-8a03e9bbfdb9", "metadata": {}, "source": [] }, { "cell_type": "markdown", "id": "83f28feb", "metadata": {}, "source": [ "###Synthetic Dataset Generator with LLMs (Anthropic API)Everything runs with your Anthropic API key — no model downloads" ] }, { "cell_type": "code", "execution_count": null, "id": "7510bec6", "metadata": {}, "outputs": [], "source": [ "# Imports and API setup\n", "\n", "import os\n", "import json\n", "import requests\n", "import gradio as gr\n", "from dotenv import load_dotenv" ] }, { "cell_type": "code", "execution_count": null, "id": "5abc2ed3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "API key loaded successfully!\n" ] } ], "source": [ "# Load variables from .env file\n", "load_dotenv()\n", "\n", "# Get your Anthropic API key\n", "API_KEY = os.getenv(\"API_KEY\")\n", "\n", "if not API_KEY:\n", " raise ValueError(\" API_KEY not found. Check your .env file\")\n", "\n", "print(\"API key loaded successfully!\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "e49ec675", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'data': [{'type': 'model', 'id': 'claude-haiku-4-5-20251001', 'display_name': 'Claude Haiku 4.5', 'created_at': '2025-10-15T00:00:00Z'}, {'type': 'model', 'id': 'claude-sonnet-4-5-20250929', 'display_name': 'Claude Sonnet 4.5', 'created_at': '2025-09-29T00:00:00Z'}, {'type': 'model', 'id': 'claude-opus-4-1-20250805', 'display_name': 'Claude Opus 4.1', 'created_at': '2025-08-05T00:00:00Z'}, {'type': 'model', 'id': 'claude-opus-4-20250514', 'display_name': 'Claude Opus 4', 'created_at': '2025-05-22T00:00:00Z'}, {'type': 'model', 'id': 'claude-sonnet-4-20250514', 'display_name': 'Claude Sonnet 4', 'created_at': '2025-05-22T00:00:00Z'}, {'type': 'model', 'id': 'claude-3-7-sonnet-20250219', 'display_name': 'Claude Sonnet 3.7', 'created_at': '2025-02-24T00:00:00Z'}, {'type': 'model', 'id': 'claude-3-5-haiku-20241022', 'display_name': 'Claude Haiku 3.5', 'created_at': '2024-10-22T00:00:00Z'}, {'type': 'model', 'id': 'claude-3-haiku-20240307', 'display_name': 'Claude Haiku 3', 'created_at': '2024-03-07T00:00:00Z'}], 'has_more': False, 'first_id': 'claude-haiku-4-5-20251001', 'last_id': 'claude-3-haiku-20240307'}\n" ] } ], "source": [ "# Anthropic endpoint\n", "API_URL = \"https://api.anthropic.com/v1/messages\"\n", "\n", "#see the models i can have access to\n", "r = requests.get(\n", " \"https://api.anthropic.com/v1/models\",\n", " headers={\n", " \"x-api-key\": API_KEY,\n", " \"anthropic-version\": \"2023-06-01\"\n", " },\n", ")\n", "print(r.json() if r.ok else r.text)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "1b886ff2", "metadata": {}, "outputs": [], "source": [ "# Models to compare (variety)\n", "MODELS = {\n", " \"Claude 3 Haiku\": \"claude-3-haiku-20240307\", # fast & cheap\n", " \"Claude Haiku 4.5\": \"claude-haiku-4-5-20251001\",\n", " \"Claude Sonnet 4.5\": \"claude-sonnet-4-5-20250929\", # fast & cheap\n", " \"Claude Opus 4.1\": \"claude-opus-4-1-20250805\",\n", " \"Claude Opus 4\": \"claude-opus-4-20250514\", # fast & cheap\n", " \"Claude Sonnet 4\": \"claude-sonnet-4-20250514\", # balanced\n", " \"Claude Sonnet 3.7\": \"claude-3-7-sonnet-20250219\" # powerful (slowest)\n", "}\n" ] }, { "cell_type": "markdown", "id": "464ddf4c", "metadata": {}, "source": [ "Synthetic Dataset Generation Function" ] }, { "cell_type": "code", "execution_count": null, "id": "7d64bca8", "metadata": {}, "outputs": [], "source": [ "# Dataset generator\n", "\n", "def generate_dataset(topic, n_records, model_choice):\n", " prompt = f\"\"\"\n", "You are a data generator creating synthetic datasets.\n", "Generate {n_records} records about {topic}.\n", "Output only a valid JSON array (no explanations or markdown).\n", "Each record should have 4–6 fields and look realistic but fake.\n", "\"\"\"\n", "\n", " headers = {\n", " \"x-api-key\": API_KEY,\n", " \"content-type\": \"application/json\",\n", " \"anthropic-version\": \"2023-06-01\",\n", " }\n", "\n", " payload = {\n", " \"model\": model_choice,\n", " \"max_tokens\": 500,\n", " \"temperature\": 0.7,\n", " \"messages\": [{\"role\": \"user\", \"content\": prompt}],\n", " }\n", "\n", " response = requests.post(API_URL, headers=headers, data=json.dumps(payload))\n", " result = response.json()\n", "\n", " if \"content\" in result and len(result[\"content\"]) > 0:\n", " return result[\"content\"][0][\"text\"]\n", " else:\n", " return f\"Error: {result}\"\n" ] }, { "cell_type": "markdown", "id": "bac01702", "metadata": {}, "source": [ "Gradio UI" ] }, { "cell_type": "code", "execution_count": null, "id": "857d078d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "* Running on local URL: http://127.0.0.1:7864\n", "* To create a public link, set `share=True` in `launch()`.\n" ] }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# simple Gradio UI for dataset generation\n", "\n", "def ui_generate(topic, n_records, model_label):\n", " model_id = MODELS[model_label]\n", " n_records = min(int(n_records), 5) # limit for demo purposes\n", " return generate_dataset(topic, n_records, model_id)\n", "\n", "# gradio block\n", "with gr.Blocks(css=\".gradio-container {max-width: 600px !important; margin: auto;}\") as demo:\n", " gr.Markdown(\"## Synthetic Dataset Generator using LLM APIs (Claude)\")\n", "\n", " with gr.Row():\n", " topic = gr.Textbox(label=\"Dataset Topic\", value=\"Employee Records\")\n", " n_records = gr.Number(label=\"Number of Records (Max 5 for demo purposes)\", value=3)\n", "\n", " model_choice = gr.Dropdown(\n", " label=\"Choose Model\",\n", " choices=list(MODELS.keys()),\n", " value=\"Claude 3 Haiku\"\n", " )\n", "\n", " btn = gr.Button(\"🚀 Generate\")\n", "\n", " # Scrollable, compact output area\n", " output = gr.Code(label=\"Generated JSON Dataset\", language=\"json\", lines=15, interactive=False)\n", "\n", " btn.click(ui_generate, inputs=[topic, n_records, model_choice], outputs=[output])\n", "\n", "demo.launch()\n" ] }, { "cell_type": "markdown", "id": "d50f64e1", "metadata": {}, "source": [ "Save Output to File" ] }, { "cell_type": "code", "execution_count": null, "id": "93f73602", "metadata": {}, "outputs": [], "source": [ "def save_dataset_to_file(data, filename=\"synthetic_dataset.json\"):\n", " try:\n", " parsed = json.loads(data)\n", " except:\n", " print(\"Not valid JSON, saving as plain text instead.\")\n", " with open(filename, \"w\", encoding=\"utf-8\") as f:\n", " f.write(data)\n", " return\n", "\n", " with open(filename, \"w\", encoding=\"utf-8\") as f:\n", " json.dump(parsed, f, indent=2)\n", " print(f\"Dataset saved as {filename}\")\n", "\n" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.5" } }, "nbformat": 4, "nbformat_minor": 5 }