LLM_Engineering_OLD/community-contributions/Cosmus_Week3_exercise.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "d15d8294-3328-4e07-ad16-8a03e9bbfdb9",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "83f28feb",
   "metadata": {},
   "source": [
    "###Synthetic Dataset Generator with LLMs (Anthropic API)Everything runs with your Anthropic API key — no model downloads"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7510bec6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imports and API setup\n",
    "\n",
    "import os\n",
    "import json\n",
    "import requests\n",
    "import gradio as gr\n",
    "from dotenv import load_dotenv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5abc2ed3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "API key loaded successfully!\n"
     ]
    }
   ],
   "source": [
    "# Load variables from .env file\n",
    "load_dotenv()\n",
    "\n",
    "# Get your Anthropic API key\n",
    "API_KEY = os.getenv(\"API_KEY\")\n",
    "\n",
    "if not API_KEY:\n",
    "    raise ValueError(\" API_KEY not found. Check your .env file\")\n",
    "\n",
    "print(\"API key loaded successfully!\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e49ec675",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'data': [{'type': 'model', 'id': 'claude-haiku-4-5-20251001', 'display_name': 'Claude Haiku 4.5', 'created_at': '2025-10-15T00:00:00Z'}, {'type': 'model', 'id': 'claude-sonnet-4-5-20250929', 'display_name': 'Claude Sonnet 4.5', 'created_at': '2025-09-29T00:00:00Z'}, {'type': 'model', 'id': 'claude-opus-4-1-20250805', 'display_name': 'Claude Opus 4.1', 'created_at': '2025-08-05T00:00:00Z'}, {'type': 'model', 'id': 'claude-opus-4-20250514', 'display_name': 'Claude Opus 4', 'created_at': '2025-05-22T00:00:00Z'}, {'type': 'model', 'id': 'claude-sonnet-4-20250514', 'display_name': 'Claude Sonnet 4', 'created_at': '2025-05-22T00:00:00Z'}, {'type': 'model', 'id': 'claude-3-7-sonnet-20250219', 'display_name': 'Claude Sonnet 3.7', 'created_at': '2025-02-24T00:00:00Z'}, {'type': 'model', 'id': 'claude-3-5-haiku-20241022', 'display_name': 'Claude Haiku 3.5', 'created_at': '2024-10-22T00:00:00Z'}, {'type': 'model', 'id': 'claude-3-haiku-20240307', 'display_name': 'Claude Haiku 3', 'created_at': '2024-03-07T00:00:00Z'}], 'has_more': False, 'first_id': 'claude-haiku-4-5-20251001', 'last_id': 'claude-3-haiku-20240307'}\n"
     ]
    }
   ],
   "source": [
    "# Anthropic endpoint\n",
    "API_URL = \"https://api.anthropic.com/v1/messages\"\n",
    "\n",
    "#see the models i can have access to\n",
    "r = requests.get(\n",
    "    \"https://api.anthropic.com/v1/models\",\n",
    "    headers={\n",
    "        \"x-api-key\": API_KEY,\n",
    "        \"anthropic-version\": \"2023-06-01\"\n",
    "    },\n",
    ")\n",
    "print(r.json() if r.ok else r.text)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1b886ff2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Models to compare (variety)\n",
    "MODELS = {\n",
    "    \"Claude 3 Haiku\": \"claude-3-haiku-20240307\",     # fast & cheap\n",
    "    \"Claude Haiku 4.5\": \"claude-haiku-4-5-20251001\",\n",
    "    \"Claude Sonnet 4.5\": \"claude-sonnet-4-5-20250929\",     # fast & cheap\n",
    "    \"Claude Opus 4.1\": \"claude-opus-4-1-20250805\",\n",
    "    \"Claude Opus 4\": \"claude-opus-4-20250514\",     # fast & cheap\n",
    "    \"Claude Sonnet 4\": \"claude-sonnet-4-20250514\",   # balanced\n",
    "    \"Claude Sonnet 3.7\": \"claude-3-7-sonnet-20250219\"        # powerful (slowest)\n",
    "}\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "464ddf4c",
   "metadata": {},
   "source": [
    "Synthetic Dataset Generation Function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7d64bca8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dataset generator\n",
    "\n",
    "def generate_dataset(topic, n_records, model_choice):\n",
    "    prompt = f\"\"\"\n",
    "You are a data generator creating synthetic datasets.\n",
    "Generate {n_records} records about {topic}.\n",
    "Output only a valid JSON array (no explanations or markdown).\n",
    "Each record should have 4–6 fields and look realistic but fake.\n",
    "\"\"\"\n",
    "\n",
    "    headers = {\n",
    "        \"x-api-key\": API_KEY,\n",
    "        \"content-type\": \"application/json\",\n",
    "        \"anthropic-version\": \"2023-06-01\",\n",
    "    }\n",
    "\n",
    "    payload = {\n",
    "        \"model\": model_choice,\n",
    "        \"max_tokens\": 500,\n",
    "        \"temperature\": 0.7,\n",
    "        \"messages\": [{\"role\": \"user\", \"content\": prompt}],\n",
    "    }\n",
    "\n",
    "    response = requests.post(API_URL, headers=headers, data=json.dumps(payload))\n",
    "    result = response.json()\n",
    "\n",
    "    if \"content\" in result and len(result[\"content\"]) > 0:\n",
    "        return result[\"content\"][0][\"text\"]\n",
    "    else:\n",
    "        return f\"Error: {result}\"\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bac01702",
   "metadata": {},
   "source": [
    "Gradio UI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "857d078d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "* Running on local URL:  http://127.0.0.1:7864\n",
      "* To create a public link, set `share=True` in `launch()`.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div><iframe src=\"http://127.0.0.1:7864/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": []
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#  simple Gradio UI for dataset generation\n",
    "\n",
    "def ui_generate(topic, n_records, model_label):\n",
    "    model_id = MODELS[model_label]\n",
    "    n_records = min(int(n_records), 5)  # limit for demo purposes\n",
    "    return generate_dataset(topic, n_records, model_id)\n",
    "\n",
    "# gradio block\n",
    "with gr.Blocks(css=\".gradio-container {max-width: 600px !important; margin: auto;}\") as demo:\n",
    "    gr.Markdown(\"## Synthetic Dataset Generator using LLM APIs (Claude)\")\n",
    "\n",
    "    with gr.Row():\n",
    "        topic = gr.Textbox(label=\"Dataset Topic\", value=\"Employee Records\")\n",
    "        n_records = gr.Number(label=\"Number of Records (Max 5 for demo purposes)\", value=3)\n",
    "\n",
    "    model_choice = gr.Dropdown(\n",
    "        label=\"Choose Model\",\n",
    "        choices=list(MODELS.keys()),\n",
    "        value=\"Claude 3 Haiku\"\n",
    "    )\n",
    "\n",
    "    btn = gr.Button(\"🚀 Generate\")\n",
    "\n",
    "    # Scrollable, compact output area\n",
    "    output = gr.Code(label=\"Generated JSON Dataset\", language=\"json\", lines=15, interactive=False)\n",
    "\n",
    "    btn.click(ui_generate, inputs=[topic, n_records, model_choice], outputs=[output])\n",
    "\n",
    "demo.launch()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d50f64e1",
   "metadata": {},
   "source": [
    "Save Output to File"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "93f73602",
   "metadata": {},
   "outputs": [],
   "source": [
    "def save_dataset_to_file(data, filename=\"synthetic_dataset.json\"):\n",
    "    try:\n",
    "        parsed = json.loads(data)\n",
    "    except:\n",
    "        print(\"Not valid JSON, saving as plain text instead.\")\n",
    "        with open(filename, \"w\", encoding=\"utf-8\") as f:\n",
    "            f.write(data)\n",
    "        return\n",
    "\n",
    "    with open(filename, \"w\", encoding=\"utf-8\") as f:\n",
    "        json.dump(parsed, f, indent=2)\n",
    "    print(f\"Dataset saved as {filename}\")\n",
    "\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}