diff --git a/community-contributions/Cosmus_Week3_exercise.ipynb b/community-contributions/Cosmus_Week3_exercise.ipynb new file mode 100644 index 0000000..04dd692 --- /dev/null +++ b/community-contributions/Cosmus_Week3_exercise.ipynb @@ -0,0 +1,282 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d15d8294-3328-4e07-ad16-8a03e9bbfdb9", + "metadata": {}, + "source": [] + }, + { + "cell_type": "markdown", + "id": "83f28feb", + "metadata": {}, + "source": [ + "###Synthetic Dataset Generator with LLMs (Anthropic API)Everything runs with your Anthropic API key — no model downloads" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7510bec6", + "metadata": {}, + "outputs": [], + "source": [ + "# Imports and API setup\n", + "\n", + "import os\n", + "import json\n", + "import requests\n", + "import gradio as gr\n", + "from dotenv import load_dotenv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5abc2ed3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "API key loaded successfully!\n" + ] + } + ], + "source": [ + "# Load variables from .env file\n", + "load_dotenv()\n", + "\n", + "# Get your Anthropic API key\n", + "API_KEY = os.getenv(\"API_KEY\")\n", + "\n", + "if not API_KEY:\n", + " raise ValueError(\" API_KEY not found. Check your .env file\")\n", + "\n", + "print(\"API key loaded successfully!\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e49ec675", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'data': [{'type': 'model', 'id': 'claude-haiku-4-5-20251001', 'display_name': 'Claude Haiku 4.5', 'created_at': '2025-10-15T00:00:00Z'}, {'type': 'model', 'id': 'claude-sonnet-4-5-20250929', 'display_name': 'Claude Sonnet 4.5', 'created_at': '2025-09-29T00:00:00Z'}, {'type': 'model', 'id': 'claude-opus-4-1-20250805', 'display_name': 'Claude Opus 4.1', 'created_at': '2025-08-05T00:00:00Z'}, {'type': 'model', 'id': 'claude-opus-4-20250514', 'display_name': 'Claude Opus 4', 'created_at': '2025-05-22T00:00:00Z'}, {'type': 'model', 'id': 'claude-sonnet-4-20250514', 'display_name': 'Claude Sonnet 4', 'created_at': '2025-05-22T00:00:00Z'}, {'type': 'model', 'id': 'claude-3-7-sonnet-20250219', 'display_name': 'Claude Sonnet 3.7', 'created_at': '2025-02-24T00:00:00Z'}, {'type': 'model', 'id': 'claude-3-5-haiku-20241022', 'display_name': 'Claude Haiku 3.5', 'created_at': '2024-10-22T00:00:00Z'}, {'type': 'model', 'id': 'claude-3-haiku-20240307', 'display_name': 'Claude Haiku 3', 'created_at': '2024-03-07T00:00:00Z'}], 'has_more': False, 'first_id': 'claude-haiku-4-5-20251001', 'last_id': 'claude-3-haiku-20240307'}\n" + ] + } + ], + "source": [ + "# Anthropic endpoint\n", + "API_URL = \"https://api.anthropic.com/v1/messages\"\n", + "\n", + "#see the models i can have access to\n", + "r = requests.get(\n", + " \"https://api.anthropic.com/v1/models\",\n", + " headers={\n", + " \"x-api-key\": API_KEY,\n", + " \"anthropic-version\": \"2023-06-01\"\n", + " },\n", + ")\n", + "print(r.json() if r.ok else r.text)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b886ff2", + "metadata": {}, + "outputs": [], + "source": [ + "# Models to compare (variety)\n", + "MODELS = {\n", + " \"Claude 3 Haiku\": \"claude-3-haiku-20240307\", # fast & cheap\n", + " \"Claude Haiku 4.5\": \"claude-haiku-4-5-20251001\",\n", + " \"Claude Sonnet 4.5\": \"claude-sonnet-4-5-20250929\", # fast & cheap\n", + " \"Claude Opus 4.1\": \"claude-opus-4-1-20250805\",\n", + " \"Claude Opus 4\": \"claude-opus-4-20250514\", # fast & cheap\n", + " \"Claude Sonnet 4\": \"claude-sonnet-4-20250514\", # balanced\n", + " \"Claude Sonnet 3.7\": \"claude-3-7-sonnet-20250219\" # powerful (slowest)\n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "id": "464ddf4c", + "metadata": {}, + "source": [ + "Synthetic Dataset Generation Function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d64bca8", + "metadata": {}, + "outputs": [], + "source": [ + "# Dataset generator\n", + "\n", + "def generate_dataset(topic, n_records, model_choice):\n", + " prompt = f\"\"\"\n", + "You are a data generator creating synthetic datasets.\n", + "Generate {n_records} records about {topic}.\n", + "Output only a valid JSON array (no explanations or markdown).\n", + "Each record should have 4–6 fields and look realistic but fake.\n", + "\"\"\"\n", + "\n", + " headers = {\n", + " \"x-api-key\": API_KEY,\n", + " \"content-type\": \"application/json\",\n", + " \"anthropic-version\": \"2023-06-01\",\n", + " }\n", + "\n", + " payload = {\n", + " \"model\": model_choice,\n", + " \"max_tokens\": 500,\n", + " \"temperature\": 0.7,\n", + " \"messages\": [{\"role\": \"user\", \"content\": prompt}],\n", + " }\n", + "\n", + " response = requests.post(API_URL, headers=headers, data=json.dumps(payload))\n", + " result = response.json()\n", + "\n", + " if \"content\" in result and len(result[\"content\"]) > 0:\n", + " return result[\"content\"][0][\"text\"]\n", + " else:\n", + " return f\"Error: {result}\"\n" + ] + }, + { + "cell_type": "markdown", + "id": "bac01702", + "metadata": {}, + "source": [ + "Gradio UI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "857d078d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "* Running on local URL: http://127.0.0.1:7864\n", + "* To create a public link, set `share=True` in `launch()`.\n" + ] + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# simple Gradio UI for dataset generation\n", + "\n", + "def ui_generate(topic, n_records, model_label):\n", + " model_id = MODELS[model_label]\n", + " n_records = min(int(n_records), 5) # limit for demo purposes\n", + " return generate_dataset(topic, n_records, model_id)\n", + "\n", + "# gradio block\n", + "with gr.Blocks(css=\".gradio-container {max-width: 600px !important; margin: auto;}\") as demo:\n", + " gr.Markdown(\"## Synthetic Dataset Generator using LLM APIs (Claude)\")\n", + "\n", + " with gr.Row():\n", + " topic = gr.Textbox(label=\"Dataset Topic\", value=\"Employee Records\")\n", + " n_records = gr.Number(label=\"Number of Records (Max 5 for demo purposes)\", value=3)\n", + "\n", + " model_choice = gr.Dropdown(\n", + " label=\"Choose Model\",\n", + " choices=list(MODELS.keys()),\n", + " value=\"Claude 3 Haiku\"\n", + " )\n", + "\n", + " btn = gr.Button(\"🚀 Generate\")\n", + "\n", + " # Scrollable, compact output area\n", + " output = gr.Code(label=\"Generated JSON Dataset\", language=\"json\", lines=15, interactive=False)\n", + "\n", + " btn.click(ui_generate, inputs=[topic, n_records, model_choice], outputs=[output])\n", + "\n", + "demo.launch()\n" + ] + }, + { + "cell_type": "markdown", + "id": "d50f64e1", + "metadata": {}, + "source": [ + "Save Output to File" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93f73602", + "metadata": {}, + "outputs": [], + "source": [ + "def save_dataset_to_file(data, filename=\"synthetic_dataset.json\"):\n", + " try:\n", + " parsed = json.loads(data)\n", + " except:\n", + " print(\"Not valid JSON, saving as plain text instead.\")\n", + " with open(filename, \"w\", encoding=\"utf-8\") as f:\n", + " f.write(data)\n", + " return\n", + "\n", + " with open(filename, \"w\", encoding=\"utf-8\") as f:\n", + " json.dump(parsed, f, indent=2)\n", + " print(f\"Dataset saved as {filename}\")\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}