Merge pull request #808 from muthash/stephen/week3-exercise-2

[Bootcamp] Week 3 Synthetic Data Generator (Stephen)
2025-10-23 09:06:59 -04:00
parent 9eb132cede 6226c3d10d
commit 6c15df1f46
1 changed files with 216 additions and 0 deletions
--- a/week3/community-contributions/week3_exercise_solution-Stephen.ipynb
+++ b/week3/community-contributions/week3_exercise_solution-Stephen.ipynb
@@ -0,0 +1,216 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "c58e628f",
   "metadata": {},
   "source": [
    "\n",
    "## **Week 3 task.**\n",
    "Create your own tool that generates synthetic (test) data. The user describes the kind of data they need (a dataset, products, job postings, etc.) and the tool dreams up a variety of data samples.\n",
    "\n",
    "https://colab.research.google.com/drive/13wR4Blz3Ot_x0GOpflmvvFffm5XU3Kct?usp=sharing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "0ddde9ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports\n",
    "\n",
    "import json\n",
    "import os\n",
    "\n",
    "import gradio as gr\n",
    "import requests\n",
    "import torch\n",
    "from dotenv import load_dotenv\n",
    "from huggingface_hub import login\n",
    "from IPython.display import Markdown, display, update_display\n",
    "from openai import APIError\n",
    "from openai import OpenAI\n",
    "from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cbbc6cc8",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Configuration: load credentials from .env, choose model names, build the API clients.\n",
    "load_dotenv(override=True)\n",
    "\n",
    "openai_api_key = os.getenv('OPENAI_API_KEY')\n",
    "# Placeholder key passed to the local OpenAI-compatible endpoint at llama_url below.\n",
    "llama_api_key = \"ollama\"\n",
    "\n",
    "# Optional Hugging Face login (relies on a `userdata` helper that is not defined in this notebook):\n",
    "# hf_token = userdata.get('HF_TOKEN')\n",
    "# login(hf_token, add_to_git_credential=True)\n",
    "\n",
    "\n",
    "if openai_api_key:\n",
    "    print(f\"OpenAI API Key exists and begins {openai_api_key[:8]}\")\n",
    "else:\n",
    "    print(\"OpenAI API Key not set\")\n",
    "\n",
    "if llama_api_key:\n",
    "    print(\"LLama API Key exists\")\n",
    "else:\n",
    "    print(\"LLama API Key not set\")\n",
    "\n",
    "GPT_MODEL = \"gpt-4.1-mini\"\n",
    "LLAMA_MODEL = \"llama3.1\"\n",
    "\n",
    "# OpenAI() raises when no API key is available. Leave the client as None in that\n",
    "# case so the local Llama path still works and generate_with_gpt can report the\n",
    "# missing key through its `if not openai` guard instead of this cell crashing.\n",
    "openai = OpenAI() if openai_api_key else None\n",
    "\n",
    "llama_url = \"http://localhost:11434/v1\"\n",
    "llama = OpenAI(api_key=llama_api_key, base_url=llama_url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "ef083ec6",
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_with_llama(user_prompt: str, num_samples: int = 5):\n",
    "    \"\"\"\n",
    "    Generates synthetic data using the local Llama model.\n",
    "\n",
    "    Sends the request through the module-level `llama` client (the\n",
    "    OpenAI-compatible endpoint configured at `llama_url`) using LLAMA_MODEL.\n",
    "\n",
    "    Args:\n",
    "        user_prompt: Description of the data to generate.\n",
    "        num_samples: Number of objects the model is asked to produce.\n",
    "\n",
    "    Returns:\n",
    "        A JSON string: the model's reply on success, otherwise an object of\n",
    "        the form {\"error\": \"...\"} describing what went wrong.\n",
    "    \"\"\"\n",
    "    if not llama:\n",
    "        return json.dumps({\"error\": \"Llama client not initialized. Please check your local server.\"}, indent=2)\n",
    "\n",
    "    try:\n",
    "        response = llama.chat.completions.create(\n",
    "            model=LLAMA_MODEL,\n",
    "            messages=[\n",
    "                # JSON mode returns a single JSON object, so ask for the array under a fixed key\n",
    "                {\"role\": \"system\", \"content\": f\"You are a data generation assistant. Generate a JSON object with a single key 'data' whose value is an array of exactly {num_samples} objects based on the user's request. The output must be valid JSON only, without any other text or formatting.\"},\n",
    "                {\"role\": \"user\", \"content\": user_prompt}\n",
    "            ],\n",
    "            response_format={\"type\": \"json_object\"}\n",
    "        )\n",
    "\n",
    "        return response.choices[0].message.content\n",
    "    except APIError as e:\n",
    "        # e.message is always a string; e.body is None for connection/timeout errors\n",
    "        return json.dumps({\"error\": f\"Error from Llama API: {e.message}\"}, indent=2)\n",
    "    except Exception as e:\n",
    "        return json.dumps({\"error\": f\"An unexpected error occurred: {e}\"}, indent=2)\n",
    "\n",
    "def generate_with_gpt(user_prompt: str, num_samples: int = 5):\n",
    "    \"\"\"\n",
    "    Generates synthetic data using OpenAI's GPT.\n",
    "\n",
    "    Sends the request through the module-level `openai` client using GPT_MODEL.\n",
    "\n",
    "    Args:\n",
    "        user_prompt: Description of the data to generate.\n",
    "        num_samples: Number of objects the model is asked to produce.\n",
    "\n",
    "    Returns:\n",
    "        A JSON string: the model's reply on success, otherwise an object of\n",
    "        the form {\"error\": \"...\"} describing what went wrong.\n",
    "    \"\"\"\n",
    "    if not openai:\n",
    "        return json.dumps({\"error\": \"OpenAI client not initialized. Please check your API key.\"}, indent=2)\n",
    "\n",
    "    try:\n",
    "        response = openai.chat.completions.create(\n",
    "            model=GPT_MODEL,\n",
    "            messages=[\n",
    "                # JSON mode returns a single JSON object, so ask for the array under a fixed key\n",
    "                {\"role\": \"system\", \"content\": f\"You are a data generation assistant. Generate a JSON object with a single key 'data' whose value is an array of exactly {num_samples} objects based on the user's request. The output must be valid JSON only, without any other text or formatting.\"},\n",
    "                {\"role\": \"user\", \"content\": user_prompt}\n",
    "            ],\n",
    "            response_format={\"type\": \"json_object\"}\n",
    "        )\n",
    "\n",
    "        return response.choices[0].message.content\n",
    "    except APIError as e:\n",
    "        # e.message is always a string; e.body is None for connection/timeout errors\n",
    "        return json.dumps({\"error\": f\"Error from OpenAI API: {e.message}\"}, indent=2)\n",
    "    except Exception as e:\n",
    "        return json.dumps({\"error\": f\"An unexpected error occurred: {e}\"}, indent=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "b98f84d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_data(user_prompt, model_choice):\n",
    "    \"\"\"\n",
    "    Wrapper function that calls the appropriate generation function based on model choice.\n",
    "\n",
    "    Args:\n",
    "        user_prompt: Description of the data to generate.\n",
    "        model_choice: Label of the selected model; must equal\n",
    "            f\"Hugging Face ({LLAMA_MODEL})\" or f\"OpenAI ({GPT_MODEL})\".\n",
    "\n",
    "    Returns:\n",
    "        The JSON string produced by the selected generator, or a JSON string\n",
    "        of the form {\"error\": \"...\"} for a missing prompt or unknown model.\n",
    "    \"\"\"\n",
    "    # Reject None, empty and whitespace-only prompts before calling any model\n",
    "    if not user_prompt or not user_prompt.strip():\n",
    "        return json.dumps({\"error\": \"Please provide a description for the data.\"}, indent=2)\n",
    "\n",
    "    if model_choice == f\"Hugging Face ({LLAMA_MODEL})\":\n",
    "        return generate_with_llama(user_prompt)\n",
    "    elif model_choice == f\"OpenAI ({GPT_MODEL})\":\n",
    "        return generate_with_gpt(user_prompt)\n",
    "    else:\n",
    "        return json.dumps({\"error\": \"Invalid model choice.\"}, indent=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "adbc19a8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Gradio UI: prompt box and model picker side by side, JSON viewer underneath\n",
    "with gr.Blocks(title=\"Synthetic Data Generator\", theme=gr.themes.Glass()) as ui:\n",
    "    gr.Markdown(\"# Synthetic Data Generator\")\n",
    "    gr.Markdown(\"Describe the type of data you need, select a model, and click 'Generate'.\")\n",
    "\n",
    "    with gr.Row():\n",
    "        # Wide column: free-text description of the desired data\n",
    "        with gr.Column(scale=3):\n",
    "            data_prompt = gr.Textbox(\n",
    "                label=\"Data Prompt\",\n",
    "                placeholder=\"e.g., a list of customer profiles with name, email, and a favorite product\",\n",
    "                lines=5,\n",
    "            )\n",
    "\n",
    "        # Narrow column: model selection plus the trigger button\n",
    "        with gr.Column(scale=1):\n",
    "            # These labels are the exact strings generate_data compares against\n",
    "            model_choice = gr.Radio(\n",
    "                [f\"Hugging Face ({LLAMA_MODEL})\", f\"OpenAI ({GPT_MODEL})\"],\n",
    "                value=f\"Hugging Face ({LLAMA_MODEL})\",\n",
    "                label=\"Choose a Model\",\n",
    "            )\n",
    "\n",
    "            generate_btn = gr.Button(\"Generate Data\")\n",
    "\n",
    "    with gr.Row():\n",
    "        output_json = gr.JSON(label=\"Generated Data\")\n",
    "\n",
    "    # Clicking the button sends (prompt, model label) to generate_data and shows its result\n",
    "    generate_btn.click(\n",
    "        generate_data,\n",
    "        inputs=[data_prompt, model_choice],\n",
    "        outputs=output_json,\n",
    "    )\n",
    "\n",
    "ui.launch(debug=True, inbrowser=True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }