BootCamp: Solisoma (added week3 dataset generator & week4 assessment)

2025-10-22 13:56:26 +01:00
parent 9b84cc62c0
commit 079e99430a
3 changed files with 655 additions and 0 deletions
--- a/week3/community-contributions/solisoma/synthetic_dataset_generator.ipynb
+++ b/week3/community-contributions/solisoma/synthetic_dataset_generator.ipynb
@@ -0,0 +1,303 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "d5063502",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from openai import OpenAI\n",
+    "from dotenv import load_dotenv\n",
+    "import gradio as gr"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "5c4d37fe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "load_dotenv(override=True)\n",
+    "openai_api_key = os.getenv('OPENAI_API_KEY')\n",
+    "anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')\n",
+    "google_api_key = os.getenv('GOOGLE_API_KEY')\n",
+    "ds_api_key = os.getenv('DEEPSEEK_API_KEY')\n",
+    "grok_api_key = os.getenv('GROK_API_KEY')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b21599db",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "MODEL_MAP = {\n",
+    "    \"GPT\": {\n",
+    "        \"model\": \"gpt-4o-mini\",\n",
+    "        \"key\": openai_api_key,\n",
+    "        \"endpoint\": \"https://api.openai.com/v1\",\n",
+    "    },\n",
+    "    \"CLAUDE_3_5_SONNET\": {\n",
+    "        \"model\": \"claude-3-5-sonnet-20240620\",\n",
+    "        \"key\": anthropic_api_key,\n",
+    "        \"endpoint\": \"https://api.anthropic.com/v1\"\n",
+    "    },\n",
+    "    \"Grok\": {\n",
+    "        \"model\": \"grok-beta\",\n",
+    "        \"key\": grok_api_key,\n",
+    "        \"endpoint\": \"https://api.grok.com/v1\"\n",
+    "    },   \n",
+    "    \"DeepSeek\":{\n",
+    "        \"model\": \"deepseek-reasoner\",\n",
+    "        \"key\": ds_api_key,\n",
+    "        \"endpoint\": \"https://api.deepseek.com/v1\",\n",
+    "    },\n",
+    "    \"Google\": {\n",
+    "        \"model\": \"gemini-2.0-flash-exp\",\n",
+    "        \"key\": google_api_key,\n",
+    "        \"endpoint\": \"https://generativelanguage.googleapis.com/v1beta/openai\"\n",
+    "    },\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 122,
+   "id": "82d63d13",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class GenerateSyntheticDataset:\n",
+    "  out_of_scope_response = \"I'm sorry, I can't help with that. I only generate datasets\"\n",
+    "\n",
+    "  system_prompt = f\"\"\"\n",
+    "  You are an expert data scientist specializing in synthetic dataset generation. \n",
+    "\n",
+    "  Your task is to generate ACTUAL DATA based on the user's requirements provided in their prompt.\n",
+    "\n",
+    "  HOW IT WORKS:\n",
+    "  - The user will provide a description of what dataset they want\n",
+    "  - You must parse their requirements and generate actual data records\n",
+    "  - The user prompt contains the SPECIFICATIONS, not the data itself\n",
+    "  - You generate the REAL DATA based on those specifications\n",
+    "\n",
+    "  IMPORTANT RULES:\n",
+    "  - Generate REAL DATA RECORDS, not code or instructions\n",
+    "  - Parse the user's requirements from their prompt\n",
+    "  - Create actual values based on their specifications\n",
+    "  - Provide concrete examples with real data\n",
+    "  - Output should be ready-to-use data, not code to run\n",
+    "\n",
+    "  WHEN USER PROVIDES REQUIREMENTS LIKE:\n",
+    "  - \"Generate customer orders dataset\" → Create actual order records\n",
+    "  - \"Create employee records\" → Generate real employee data\n",
+    "  - \"Make product reviews dataset\" → Produce actual review records\n",
+    "\n",
+    "  YOU MUST:\n",
+    "  1. Understand what fields/data the user wants\n",
+    "  2. Generate realistic values for those fields\n",
+    "  3. Create multiple records with varied data\n",
+    "  4. Format as structured data (JSON, CSV, etc.)\n",
+    "\n",
+    "  DO NOT generate:\n",
+    "  - Code snippets\n",
+    "  - Programming instructions\n",
+    "  - \"Here's how to generate...\" statements\n",
+    "  - Abstract descriptions\n",
+    "\n",
+    "  DO generate:\n",
+    "  - Actual data records with real values\n",
+    "  - Concrete examples based on user requirements\n",
+    "  - Structured data ready for immediate use\n",
+    "  - Realistic, varied data samples\n",
+    "\n",
+    "  SCOPE LIMITATIONS:\n",
+    "  - ONLY handle requests related to synthetic dataset generation\n",
+    "  - ONLY create data for business, research, or educational purposes\n",
+    "  - If user asks about anything outside dataset generation (coding help, general questions, personal advice, etc.), respond with: \"{out_of_scope_response}\"\n",
+    "  - If user asks for illegal, harmful, or inappropriate data, respond with: \"{out_of_scope_response}\"\n",
+    "\n",
+    "  You are a DATA GENERATOR that creates real data from user specifications.\n",
+    "  \"\"\"\n",
+    "\n",
+    "  def __init__(self, progress, model_name = MODEL_MAP[\"GPT\"]):\n",
+    "    self.progress = progress\n",
+    "    self.model_deets = model_name\n",
+    "    self.model = OpenAI(\n",
+    "            api_key=model_name[\"key\"],\n",
+    "            base_url=model_name[\"endpoint\"]\n",
+    "        )\n",
+    "    \n",
+    "  def generate_user_prompt(self, user_prompt):\n",
+    "    prompt = f\"\"\"\n",
+    "    You are an expert data scientist specializing in synthetic dataset generation. \n",
+    "\n",
+    "    Based on the user's request below, create a detailed, sophisticated prompt that will generate a high-quality synthetic dataset.\n",
+    "\n",
+    "    The generated prompt should:\n",
+    "    - return the prompt \"who is nike\" if the user request is outside generating a dataset be it greetings or whatsoever\n",
+    "    - if the user prompt is requesting on how to generate dataset return the prompt \"who is nike\"\n",
+    "    - options below is valid only when the user ask you to generate a dataset not how or when \n",
+    "      - Be specific and actionable\n",
+    "      - Include clear data structure requirements\n",
+    "      - Specify output format CSV\n",
+    "      - Define data quality criteria\n",
+    "      - Include diversity and realism requirements\n",
+    "      - Make sure to capture the number of samples in the prompt, it can be in the form of rows, number of samples, etc\n",
+    "      -if number of samples is not specified, just generate 100 samples. \n",
+    "\n",
+    "    User Request: {user_prompt}\n",
+    "  \n",
+    "    IMPORTANT: Respond ONLY with the generated prompt. Do not include any explanation, commentary, or the original request. Just provide the clean, ready-to-use prompt for dataset generation.\n",
+    "    \"\"\"\n",
+    "    response = self.model.chat.completions.create(model=self.model_deets[\"model\"], messages=[{\"role\": \"user\", \"content\": prompt}])\n",
+    "    return response.choices[0].message.content\n",
+    "\n",
+    "  def generate_synthetic_dataset(self, user_prompt):\n",
+    "    self.progress(0.7, \"Analyzing data .....\")\n",
+    "    prompt = self.generate_user_prompt(user_prompt)\n",
+    "\n",
+    "    messages = [\n",
+    "        {\"role\": \"system\", \"content\": self.system_prompt},\n",
+    "        {\"role\": \"user\", \"content\": prompt}\n",
+    "    ]\n",
+    "\n",
+    "    streamer = self.model.chat.completions.create(model=self.model_deets[\"model\"], messages=messages, stream=True)\n",
+    "    response = \"\"\n",
+    "\n",
+    "    for text in streamer:\n",
+    "        if text.choices[0].delta.content:\n",
+    "            response += text.choices[0].delta.content\n",
+    "            yield response, None\n",
+    "    \n",
+    "    if self.out_of_scope_response not in response:\n",
+    "      with open(\"dataset.csv\", \"w\") as f:\n",
+    "        response = response.replace(\"```csv\", \"\").replace(\"```\", \"\")\n",
+    "        f.write(response)\n",
+    "      yield response, \"dataset.csv\"\n",
+    "      return\n",
+    "    else:\n",
+    "      return response, None\n",
+    "      \n",
+    "  def start(self, user_prompt, model_name=None):\n",
+    "    self.progress(0.3, \"Fetching data .....\")\n",
+    "    if MODEL_MAP.get(model_name) and self.model_deets[\"model\"] != MODEL_MAP.get(model_name)[\"model\"]:\n",
+    "        self.model_deets = MODEL_MAP[model_name]\n",
+    "        self.model = OpenAI(\n",
+    "            base_url=self.model_deets[\"endpoint\"],\n",
+    "            api_key=self.model_deets[\"key\"]\n",
+    "        )\n",
+    "      \n",
+    "    stream = self.generate_synthetic_dataset(user_prompt)\n",
+    "    for chunk in stream:\n",
+    "      yield chunk\n",
+    "\n",
+    "    \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 124,
+   "id": "b681e1ef",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Interface:\n",
+    "    def __init__(self):\n",
+    "        \"\"\"Initializes the Gradio interface for generating synthetic datasets.\"\"\"\n",
+    "        progress=gr.Progress()\n",
+    "        self.assistant = GenerateSyntheticDataset(progress)\n",
+    "        self.iface = gr.Interface(\n",
+    "            fn=self.generate,\n",
+    "            inputs=[\n",
+    "                gr.Textbox(label=\"User Prompt\"),\n",
+    "                gr.Dropdown(\n",
+    "                  choices=MODEL_MAP.keys(),\n",
+    "                  value=\"GPT\",\n",
+    "                  label=\"Model\",\n",
+    "                )\n",
+    "            ],\n",
+    "            outputs=[\n",
+    "              gr.Markdown(label=\"Dataset\", min_height=60),\n",
+    "              gr.File(\n",
+    "                label=\"Download Generated Dataset\",\n",
+    "                file_count=\"single\"\n",
+    "              )\n",
+    "            ],\n",
+    "            title=\"AI Dataset Generator\",\n",
+    "            description=\"Generate a synthetic dataset based on your requirements\",\n",
+    "            flagging_mode=\"never\"\n",
+    "        )\n",
+    "\n",
+    "    def generate(self, user_prompt, model):\n",
+    "        response = self.assistant.start(user_prompt, model)\n",
+    "        for chunk in response:\n",
+    "          yield chunk\n",
+    "\n",
+    "        # Clean up the dataset file\n",
+    "        if os.path.exists(\"dataset.csv\"):\n",
+    "          os.remove(\"dataset.csv\")\n",
+    "\n",
+    "    def launch(self):\n",
+    "        self.iface.launch()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 125,
+   "id": "2ee97b72",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "* Running on local URL:  http://127.0.0.1:7898\n",
+      "* To create a public link, set `share=True` in `launch()`.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div><iframe src=\"http://127.0.0.1:7898/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "I = Interface()\n",
+    "I.launch()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}