{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a0intro01",
   "metadata": {},
   "source": [
    "# AI Dataset Generator\n",
    "\n",
    "Streams a synthetic CSV dataset from an OpenAI-compatible chat model behind a Gradio UI.\n",
    "Provider credentials are read from a local `.env` file — never hardcode API keys in the notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d5063502",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "from dotenv import load_dotenv\n",
    "from openai import OpenAI\n",
    "\n",
    "import gradio as gr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "5c4d37fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load API keys from .env; override=True lets the file win over stale shell vars.\n",
    "load_dotenv(override=True)\n",
    "openai_api_key = os.getenv('OPENAI_API_KEY')\n",
    "anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')\n",
    "google_api_key = os.getenv('GOOGLE_API_KEY')\n",
    "ds_api_key = os.getenv('DEEPSEEK_API_KEY')\n",
    "grok_api_key = os.getenv('GROK_API_KEY')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b21599db",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Provider registry. Every entry exposes an OpenAI-compatible chat endpoint,\n",
    "# so a single `OpenAI` client class works for all of them.\n",
    "MODEL_MAP = {\n",
    "    \"GPT\": {\n",
    "        \"model\": \"gpt-4o-mini\",\n",
    "        \"key\": openai_api_key,\n",
    "        \"endpoint\": \"https://api.openai.com/v1\",\n",
    "    },\n",
    "    \"CLAUDE_3_5_SONNET\": {\n",
    "        \"model\": \"claude-3-5-sonnet-20240620\",\n",
    "        \"key\": anthropic_api_key,\n",
    "        \"endpoint\": \"https://api.anthropic.com/v1\",\n",
    "    },\n",
    "    \"Grok\": {\n",
    "        \"model\": \"grok-beta\",\n",
    "        \"key\": grok_api_key,\n",
    "        # BUG FIX: Grok is served by xAI at api.x.ai, not api.grok.com.\n",
    "        \"endpoint\": \"https://api.x.ai/v1\",\n",
    "    },\n",
    "    \"DeepSeek\": {\n",
    "        \"model\": \"deepseek-reasoner\",\n",
    "        \"key\": ds_api_key,\n",
    "        \"endpoint\": \"https://api.deepseek.com/v1\",\n",
    "    },\n",
    "    \"Google\": {\n",
    "        \"model\": \"gemini-2.0-flash-exp\",\n",
    "        \"key\": google_api_key,\n",
    "        \"endpoint\": \"https://generativelanguage.googleapis.com/v1beta/openai\",\n",
    "    },\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "82d63d13",
   "metadata": {},
   "outputs": [],
   "source": [
    "class GenerateSyntheticDataset:\n",
    "    \"\"\"Streams a synthetic CSV dataset from an OpenAI-compatible chat model.\n",
    "\n",
    "    The raw user request is first expanded into a detailed generation prompt\n",
    "    (`generate_user_prompt`), which is then answered under a system prompt\n",
    "    that restricts the model to dataset generation only.\n",
    "    \"\"\"\n",
    "\n",
    "    # Canned refusal the system prompt tells the model to emit for off-topic\n",
    "    # requests; also checked below to decide whether a CSV should be saved.\n",
    "    out_of_scope_response = \"I'm sorry, I can't help with that. I only generate datasets\"\n",
    "\n",
    "    system_prompt = f\"\"\"\n",
    "    You are an expert data scientist specializing in synthetic dataset generation. \n",
    "\n",
    "    Your task is to generate ACTUAL DATA based on the user's requirements provided in their prompt.\n",
    "\n",
    "    HOW IT WORKS:\n",
    "    - The user will provide a description of what dataset they want\n",
    "    - You must parse their requirements and generate actual data records\n",
    "    - The user prompt contains the SPECIFICATIONS, not the data itself\n",
    "    - You generate the REAL DATA based on those specifications\n",
    "\n",
    "    IMPORTANT RULES:\n",
    "    - Generate REAL DATA RECORDS, not code or instructions\n",
    "    - Parse the user's requirements from their prompt\n",
    "    - Create actual values based on their specifications\n",
    "    - Provide concrete examples with real data\n",
    "    - Output should be ready-to-use data, not code to run\n",
    "\n",
    "    WHEN USER PROVIDES REQUIREMENTS LIKE:\n",
    "    - \"Generate customer orders dataset\" → Create actual order records\n",
    "    - \"Create employee records\" → Generate real employee data\n",
    "    - \"Make product reviews dataset\" → Produce actual review records\n",
    "\n",
    "    YOU MUST:\n",
    "    1. Understand what fields/data the user wants\n",
    "    2. Generate realistic values for those fields\n",
    "    3. Create multiple records with varied data\n",
    "    4. Format as structured data (JSON, CSV, etc.)\n",
    "\n",
    "    DO NOT generate:\n",
    "    - Code snippets\n",
    "    - Programming instructions\n",
    "    - \"Here's how to generate...\" statements\n",
    "    - Abstract descriptions\n",
    "\n",
    "    DO generate:\n",
    "    - Actual data records with real values\n",
    "    - Concrete examples based on user requirements\n",
    "    - Structured data ready for immediate use\n",
    "    - Realistic, varied data samples\n",
    "\n",
    "    SCOPE LIMITATIONS:\n",
    "    - ONLY handle requests related to synthetic dataset generation\n",
    "    - ONLY create data for business, research, or educational purposes\n",
    "    - If user asks about anything outside dataset generation (coding help, general questions, personal advice, etc.), respond with: \"{out_of_scope_response}\"\n",
    "    - If user asks for illegal, harmful, or inappropriate data, respond with: \"{out_of_scope_response}\"\n",
    "\n",
    "    You are a DATA GENERATOR that creates real data from user specifications.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, progress, model_name=MODEL_MAP[\"GPT\"]):\n",
    "        \"\"\"Create a chat client for one provider.\n",
    "\n",
    "        Args:\n",
    "            progress: a `gr.Progress` callable used to report status.\n",
    "            model_name: a MODEL_MAP entry — despite the name this is a config\n",
    "                dict with \"model\", \"key\" and \"endpoint\" keys, not a string.\n",
    "        \"\"\"\n",
    "        self.progress = progress\n",
    "        self.model_deets = model_name\n",
    "        # One OpenAI client works for every provider in MODEL_MAP; only the\n",
    "        # base_url and key differ.\n",
    "        self.model = OpenAI(\n",
    "            api_key=model_name[\"key\"],\n",
    "            base_url=model_name[\"endpoint\"],\n",
    "        )\n",
    "\n",
    "    def generate_user_prompt(self, user_prompt):\n",
    "        \"\"\"Rewrite the raw request into a detailed dataset-generation prompt.\n",
    "\n",
    "        Returns the rewritten prompt text produced by the model.\n",
    "        \"\"\"\n",
    "        prompt = f\"\"\"\n",
    "        You are an expert data scientist specializing in synthetic dataset generation. \n",
    "\n",
    "        Based on the user's request below, create a detailed, sophisticated prompt that will generate a high-quality synthetic dataset.\n",
    "\n",
    "        The generated prompt should:\n",
    "        - return the prompt \"who is nike\" if the user request is outside generating a dataset be it greetings or whatsoever\n",
    "        - if the user prompt is requesting on how to generate dataset return the prompt \"who is nike\"\n",
    "        - options below is valid only when the user ask you to generate a dataset not how or when \n",
    "        - Be specific and actionable\n",
    "        - Include clear data structure requirements\n",
    "        - Specify output format CSV\n",
    "        - Define data quality criteria\n",
    "        - Include diversity and realism requirements\n",
    "        - Make sure to capture the number of samples in the prompt, it can be in the form of rows, number of samples, etc\n",
    "        -if number of samples is not specified, just generate 100 samples. \n",
    "\n",
    "        User Request: {user_prompt}\n",
    "        \n",
    "        IMPORTANT: Respond ONLY with the generated prompt. Do not include any explanation, commentary, or the original request. Just provide the clean, ready-to-use prompt for dataset generation.\n",
    "        \"\"\"\n",
    "        response = self.model.chat.completions.create(\n",
    "            model=self.model_deets[\"model\"],\n",
    "            messages=[{\"role\": \"user\", \"content\": prompt}],\n",
    "        )\n",
    "        return response.choices[0].message.content\n",
    "\n",
    "    def generate_synthetic_dataset(self, user_prompt):\n",
    "        \"\"\"Yield (partial_text, file_path_or_None) tuples while streaming.\n",
    "\n",
    "        The final yield carries \"dataset.csv\" when the model produced data;\n",
    "        refusals are streamed to the UI but never written to disk.\n",
    "        \"\"\"\n",
    "        self.progress(0.7, \"Analyzing data .....\")\n",
    "        prompt = self.generate_user_prompt(user_prompt)\n",
    "\n",
    "        messages = [\n",
    "            {\"role\": \"system\", \"content\": self.system_prompt},\n",
    "            {\"role\": \"user\", \"content\": prompt},\n",
    "        ]\n",
    "\n",
    "        streamer = self.model.chat.completions.create(\n",
    "            model=self.model_deets[\"model\"], messages=messages, stream=True\n",
    "        )\n",
    "        response = \"\"\n",
    "        for chunk in streamer:\n",
    "            delta = chunk.choices[0].delta.content\n",
    "            if delta:\n",
    "                response += delta\n",
    "                yield response, None\n",
    "\n",
    "        if self.out_of_scope_response in response:\n",
    "            # Refusal: the text was already streamed above. (BUG FIX: the\n",
    "            # original `return response, None` attached the tuple to\n",
    "            # StopIteration, where no consumer of the generator ever saw it.)\n",
    "            return\n",
    "\n",
    "        # Strip markdown code fences so the saved file is plain CSV.\n",
    "        cleaned = response.replace(\"```csv\", \"\").replace(\"```\", \"\")\n",
    "        with open(\"dataset.csv\", \"w\", encoding=\"utf-8\") as f:\n",
    "            f.write(cleaned)\n",
    "        yield cleaned, \"dataset.csv\"\n",
    "\n",
    "    def start(self, user_prompt, model_name=None):\n",
    "        \"\"\"Entry point: optionally switch provider, then stream the dataset.\n",
    "\n",
    "        Args:\n",
    "            user_prompt: free-form dataset description from the UI.\n",
    "            model_name: key into MODEL_MAP; None or an unknown key keeps the\n",
    "                currently configured model.\n",
    "        \"\"\"\n",
    "        self.progress(0.3, \"Fetching data .....\")\n",
    "        config = MODEL_MAP.get(model_name)\n",
    "        if config and self.model_deets[\"model\"] != config[\"model\"]:\n",
    "            self.model_deets = config\n",
    "            self.model = OpenAI(\n",
    "                base_url=config[\"endpoint\"],\n",
    "                api_key=config[\"key\"],\n",
    "            )\n",
    "\n",
    "        yield from self.generate_synthetic_dataset(user_prompt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "b681e1ef",
   "metadata": {},
   "outputs": [],
   "source": [
    "class Interface:\n",
    "    \"\"\"Gradio front end for the synthetic dataset generator.\"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        \"\"\"Builds the Gradio interface and the backing model client.\"\"\"\n",
    "        progress = gr.Progress()\n",
    "        self.assistant = GenerateSyntheticDataset(progress)\n",
    "        self.iface = gr.Interface(\n",
    "            fn=self.generate,\n",
    "            inputs=[\n",
    "                gr.Textbox(label=\"User Prompt\"),\n",
    "                gr.Dropdown(\n",
    "                    # BUG FIX: Gradio expects a list of choices, not a\n",
    "                    # dict_keys view.\n",
    "                    choices=list(MODEL_MAP),\n",
    "                    value=\"GPT\",\n",
    "                    label=\"Model\",\n",
    "                ),\n",
    "            ],\n",
    "            outputs=[\n",
    "                gr.Markdown(label=\"Dataset\", min_height=60),\n",
    "                gr.File(\n",
    "                    label=\"Download Generated Dataset\",\n",
    "                    file_count=\"single\",\n",
    "                ),\n",
    "            ],\n",
    "            title=\"AI Dataset Generator\",\n",
    "            description=\"Generate a synthetic dataset based on your requirements\",\n",
    "            flagging_mode=\"never\",\n",
    "        )\n",
    "\n",
    "    def generate(self, user_prompt, model):\n",
    "        \"\"\"Stream model output to the UI, then remove the temporary CSV.\n",
    "\n",
    "        NOTE(review): this assumes Gradio copies the yielded file path into\n",
    "        its own cache before serving the download — confirm before relying\n",
    "        on this cleanup timing.\n",
    "        \"\"\"\n",
    "        yield from self.assistant.start(user_prompt, model)\n",
    "\n",
    "        if os.path.exists(\"dataset.csv\"):\n",
    "            os.remove(\"dataset.csv\")\n",
    "\n",
    "    def launch(self):\n",
    "        \"\"\"Starts the Gradio server.\"\"\"\n",
    "        self.iface.launch()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "2ee97b72",
   "metadata": {},
   "outputs": [],
   "source": [
    "I = Interface()\n",
    "I.launch()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}