{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "c58e628f",
   "metadata": {},
   "source": [
    "\n",
    "## **Week 3 task.**\n",
    "Create your own tool that generates synthetic data/test data. Input the type of dataset or products or job postings, etc. and let the tool dream up various data samples.\n",
    "\n",
    "https://colab.research.google.com/drive/13wR4Blz3Ot_x0GOpflmvvFffm5XU3Kct?usp=sharing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0ddde9ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports\n",
    "\n",
    "import json\n",
    "import os\n",
    "\n",
    "import requests\n",
    "import torch\n",
    "import gradio as gr\n",
    "from dotenv import load_dotenv\n",
    "from huggingface_hub import login\n",
    "from IPython.display import Markdown, display, update_display\n",
    "from openai import APIError, OpenAI\n",
    "from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d3a1f7b0",
   "metadata": {},
   "source": [
    "### Configuration\n",
    "Load API keys from `.env`, choose the models, and create one OpenAI-compatible client per backend."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cbbc6cc8",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "load_dotenv(override=True)\n",
    "\n",
    "openai_api_key = os.getenv('OPENAI_API_KEY')\n",
    "# The OpenAI client insists on an api_key value; this placeholder is what the local endpoint is given.\n",
    "llama_api_key = \"ollama\"\n",
    "\n",
    "# hf_token = userdata.get('HF_TOKEN')\n",
    "# login(hf_token, add_to_git_credential=True)\n",
    "\n",
    "\n",
    "if openai_api_key:\n",
    "    print(f\"OpenAI API Key exists and begins {openai_api_key[:8]}\")\n",
    "else:\n",
    "    print(\"OpenAI API Key not set\")\n",
    "\n",
    "if llama_api_key:\n",
    "    print(\"LLama API Key exists\")\n",
    "else:\n",
    "    print(\"LLama API Key not set\")\n",
    "\n",
    "GPT_MODEL = \"gpt-4.1-mini\"\n",
    "LLAMA_MODEL = \"llama3.1\"\n",
    "\n",
    "# Leave the client as None when no key is configured, so the generator can\n",
    "# return a readable error instead of this cell failing at construction time.\n",
    "openai = OpenAI(api_key=openai_api_key) if openai_api_key else None\n",
    "\n",
    "llama_url = \"http://localhost:11434/v1\"\n",
    "llama = OpenAI(api_key=llama_api_key, base_url=llama_url)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7e2c9a41",
   "metadata": {},
   "source": [
    "### Generators\n",
    "Both backends are reached through the same chat-completions call, so they share one helper."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ef083ec6",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _generate_json(client, model, provider, user_prompt, num_samples):\n",
    "    \"\"\"\n",
    "    Ask `client` to generate `num_samples` synthetic records with `model`.\n",
    "\n",
    "    `provider` is only used to label error messages.\n",
    "    Return the reply text as a JSON string; on failure return a JSON string\n",
    "    holding an object with a single error key.\n",
    "    \"\"\"\n",
    "    if not client:\n",
    "        return json.dumps({\"error\": f\"{provider} client not initialized. Please check your API key.\"}, indent=2)\n",
    "\n",
    "    system_prompt = (\n",
    "        \"You are a data generation assistant. \"\n",
    "        \"Generate a JSON object with a single key named data whose value is an array of \"\n",
    "        f\"exactly {num_samples} objects based on the user's request. \"\n",
    "        \"The output must be valid JSON only, without any other text or formatting.\"\n",
    "    )\n",
    "\n",
    "    try:\n",
    "        response = client.chat.completions.create(\n",
    "            model=model,\n",
    "            messages=[\n",
    "                {\"role\": \"system\", \"content\": system_prompt},\n",
    "                {\"role\": \"user\", \"content\": user_prompt}\n",
    "            ],\n",
    "            # JSON mode produces a JSON object, which is why the prompt asks\n",
    "            # for the array to be wrapped under a key rather than returned bare.\n",
    "            response_format={\"type\": \"json_object\"}\n",
    "        )\n",
    "        return response.choices[0].message.content\n",
    "    except APIError as e:\n",
    "        return json.dumps({\"error\": f\"Error from {provider} API: {e}\"}, indent=2)\n",
    "    except Exception as e:\n",
    "        # Last-resort guard so the UI shows the problem instead of a stack trace.\n",
    "        return json.dumps({\"error\": f\"An unexpected error occurred: {e}\"}, indent=2)\n",
    "\n",
    "\n",
    "def generate_with_llama(user_prompt: str, num_samples: int = 5):\n",
    "    \"\"\"\n",
    "    Generates synthetic data using the Llama model behind the local endpoint.\n",
    "    Return a JSON string.\n",
    "    \"\"\"\n",
    "    return _generate_json(llama, LLAMA_MODEL, \"Llama\", user_prompt, num_samples)\n",
    "\n",
    "\n",
    "def generate_with_gpt(user_prompt: str, num_samples: int = 5):\n",
    "    \"\"\"\n",
    "    Generates synthetic data using OpenAI's GPT.\n",
    "    Return a JSON string.\n",
    "    \"\"\"\n",
    "    return _generate_json(openai, GPT_MODEL, \"OpenAI\", user_prompt, num_samples)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b98f84d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Radio-button labels, shared by the dispatcher and the UI so they cannot drift apart.\n",
    "LLAMA_CHOICE = f\"Hugging Face ({LLAMA_MODEL})\"\n",
    "GPT_CHOICE = f\"OpenAI ({GPT_MODEL})\"\n",
    "\n",
    "\n",
    "def generate_data(user_prompt, model_choice, num_samples=5):\n",
    "    \"\"\"\n",
    "    Wrapper function that calls the appropriate generation function based on model choice.\n",
    "\n",
    "    `num_samples` is coerced to int because UI widgets may deliver a float.\n",
    "    Return a JSON string: the generated data, or an object with an error key.\n",
    "    \"\"\"\n",
    "    if not user_prompt or not user_prompt.strip():\n",
    "        return json.dumps({\"error\": \"Please provide a description for the data.\"}, indent=2)\n",
    "\n",
    "    try:\n",
    "        num_samples = int(num_samples)\n",
    "    except (TypeError, ValueError):\n",
    "        num_samples = 0\n",
    "    if num_samples < 1:\n",
    "        return json.dumps({\"error\": \"Number of samples must be a positive integer.\"}, indent=2)\n",
    "\n",
    "    if model_choice == LLAMA_CHOICE:\n",
    "        return generate_with_llama(user_prompt, num_samples)\n",
    "    elif model_choice == GPT_CHOICE:\n",
    "        return generate_with_gpt(user_prompt, num_samples)\n",
    "    else:\n",
    "        return json.dumps({\"error\": \"Invalid model choice.\"}, indent=2)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5b8d0e63",
   "metadata": {},
   "source": [
    "### Gradio UI\n",
    "Describe the data, pick a backend and a sample count, then generate."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "adbc19a8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Gradio UI\n",
    "with gr.Blocks(theme=gr.themes.Glass(), title=\"Synthetic Data Generator\") as ui:\n",
    "    gr.Markdown(\"# Synthetic Data Generator\")\n",
    "    gr.Markdown(\"Describe the type of data you need, select a model, and click 'Generate Data'.\")\n",
    "\n",
    "    with gr.Row():\n",
    "        with gr.Column(scale=3):\n",
    "            data_prompt = gr.Textbox(\n",
    "                lines=5,\n",
    "                label=\"Data Prompt\",\n",
    "                placeholder=\"e.g., a list of customer profiles with name, email, and a favorite product\"\n",
    "            )\n",
    "\n",
    "        with gr.Column(scale=1):\n",
    "            model_choice = gr.Radio(\n",
    "                [LLAMA_CHOICE, GPT_CHOICE],\n",
    "                label=\"Choose a Model\",\n",
    "                value=LLAMA_CHOICE\n",
    "            )\n",
    "            sample_count = gr.Slider(\n",
    "                minimum=1,\n",
    "                maximum=50,\n",
    "                step=1,\n",
    "                value=5,\n",
    "                label=\"Number of Samples\"\n",
    "            )\n",
    "\n",
    "    generate_btn = gr.Button(\"Generate Data\")\n",
    "\n",
    "    with gr.Row():\n",
    "        output_json = gr.JSON(label=\"Generated Data\")\n",
    "\n",
    "    generate_btn.click(\n",
    "        fn=generate_data,\n",
    "        inputs=[data_prompt, model_choice, sample_count],\n",
    "        outputs=output_json\n",
    "    )\n",
    "\n",
    "ui.launch(inbrowser=True, debug=True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}