From 756bd0b623be3b8906b677e50ad7f119ae842d14 Mon Sep 17 00:00:00 2001
From: Krabulek
Date: Fri, 19 Sep 2025 13:45:07 +0200
Subject: [PATCH] Week 3 exercise - Intelligent Dataset Generator

---
 .../intelligent_dataset_generator.ipynb | 600 ++++++++++++++++++
 1 file changed, 600 insertions(+)
 create mode 100644 week3/community-contributions/intelligent_dataset_generator.ipynb

diff --git a/week3/community-contributions/intelligent_dataset_generator.ipynb b/week3/community-contributions/intelligent_dataset_generator.ipynb
new file mode 100644
index 0000000..9a374a1
--- /dev/null
+++ b/week3/community-contributions/intelligent_dataset_generator.ipynb
@@ -0,0 +1,600 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "QTJt9pwUTbHo"
+   },
+   "source": [
+    "# Intelligent Synthetic Dataset Generator\n",
+    "\n",
+    "An AI-powered tool that creates realistic synthetic datasets for any business case. Provide your own schema, or let the model intelligently design one for you.\n",
+    "\n",
+    "It works with the Claude, Gemini, and GPT APIs, as well as local Hugging Face models."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "l_FljmlTUoka"
+   },
+   "source": [
+    "## Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "aONqZ-SjUJdg",
+    "outputId": "1f5c7b2e-95f0-4f23-cf01-2bd5bda0807a"
+   },
+   "outputs": [],
+   "source": [
+    "!pip install -q requests bitsandbytes anthropic openai gradio"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Ub1unBFvTatE"
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import requests\n",
+    "import json\n",
+    "from google.colab import userdata\n",
+    "\n",
+    "from openai import OpenAI\n",
+    "import anthropic\n",
+    "from huggingface_hub import login\n",
+    "from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
+    "import torch\n",
+    "import pandas as pd\n",
+    "\n",
+    "import gradio as gr\n",
+    "import gc"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "viZNPtObUOcz"
+   },
+   "outputs": [],
+   "source": [
+    "hf_token = userdata.get('HF_TOKEN')\n",
+    "openai_api_key = userdata.get('OPENAI_API_KEY')\n",
+    "anthropic_api_key = userdata.get('ANTHROPIC_API_KEY')\n",
+    "google_api_key = userdata.get('GOOGLE_API_KEY')\n",
+    "\n",
+    "login(hf_token, add_to_git_credential=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "9Q94S6JTUWn5"
+   },
+   "outputs": [],
+   "source": [
+    "# 4-bit NF4 quantization so the open models fit on a Colab T4 GPU\n",
+    "quant_config = BitsAndBytesConfig(\n",
+    "    load_in_4bit=True,\n",
+    "    bnb_4bit_use_double_quant=True,\n",
+    "    bnb_4bit_compute_dtype=torch.bfloat16,\n",
+    "    bnb_4bit_quant_type=\"nf4\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "mrjdVEpaUxHz"
+   },
+   "source": [
+    "## Configuration"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "LvNE6foEUPaz"
+   },
+   "outputs": [],
+   "source": [
+    "LLAMA = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n",
+    "PHI3 = \"microsoft/Phi-3-mini-4k-instruct\"\n",
+    "GEMMA2 = \"google/gemma-2-2b-it\"\n",
+    "GPT = \"gpt-4o-mini\"\n",
+    "CLAUDE = \"claude-3-haiku-20240307\"\n",
+    "GEMINI = \"gemini-2.0-flash\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "tvafTFD8XmaO"
+   },
+   "outputs": [],
+   "source": [
+    "MODELS = {\n",
+    "    # Open models run locally via Transformers (4-bit quantized)\n",
+    "    'Llama 3.1': LLAMA,\n",
+    "    'Phi 3 mini': PHI3,\n",
+    "    'Gemma 2': GEMMA2,\n",
+    "    # Hosted models called via their APIs\n",
+    "    'GPT-4o mini': GPT,\n",
+    "    'Claude 3 Haiku': CLAUDE,\n",
+    "    'Gemini 2.0 Flash': GEMINI,\n",
+    "}\n",
+    "\n",
+    "HF_MODELS = [LLAMA, PHI3, GEMMA2]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "2LZqA9QXXl0t"
+   },
+   "outputs": [],
+   "source": [
+    "FILE_FORMATS = [\".csv\", \".tsv\", \".jsonl\", \".json\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "d6EnN7SVXhza",
+    "outputId": "55f6ac4d-adeb-4216-b2a8-d67524b005d3"
+   },
+   "outputs": [],
+   "source": [
+    "SCHEMA = [\n",
+    "    (\"Name\", \"TEXT\", \"Name of the restaurant\", \"Blue River Bistro\"),\n",
+    "    (\"Address\", \"TEXT\", \"Restaurant address\", \"742 Evergreen Terrace, Springfield, IL 62704\"),\n",
+    "    (\"Type\", \"TEXT\", \"Cuisine type\", 'One of [\"Thai\",\"Mediterranean\",\"Vegan\",\"Steakhouse\",\"Japanese\"] or other potential types'),\n",
+    "    (\"Average Price\", \"TEXT\", \"Average meal price\", \"$45, or '--' if unknown\"),\n",
+    "    (\"Year\", \"INT\", \"Year of restaurant opening\", 2015),\n",
+    "    (\"Menu\", \"Array\", \"List of meals\", '[\"Grilled Salmon\", \"Caesar Salad\", \"Pad Thai\", \"Margherita Pizza\", ...]'),\n",
+    "]\n",
+    "\n",
+    "DEFAULT_SCHEMA_TEXT = \"\\n\".join([f\"{i+1}. {col[0]} ({col[1]}) - {col[2]}, example: {col[3]}\" for i, col in enumerate(SCHEMA)])\n",
+    "print(DEFAULT_SCHEMA_TEXT)"
+   ]
+  },
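+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The schema is just a list of `(name, type, description, example)` tuples, so it is easy to swap in your own. A purely illustrative sketch for a different business case (hypothetical columns, rendered with the same join as above):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hypothetical example: an employee dataset instead of restaurants\n",
+    "CUSTOM_SCHEMA = [\n",
+    "    (\"Full Name\", \"TEXT\", \"Employee full name\", \"Jane Doe\"),\n",
+    "    (\"Department\", \"TEXT\", \"Department name\", \"Engineering\"),\n",
+    "    (\"Salary\", \"INT\", \"Annual salary in USD\", 85000),\n",
+    "]\n",
+    "\n",
+    "custom_schema_text = \"\\n\".join([f\"{i+1}. {col[0]} ({col[1]}) - {col[2]}, example: {col[3]}\" for i, col in enumerate(CUSTOM_SCHEMA)])\n",
+    "print(custom_schema_text)"
+   ]
+  },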
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "W-46TDTOXiS7"
+   },
+   "outputs": [],
+   "source": [
+    "system_prompt = \"\"\"\n",
+    "You are an expert in generating synthetic datasets tailored to a given business case and user requirements.\n",
+    "If the user does not specify output columns, infer and create the most appropriate columns based on your expertise.\n",
+    "Do NOT repeat column values from one row to another. Only output valid JSONL without any comments.\n",
+    "\"\"\"\n",
+    "\n",
+    "\n",
+    "def get_user_prompt(business_case, schema_text, nr_records):\n",
+    "    prompt = f\"The business case is: {business_case}.\\nGenerate {nr_records} rows of data in JSONL format.\\n\"\n",
+    "\n",
+    "    if schema_text is not None:\n",
+    "        prompt += f\"Each line should be a JSON object with the following fields: \\n{schema_text}\\n\"\n",
+    "\n",
+    "    return prompt"
+   ]
+  },
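+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Optional sanity check: render the user prompt for the default schema to see exactly what will be sent to the model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Preview the prompt for a small request (illustrative only)\n",
+    "print(get_user_prompt(\"I want to generate a restaurant dataset\", DEFAULT_SCHEMA_TEXT, 15))"
+   ]
+  },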
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "gPf1GcAwhwa_"
+   },
+   "source": [
+    "## LLM handler"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Tf-WEQUKhY-z"
+   },
+   "outputs": [],
+   "source": [
+    "def ask_gpt(model: str, user_prompt: str):\n",
+    "    client = OpenAI(api_key=openai_api_key)\n",
+    "    messages = [\n",
+    "        {\"role\": \"system\", \"content\": system_prompt},\n",
+    "        {\"role\": \"user\", \"content\": user_prompt}\n",
+    "    ]\n",
+    "    response = client.chat.completions.create(\n",
+    "        model=model,\n",
+    "        messages=messages,\n",
+    "        temperature=0.7\n",
+    "    )\n",
+    "    content = response.choices[0].message.content\n",
+    "\n",
+    "    return content"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "856pnIHahzDd"
+   },
+   "outputs": [],
+   "source": [
+    "def ask_claude(model: str, user_prompt: str):\n",
+    "    client = anthropic.Anthropic(api_key=anthropic_api_key)\n",
+    "    response = client.messages.create(\n",
+    "        model=model,\n",
+    "        messages=[{\"role\": \"user\", \"content\": user_prompt}],\n",
+    "        max_tokens=4000,\n",
+    "        temperature=0.7,\n",
+    "        system=system_prompt\n",
+    "    )\n",
+    "    content = response.content[0].text\n",
+    "\n",
+    "    return content"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "p0AfSbcBiUlg"
+   },
+   "outputs": [],
+   "source": [
+    "def ask_gemini(model: str, user_prompt: str):\n",
+    "    # Gemini is called through its OpenAI-compatible endpoint\n",
+    "    client = OpenAI(\n",
+    "        api_key=google_api_key,\n",
+    "        base_url=\"https://generativelanguage.googleapis.com/v1beta/openai/\"\n",
+    "    )\n",
+    "    messages = [\n",
+    "        {\"role\": \"system\", \"content\": system_prompt},\n",
+    "        {\"role\": \"user\", \"content\": user_prompt}\n",
+    "    ]\n",
+    "    response = client.chat.completions.create(\n",
+    "        model=model,\n",
+    "        messages=messages,\n",
+    "        temperature=0.7\n",
+    "    )\n",
+    "    content = response.choices[0].message.content\n",
+    "\n",
+    "    return content"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "K9LZZPJ9irrH"
+   },
+   "outputs": [],
+   "source": [
+    "def ask_hf(model: str, user_prompt: str):\n",
+    "    global tokenizer, inputs, hf_model, outputs\n",
+    "\n",
+    "    messages = [\n",
+    "        {\"role\": \"system\", \"content\": system_prompt},\n",
+    "        {\"role\": \"user\", \"content\": user_prompt}\n",
+    "    ]\n",
+    "\n",
+    "    tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)\n",
+    "    tokenizer.pad_token = tokenizer.eos_token\n",
+    "    inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors=\"pt\").to(\"cuda\")\n",
+    "    # NOTE: the model is cached globally, so switching between HF models in one session keeps the first one loaded\n",
+    "    if hf_model is None:\n",
+    "        hf_model = AutoModelForCausalLM.from_pretrained(model, device_map=\"auto\", quantization_config=quant_config)\n",
+    "    outputs = hf_model.generate(inputs, max_new_tokens=4000)\n",
+    "\n",
+    "    # Decode only the newly generated tokens; this works regardless of the model's chat template\n",
+    "    content = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True)\n",
+    "\n",
+    "    return content"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "eu7Sv3bDhXdI"
+   },
+   "outputs": [],
+   "source": [
+    "def query_llm(model_name: str, user_prompt):\n",
+    "    try:\n",
+    "        model = MODELS[model_name]\n",
+    "\n",
+    "        if \"gpt\" in model.lower():\n",
+    "            response = ask_gpt(model, user_prompt)\n",
+    "\n",
+    "        elif \"claude\" in model.lower():\n",
+    "            response = ask_claude(model, user_prompt)\n",
+    "\n",
+    "        elif \"gemini\" in model.lower():\n",
+    "            response = ask_gemini(model, user_prompt)\n",
+    "\n",
+    "        elif model in HF_MODELS:\n",
+    "            response = ask_hf(model, user_prompt)\n",
+    "\n",
+    "        else:\n",
+    "            raise ValueError(f\"Unsupported model. Use one of {', '.join(MODELS.keys())}\")\n",
+    "\n",
+    "        # Keep only lines that look like JSON objects, then parse each one\n",
+    "        lines = [line.strip() for line in response.strip().splitlines() if line.strip().startswith(\"{\")]\n",
+    "\n",
+    "        return [json.loads(line) for line in lines]\n",
+    "\n",
+    "    except Exception as e:\n",
+    "        raise Exception(f\"Model query failed: {str(e)}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "mxuwLUsVlBlY"
+   },
+   "source": [
+    "## Output Formatter"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "IAKfqgZIlGuP"
+   },
+   "outputs": [],
+   "source": [
+    "def save_dataset(records, file_format: str, file_name: str):\n",
+    "    df = pd.DataFrame(records)\n",
+    "    print(df.shape)\n",
+    "    if file_format == \".csv\":\n",
+    "        df.to_csv(file_name, index=False)\n",
+    "    elif file_format == \".tsv\":\n",
+    "        df.to_csv(file_name, sep=\"\\t\", index=False)\n",
+    "    elif file_format == \".jsonl\":\n",
+    "        with open(file_name, \"w\") as f:\n",
+    "            for record in records:\n",
+    "                f.write(json.dumps(record) + \"\\n\")\n",
+    "    elif file_format == \".json\":\n",
+    "        # orient=\"records\" never writes the index, so no index argument is needed\n",
+    "        df.to_json(file_name, orient=\"records\")\n",
+    "    else:\n",
+    "        raise ValueError(\"Unsupported file format\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "gkpkQ0nal_5B"
+   },
+   "outputs": [],
+   "source": [
+    "def generate_dataset(\n",
+    "    model_name: str,\n",
+    "    business_case: str,\n",
+    "    num_records: int = 100,\n",
+    "    schema_text: str = None,\n",
+    "    file_format: str = '.jsonl',\n",
+    "    file_name: str = 'test_dataset.jsonl'\n",
+    "):\n",
+    "    \"\"\"\n",
+    "    Generates a synthetic dataset using an LLM based on the given business case and optional schema.\n",
+    "\n",
+    "    Returns:\n",
+    "        Tuple[str, pd.DataFrame | None]: A status message and a preview DataFrame (first 10 rows) if successful.\n",
+    "    \"\"\"\n",
+    "    try:\n",
+    "        # Validate number of records\n",
+    "        if num_records <= 10:\n",
+    "            return \"❌ Error: Number of records must be greater than 10.\", None\n",
+    "        if num_records > 1000:\n",
+    "            return \"❌ Error: Number of records must be less than or equal to 1000.\", None\n",
+    "\n",
+    "        # Validate file format\n",
+    "        if file_format not in FILE_FORMATS:\n",
+    "            return f\"❌ Error: Invalid file format '{file_format}'. Supported formats: {FILE_FORMATS}\", None\n",
+    "\n",
+    "        # Ensure file name has correct extension\n",
+    "        if not file_name.endswith(file_format):\n",
+    "            file_name += file_format\n",
+    "\n",
+    "        # Generate the prompt and query the model\n",
+    "        prompt = get_user_prompt(business_case, schema_text, num_records)\n",
+    "        records = query_llm(model_name, prompt)\n",
+    "\n",
+    "        if not records:\n",
+    "            return \"❌ Error: No valid records were generated by the model.\", None\n",
+    "\n",
+    "        # Save dataset\n",
+    "        save_dataset(records, file_format, file_name)\n",
+    "\n",
+    "        # Prepare preview\n",
+    "        df = pd.DataFrame(records)\n",
+    "        preview = df.head(10)\n",
+    "\n",
+    "        success_message = (\n",
+    "            f\"✅ Generated {len(records)} records successfully!\\n\"\n",
+    "            f\"📁 Saved to: {file_name}\\n\"\n",
+    "        )\n",
+    "\n",
+    "        return success_message, preview\n",
+    "\n",
+    "    except Exception as e:\n",
+    "        return f\"❌ Error: {str(e)}\", None"
+   ]
+  },
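+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The pipeline can also be exercised directly, without the UI. A minimal sketch, assuming `OPENAI_API_KEY` is set in Colab Secrets (any other configured model name works the same way):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Direct call, bypassing the UI (assumes OPENAI_API_KEY is available)\n",
+    "status, preview = generate_dataset(\n",
+    "    model_name='GPT-4o mini',\n",
+    "    business_case='I want to generate a restaurant dataset',\n",
+    "    num_records=12,\n",
+    "    schema_text=DEFAULT_SCHEMA_TEXT,\n",
+    "    file_format='.jsonl',\n",
+    "    file_name='restaurant_sample'\n",
+    ")\n",
+    "print(status)\n",
+    "preview"
+   ]
+  },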
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 702
+    },
+    "id": "Z9WdaSfFUakj",
+    "outputId": "2fbce2c5-a6d3-4dd8-a9d2-0e38c18d202e"
+   },
+   "outputs": [],
+   "source": [
+    "with gr.Blocks(title=\"Synthetic Dataset Generator\", theme=gr.themes.Monochrome()) as interface:\n",
+    "    tokenizer = None\n",
+    "    inputs = None\n",
+    "    hf_model = None\n",
+    "    outputs = None\n",
+    "\n",
+    "    gr.Markdown(\"# Dataset Generator\")\n",
+    "    gr.Markdown(\"Generate synthetic datasets using AI models\")\n",
+    "\n",
+    "    with gr.Row():\n",
+    "        with gr.Column(scale=2):\n",
+    "            schema_input = gr.Textbox(\n",
+    "                label=\"Schema\",\n",
+    "                value=DEFAULT_SCHEMA_TEXT,\n",
+    "                lines=15,\n",
+    "                placeholder=\"Define your dataset schema here... Please follow this format: Name (TYPE) - Description, example: Example\"\n",
+    "            )\n",
+    "\n",
+    "            business_case_input = gr.Textbox(\n",
+    "                label=\"Business Case\",\n",
+    "                value=\"I want to generate a restaurant dataset\",\n",
+    "                lines=1,\n",
+    "                placeholder=\"Enter business case description...\"\n",
+    "            )\n",
+    "\n",
+    "            with gr.Row():\n",
+    "                model_dropdown = gr.Dropdown(\n",
+    "                    label=\"Model\",\n",
+    "                    choices=list(MODELS.keys()),\n",
+    "                    value=list(MODELS.keys())[0],\n",
+    "                    interactive=True\n",
+    "                )\n",
+    "\n",
+    "                nr_records_input = gr.Number(\n",
+    "                    label=\"Number of records\",\n",
+    "                    value=27,\n",
+    "                    minimum=11,\n",
+    "                    maximum=1000,\n",
+    "                    step=1\n",
+    "                )\n",
+    "\n",
+    "            with gr.Row():\n",
+    "                filename_input = gr.Textbox(\n",
+    "                    label=\"Save as\",\n",
+    "                    value=\"restaurant_dataset\",\n",
+    "                    placeholder=\"Enter filename (extension will be added automatically)\"\n",
+    "                )\n",
+    "\n",
+    "                file_format_dropdown = gr.Dropdown(\n",
+    "                    label=\"File format\",\n",
+    "                    choices=FILE_FORMATS,\n",
+    "                    value=FILE_FORMATS[0],\n",
+    "                    interactive=True\n",
+    "                )\n",
+    "\n",
+    "            generate_btn = gr.Button(\"🚀 Generate\", variant=\"secondary\", size=\"lg\")\n",
+    "\n",
+    "        with gr.Column(scale=1):\n",
+    "            gr.Markdown(\"\"\"\n",
+    "            ### 📝 Dataset Generation Instructions\n",
+    "\n",
+    "            1. **🗂 Schema** – Define your dataset structure\n",
+    "            *(default: restaurant schema provided)*\n",
+    "            2. **💡 Business Case** – Enter a prompt to guide the AI for generating data\n",
+    "            3. **🤖 Model** – Choose your AI model: GPT, Claude, Gemini, or Hugging Face\n",
+    "            4. **📊 Number of Records** – Specify entries to generate\n",
+    "            *(min: 11, max: 1000)*\n",
+    "            5. **📁 File Format** – Select output type: `.csv`, `.tsv`, `.jsonl`, or `.json`\n",
+    "            6. **💾 Save As** – Provide a filename *(extension auto-added)*\n",
+    "            7. **🚀 Generate** – Click **Generate** to create your dataset\n",
+    "\n",
+    "            ### 🔧 Requirements\n",
+    "\n",
+    "            Set API keys in Colab’s Secrets section:\n",
+    "            `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `GOOGLE_API_KEY`, `HF_TOKEN`\n",
+    "            \"\"\")\n",
+    "    output_status = gr.Textbox(\n",
+    "        label=\"Status\",\n",
+    "        lines=4,\n",
+    "        interactive=False\n",
+    "    )\n",
+    "\n",
+    "    output_preview = gr.Dataframe(\n",
+    "        label=\"Preview (first 10 rows)\",\n",
+    "        interactive=False,\n",
+    "        wrap=True\n",
+    "    )\n",
+    "\n",
+    "    generate_btn.click(\n",
+    "        fn=generate_dataset,\n",
+    "        inputs=[\n",
+    "            model_dropdown,\n",
+    "            business_case_input,\n",
+    "            nr_records_input,\n",
+    "            schema_input,\n",
+    "            file_format_dropdown,\n",
+    "            filename_input\n",
+    "        ],\n",
+    "        outputs=[output_status, output_preview]\n",
+    "    )\n",
+    "\n",
+    "interface.launch(debug=True)\n",
+    "\n",
+    "# Free the GPU once the UI has been stopped\n",
+    "del tokenizer, inputs, hf_model, outputs\n",
+    "gc.collect()\n",
+    "torch.cuda.empty_cache()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "w-ewbsjInopm"
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "gpuType": "T4",
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}