From 756bd0b623be3b8906b677e50ad7f119ae842d14 Mon Sep 17 00:00:00 2001
From: Krabulek
Date: Fri, 19 Sep 2025 13:45:07 +0200
Subject: [PATCH] Week 3 exercise - Intelligent Dataset Generator

---
 .../intelligent_dataset_generator.ipynb | 600 ++++++++++++++++++
 1 file changed, 600 insertions(+)
 create mode 100644 week3/community-contributions/intelligent_dataset_generator.ipynb

diff --git a/week3/community-contributions/intelligent_dataset_generator.ipynb b/week3/community-contributions/intelligent_dataset_generator.ipynb
new file mode 100644
index 0000000..9a374a1
--- /dev/null
+++ b/week3/community-contributions/intelligent_dataset_generator.ipynb
@@ -0,0 +1,600 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "QTJt9pwUTbHo"
+   },
+   "source": [
+    "# Intelligent Synthetic Dataset Generator\n",
+    "\n",
+    "An AI-powered tool that creates realistic synthetic datasets for any business case. Provide your own schema, or let the model intelligently design one for you.\n",
+    "\n",
+    "It works with the Claude, Gemini, and GPT APIs, as well as local Hugging Face models."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "l_FljmlTUoka"
+   },
+   "source": [
+    "## Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "aONqZ-SjUJdg",
+    "outputId": "1f5c7b2e-95f0-4f23-cf01-2bd5bda0807a"
+   },
+   "outputs": [],
+   "source": [
+    "!pip install -q requests bitsandbytes anthropic openai gradio"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Ub1unBFvTatE"
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import requests\n",
+    "import json\n",
+    "from google.colab import userdata\n",
+    "\n",
+    "from openai import OpenAI\n",
+    "import anthropic\n",
+    "from huggingface_hub import login\n",
+    "from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
+    "import torch\n",
+    "import pandas as pd\n",
+    "\n",
+    "import gradio as gr\n",
+    "import gc"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "viZNPtObUOcz"
+   },
+   "outputs": [],
+   "source": [
+    "hf_token = userdata.get('HF_TOKEN')\n",
+    "openai_api_key = userdata.get('OPENAI_API_KEY')\n",
+    "anthropic_api_key = userdata.get('ANTHROPIC_API_KEY')\n",
+    "google_api_key = userdata.get('GOOGLE_API_KEY')\n",
+    "\n",
+    "login(hf_token, add_to_git_credential=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "9Q94S6JTUWn5"
+   },
+   "outputs": [],
+   "source": [
+    "# 4-bit NF4 quantization so the open models fit on a Colab T4 GPU\n",
+    "quant_config = BitsAndBytesConfig(\n",
+    "    load_in_4bit=True,\n",
+    "    bnb_4bit_use_double_quant=True,\n",
+    "    bnb_4bit_compute_dtype=torch.bfloat16,\n",
+    "    bnb_4bit_quant_type=\"nf4\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "mrjdVEpaUxHz"
+   },
+   "source": [
+    "## Configuration"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "LvNE6foEUPaz"
+   },
+   "outputs": [],
+   "source": [
+    "LLAMA = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n",
+    "PHI3 = \"microsoft/Phi-3-mini-4k-instruct\"\n",
+    "GEMMA2 = \"google/gemma-2-2b-it\"\n",
+    "GPT = \"gpt-4o-mini\"\n",
+    "CLAUDE = \"claude-3-haiku-20240307\"\n",
+    "GEMINI = \"gemini-2.0-flash\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "tvafTFD8XmaO"
+   },
+   "outputs": [],
+   "source": [
+    "MODELS = {\n",
+    "    # Open models run locally via Transformers (4-bit quantized)\n",
+    "    'Llama 3.1': LLAMA,\n",
+    "    'Phi 3 mini': PHI3,\n",
+    "    'Gemma 2': GEMMA2,\n",
+    "    # Hosted models called via their APIs\n",
+    "    'GPT-4o mini': GPT,\n",
+    "    'Claude 3 Haiku': CLAUDE,\n",
+    "    'Gemini 2.0 Flash': GEMINI,\n",
+    "}\n",
+    "\n",
+    "HF_MODELS = [LLAMA, PHI3, GEMMA2]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "2LZqA9QXXl0t"
+   },
+   "outputs": [],
+   "source": [
+    "FILE_FORMATS = [\".csv\", \".tsv\", \".jsonl\", \".json\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "d6EnN7SVXhza",
+    "outputId": "55f6ac4d-adeb-4216-b2a8-d67524b005d3"
+   },
+   "outputs": [],
+   "source": [
+    "SCHEMA = [\n",
+    "    (\"Name\", \"TEXT\", \"Name of the restaurant\", \"Blue River Bistro\"),\n",
+    "    (\"Address\", \"TEXT\", \"Restaurant address\", \"742 Evergreen Terrace, Springfield, IL 62704\"),\n",
+    "    (\"Type\", \"TEXT\", \"Cuisine type\", 'One of [\"Thai\",\"Mediterranean\",\"Vegan\",\"Steakhouse\",\"Japanese\"] or other potential types'),\n",
+    "    (\"Average Price\", \"TEXT\", \"Average meal price\", \"$45, or '--' if unknown\"),\n",
+    "    (\"Year\", \"INT\", \"Year of restaurant opening\", 2015),\n",
+    "    (\"Menu\", \"Array\", \"List of meals\", '[\"Grilled Salmon\", \"Caesar Salad\", \"Pad Thai\", \"Margherita Pizza\", ...]'),\n",
+    "]\n",
+    "\n",
+    "DEFAULT_SCHEMA_TEXT = \"\\n\".join([f\"{i+1}. {col[0]} ({col[1]}) - {col[2]}, example: {col[3]}\" for i, col in enumerate(SCHEMA)])\n",
+    "print(DEFAULT_SCHEMA_TEXT)"
+   ]
+  },
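+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The schema is just a list of `(name, type, description, example)` tuples, so it is easy to swap in your own. A purely illustrative sketch for a different business case (hypothetical columns, rendered with the same join as above):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hypothetical example: an employee dataset instead of restaurants\n",
+    "CUSTOM_SCHEMA = [\n",
+    "    (\"Full Name\", \"TEXT\", \"Employee full name\", \"Jane Doe\"),\n",
+    "    (\"Department\", \"TEXT\", \"Department name\", \"Engineering\"),\n",
+    "    (\"Salary\", \"INT\", \"Annual salary in USD\", 85000),\n",
+    "]\n",
+    "\n",
+    "custom_schema_text = \"\\n\".join([f\"{i+1}. {col[0]} ({col[1]}) - {col[2]}, example: {col[3]}\" for i, col in enumerate(CUSTOM_SCHEMA)])\n",
+    "print(custom_schema_text)"
+   ]
+  },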
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "W-46TDTOXiS7"
+   },
+   "outputs": [],
+   "source": [
+    "system_prompt = \"\"\"\n",
+    "You are an expert in generating synthetic datasets tailored to a given business case and user requirements.\n",
+    "If the user does not specify output columns, infer and create the most appropriate columns based on your expertise.\n",
+    "Do NOT repeat column values from one row to another. Only output valid JSONL without any comments.\n",
+    "\"\"\"\n",
+    "\n",
+    "\n",
+    "def get_user_prompt(business_case, schema_text, nr_records):\n",
+    "    prompt = f\"The business case is: {business_case}.\\nGenerate {nr_records} rows of data in JSONL format.\\n\"\n",
+    "\n",
+    "    if schema_text is not None:\n",
+    "        prompt += f\"Each line should be a JSON object with the following fields: \\n{schema_text}\\n\"\n",
+    "\n",
+    "    return prompt"
+   ]
+  },
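+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Optional sanity check: render the user prompt for the default schema to see exactly what will be sent to the model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Preview the prompt for a small request (illustrative only)\n",
+    "print(get_user_prompt(\"I want to generate a restaurant dataset\", DEFAULT_SCHEMA_TEXT, 15))"
+   ]
+  },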
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "gPf1GcAwhwa_"
+   },
+   "source": [
+    "## LLM handler"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Tf-WEQUKhY-z"
+   },
+   "outputs": [],
+   "source": [
+    "def ask_gpt(model: str, user_prompt: str):\n",
+    "    client = OpenAI(api_key=openai_api_key)\n",
+    "    messages = [\n",
+    "        {\"role\": \"system\", \"content\": system_prompt},\n",
+    "        {\"role\": \"user\", \"content\": user_prompt}\n",
+    "    ]\n",
+    "    response = client.chat.completions.create(\n",
+    "        model=model,\n",
+    "        messages=messages,\n",
+    "        temperature=0.7\n",
+    "    )\n",
+    "    content = response.choices[0].message.content\n",
+    "\n",
+    "    return content"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "856pnIHahzDd"
+   },
+   "outputs": [],
+   "source": [
+    "def ask_claude(model: str, user_prompt: str):\n",
+    "    client = anthropic.Anthropic(api_key=anthropic_api_key)\n",
+    "    response = client.messages.create(\n",
+    "        model=model,\n",
+    "        messages=[{\"role\": \"user\", \"content\": user_prompt}],\n",
+    "        max_tokens=4000,\n",
+    "        temperature=0.7,\n",
+    "        system=system_prompt\n",
+    "    )\n",
+    "    content = response.content[0].text\n",
+    "\n",
+    "    return content"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "p0AfSbcBiUlg"
+   },
+   "outputs": [],
+   "source": [
+    "def ask_gemini(model: str, user_prompt: str):\n",
+    "    # Gemini is called through its OpenAI-compatible endpoint\n",
+    "    client = OpenAI(\n",
+    "        api_key=google_api_key,\n",
+    "        base_url=\"https://generativelanguage.googleapis.com/v1beta/openai/\"\n",
+    "    )\n",
+    "    messages = [\n",
+    "        {\"role\": \"system\", \"content\": system_prompt},\n",
+    "        {\"role\": \"user\", \"content\": user_prompt}\n",
+    "    ]\n",
+    "    response = client.chat.completions.create(\n",
+    "        model=model,\n",
+    "        messages=messages,\n",
+    "        temperature=0.7\n",
+    "    )\n",
+    "    content = response.choices[0].message.content\n",
+    "\n",
+    "    return content"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "K9LZZPJ9irrH"
+   },
+   "outputs": [],
+   "source": [
+    "def ask_hf(model: str, user_prompt: str):\n",
+    "    global tokenizer, inputs, hf_model, outputs\n",
+    "\n",
+    "    messages = [\n",
+    "        {\"role\": \"system\", \"content\": system_prompt},\n",
+    "        {\"role\": \"user\", \"content\": user_prompt}\n",
+    "    ]\n",
+    "\n",
+    "    tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)\n",
+    "    tokenizer.pad_token = tokenizer.eos_token\n",
+    "    inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors=\"pt\").to(\"cuda\")\n",
+    "    # NOTE: the model is cached globally, so switching between HF models in one session keeps the first one loaded\n",
+    "    if hf_model is None:\n",
+    "        hf_model = AutoModelForCausalLM.from_pretrained(model, device_map=\"auto\", quantization_config=quant_config)\n",
+    "    outputs = hf_model.generate(inputs, max_new_tokens=4000)\n",
+    "\n",
+    "    # Decode only the newly generated tokens; this works regardless of the model's chat template\n",
+    "    content = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True)\n",
+    "\n",
+    "    return content"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "eu7Sv3bDhXdI"
+   },
+   "outputs": [],
+   "source": [
+    "def query_llm(model_name: str, user_prompt):\n",
+    "    try:\n",
+    "        model = MODELS[model_name]\n",
+    "\n",
+    "        if \"gpt\" in model.lower():\n",
+    "            response = ask_gpt(model, user_prompt)\n",
+    "\n",
+    "        elif \"claude\" in model.lower():\n",
+    "            response = ask_claude(model, user_prompt)\n",
+    "\n",
+    "        elif \"gemini\" in model.lower():\n",
+    "            response = ask_gemini(model, user_prompt)\n",
+    "\n",
+    "        elif model in HF_MODELS:\n",
+    "            response = ask_hf(model, user_prompt)\n",
+    "\n",
+    "        else:\n",
+    "            raise ValueError(f\"Unsupported model. Use one of {', '.join(MODELS.keys())}\")\n",
+    "\n",
+    "        # Keep only lines that look like JSON objects, then parse each one\n",
+    "        lines = [line.strip() for line in response.strip().splitlines() if line.strip().startswith(\"{\")]\n",
+    "\n",
+    "        return [json.loads(line) for line in lines]\n",
+    "\n",
+    "    except Exception as e:\n",
+    "        raise Exception(f\"Model query failed: {str(e)}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "mxuwLUsVlBlY"
+   },
+   "source": [
+    "## Output Formatter"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "IAKfqgZIlGuP"
+   },
+   "outputs": [],
+   "source": [
+    "def save_dataset(records, file_format: str, file_name: str):\n",
+    "    df = pd.DataFrame(records)\n",
+    "    print(df.shape)\n",
+    "    if file_format == \".csv\":\n",
+    "        df.to_csv(file_name, index=False)\n",
+    "    elif file_format == \".tsv\":\n",
+    "        df.to_csv(file_name, sep=\"\\t\", index=False)\n",
+    "    elif file_format == \".jsonl\":\n",
+    "        with open(file_name, \"w\") as f:\n",
+    "            for record in records:\n",
+    "                f.write(json.dumps(record) + \"\\n\")\n",
+    "    elif file_format == \".json\":\n",
+    "        # orient=\"records\" never writes the index, so no index argument is needed\n",
+    "        df.to_json(file_name, orient=\"records\")\n",
+    "    else:\n",
+    "        raise ValueError(\"Unsupported file format\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "gkpkQ0nal_5B"
+   },
+   "outputs": [],
+   "source": [
+    "def generate_dataset(\n",
+    "    model_name: str,\n",
+    "    business_case: str,\n",
+    "    num_records: int = 100,\n",
+    "    schema_text: str = None,\n",
+    "    file_format: str = '.jsonl',\n",
+    "    file_name: str = 'test_dataset.jsonl'\n",
+    "):\n",
+    "    \"\"\"\n",
+    "    Generates a synthetic dataset using an LLM based on the given business case and optional schema.\n",
+    "\n",
+    "    Returns:\n",
+    "        Tuple[str, pd.DataFrame | None]: A status message and a preview DataFrame (first 10 rows) if successful.\n",
+    "    \"\"\"\n",
+    "    try:\n",
+    "        # Validate number of records\n",
+    "        if num_records <= 10:\n",
+    "            return \"❌ Error: Number of records must be greater than 10.\", None\n",
+    "        if num_records > 1000:\n",
+    "            return \"❌ Error: Number of records must be less than or equal to 1000.\", None\n",
+    "\n",
+    "        # Validate file format\n",
+    "        if file_format not in FILE_FORMATS:\n",
+    "            return f\"❌ Error: Invalid file format '{file_format}'. Supported formats: {FILE_FORMATS}\", None\n",
+    "\n",
+    "        # Ensure file name has correct extension\n",
+    "        if not file_name.endswith(file_format):\n",
+    "            file_name += file_format\n",
+    "\n",
+    "        # Generate the prompt and query the model\n",
+    "        prompt = get_user_prompt(business_case, schema_text, num_records)\n",
+    "        records = query_llm(model_name, prompt)\n",
+    "\n",
+    "        if not records:\n",
+    "            return \"❌ Error: No valid records were generated by the model.\", None\n",
+    "\n",
+    "        # Save dataset\n",
+    "        save_dataset(records, file_format, file_name)\n",
+    "\n",
+    "        # Prepare preview\n",
+    "        df = pd.DataFrame(records)\n",
+    "        preview = df.head(10)\n",
+    "\n",
+    "        success_message = (\n",
+    "            f\"✅ Generated {len(records)} records successfully!\\n\"\n",
+    "            f\"📁 Saved to: {file_name}\\n\"\n",
+    "        )\n",
+    "\n",
+    "        return success_message, preview\n",
+    "\n",
+    "    except Exception as e:\n",
+    "        return f\"❌ Error: {str(e)}\", None"
+   ]
+  },
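+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The pipeline can also be exercised directly, without the UI. A minimal sketch, assuming `OPENAI_API_KEY` is set in Colab Secrets (any other configured model name works the same way):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Direct call, bypassing the UI (assumes OPENAI_API_KEY is available)\n",
+    "status, preview = generate_dataset(\n",
+    "    model_name='GPT-4o mini',\n",
+    "    business_case='I want to generate a restaurant dataset',\n",
+    "    num_records=12,\n",
+    "    schema_text=DEFAULT_SCHEMA_TEXT,\n",
+    "    file_format='.jsonl',\n",
+    "    file_name='restaurant_sample'\n",
+    ")\n",
+    "print(status)\n",
+    "preview"
+   ]
+  },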
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 702
+    },
+    "id": "Z9WdaSfFUakj",
+    "outputId": "2fbce2c5-a6d3-4dd8-a9d2-0e38c18d202e"
+   },
+   "outputs": [],
+   "source": [
+    "with gr.Blocks(title=\"Synthetic Dataset Generator\", theme=gr.themes.Monochrome()) as interface:\n",
+    "    tokenizer = None\n",
+    "    inputs = None\n",
+    "    hf_model = None\n",
+    "    outputs = None\n",
+    "\n",
+    "    gr.Markdown(\"# Dataset Generator\")\n",
+    "    gr.Markdown(\"Generate synthetic datasets using AI models\")\n",
+    "\n",
+    "    with gr.Row():\n",
+    "        with gr.Column(scale=2):\n",
+    "            schema_input = gr.Textbox(\n",
+    "                label=\"Schema\",\n",
+    "                value=DEFAULT_SCHEMA_TEXT,\n",
+    "                lines=15,\n",
+    "                placeholder=\"Define your dataset schema here... Please follow this format: Name (TYPE) - Description, example: Example\"\n",
+    "            )\n",
+    "\n",
+    "            business_case_input = gr.Textbox(\n",
+    "                label=\"Business Case\",\n",
+    "                value=\"I want to generate a restaurant dataset\",\n",
+    "                lines=1,\n",
+    "                placeholder=\"Enter business case description...\"\n",
+    "            )\n",
+    "\n",
+    "            with gr.Row():\n",
+    "                model_dropdown = gr.Dropdown(\n",
+    "                    label=\"Model\",\n",
+    "                    choices=list(MODELS.keys()),\n",
+    "                    value=list(MODELS.keys())[0],\n",
+    "                    interactive=True\n",
+    "                )\n",
+    "\n",
+    "                nr_records_input = gr.Number(\n",
+    "                    label=\"Number of records\",\n",
+    "                    value=27,\n",
+    "                    minimum=11,\n",
+    "                    maximum=1000,\n",
+    "                    step=1\n",
+    "                )\n",
+    "\n",
+    "            with gr.Row():\n",
+    "                filename_input = gr.Textbox(\n",
+    "                    label=\"Save as\",\n",
+    "                    value=\"restaurant_dataset\",\n",
+    "                    placeholder=\"Enter filename (extension will be added automatically)\"\n",
+    "                )\n",
+    "\n",
+    "                file_format_dropdown = gr.Dropdown(\n",
+    "                    label=\"File format\",\n",
+    "                    choices=FILE_FORMATS,\n",
+    "                    value=FILE_FORMATS[0],\n",
+    "                    interactive=True\n",
+    "                )\n",
+    "\n",
+    "            generate_btn = gr.Button(\"🚀 Generate\", variant=\"secondary\", size=\"lg\")\n",
+    "\n",
+    "        with gr.Column(scale=1):\n",
+    "            gr.Markdown(\"\"\"\n",
+    "            ### 📝 Dataset Generation Instructions\n",
+    "\n",
+    "            1. **🗂 Schema** – Define your dataset structure\n",
+    "            *(default: restaurant schema provided)*\n",
+    "            2. **💡 Business Case** – Enter a prompt to guide the AI for generating data\n",
+    "            3. **🤖 Model** – Choose your AI model: GPT, Claude, Gemini, or Hugging Face\n",
+    "            4. **📊 Number of Records** – Specify entries to generate\n",
+    "            *(min: 11, max: 1000)*\n",
+    "            5. **📁 File Format** – Select output type: `.csv`, `.tsv`, `.jsonl`, or `.json`\n",
+    "            6. **💾 Save As** – Provide a filename *(extension auto-added)*\n",
+    "            7. **🚀 Generate** – Click **Generate** to create your dataset\n",
+    "\n",
+    "            ### 🔧 Requirements\n",
+    "\n",
+    "            Set API keys in Colab’s Secrets section:\n",
+    "            `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `GOOGLE_API_KEY`, `HF_TOKEN`\n",
+    "            \"\"\")\n",
+    "    output_status = gr.Textbox(\n",
+    "        label=\"Status\",\n",
+    "        lines=4,\n",
+    "        interactive=False\n",
+    "    )\n",
+    "\n",
+    "    output_preview = gr.Dataframe(\n",
+    "        label=\"Preview (first 10 rows)\",\n",
+    "        interactive=False,\n",
+    "        wrap=True\n",
+    "    )\n",
+    "\n",
+    "    generate_btn.click(\n",
+    "        fn=generate_dataset,\n",
+    "        inputs=[\n",
+    "            model_dropdown,\n",
+    "            business_case_input,\n",
+    "            nr_records_input,\n",
+    "            schema_input,\n",
+    "            file_format_dropdown,\n",
+    "            filename_input\n",
+    "        ],\n",
+    "        outputs=[output_status, output_preview]\n",
+    "    )\n",
+    "\n",
+    "interface.launch(debug=True)\n",
+    "\n",
+    "# Free the GPU once the UI has been stopped\n",
+    "del tokenizer, inputs, hf_model, outputs\n",
+    "gc.collect()\n",
+    "torch.cuda.empty_cache()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "w-ewbsjInopm"
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "gpuType": "T4",
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}