From de28580c722a8827dac9cafebd5244124a7d159f Mon Sep 17 00:00:00 2001 From: Umar Javed Date: Tue, 21 Oct 2025 17:49:37 +0500 Subject: [PATCH] Synthetic data writer --- .../w3day5_synthetic_dataset_generator.ipynb | 540 ++++++++++++++++++ 1 file changed, 540 insertions(+) create mode 100644 week3/community-contributions/w3day5_synthetic_dataset_generator.ipynb diff --git a/week3/community-contributions/w3day5_synthetic_dataset_generator.ipynb b/week3/community-contributions/w3day5_synthetic_dataset_generator.ipynb new file mode 100644 index 0000000..179db82 --- /dev/null +++ b/week3/community-contributions/w3day5_synthetic_dataset_generator.ipynb @@ -0,0 +1,540 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install -q bitsandbytes>=0.43.1 accelerate transformers torch sentencepiece" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "💻 CPU mode - loading without quantization...\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2fa644e735144ab0a238f031bf7c6c7a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "model.safetensors.index.json: 0%| | 0.00/23.9k [00:00\n", + "Trying alternative loading method...\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "17d3da1874734c7fbf542b239f6f5ba0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Fetching 4 files: 0%| | 0/4 [00:00\n", + "Traceback (most recent call last):\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/tqdm/std.py\", line 1148, in __del__\n", + " self.close()\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/tqdm/notebook.py\", line 279, in close\n", + " self.disp(bar_style='danger', check_delay=False)\n", + "AttributeError: 'tqdm' object has no attribute 'disp'\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/tqdm/std.py\", line 1148, in __del__\n", + " self.close()\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/tqdm/notebook.py\", line 279, in close\n", + " self.disp(bar_style='danger', check_delay=False)\n", + "AttributeError: 'tqdm' object has no attribute 'disp'\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/tqdm/std.py\", line 1148, in __del__\n", + " self.close()\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/tqdm/notebook.py\", line 279, in close\n", + " self.disp(bar_style='danger', check_delay=False)\n", + "AttributeError: 'tqdm' object has no attribute 'disp'\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/tqdm/std.py\", line 1148, in __del__\n", + " self.close()\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/tqdm/notebook.py\", line 279, in close\n", + " self.disp(bar_style='danger', check_delay=False)\n", + "AttributeError: 'tqdm' object has no attribute 'disp'\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/tqdm/std.py\", line 1148, in __del__\n", + " self.close()\n", + " File 
\"/opt/miniconda3/lib/python3.13/site-packages/tqdm/notebook.py\", line 279, in close\n", + " self.disp(bar_style='danger', check_delay=False)\n", + "AttributeError: 'tqdm' object has no attribute 'disp'\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/tqdm/std.py\", line 1148, in __del__\n", + " self.close()\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/tqdm/notebook.py\", line 279, in close\n", + " self.disp(bar_style='danger', check_delay=False)\n", + "AttributeError: 'tqdm' object has no attribute 'disp'\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/tqdm/std.py\", line 1148, in __del__\n", + " self.close()\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/tqdm/notebook.py\", line 279, in close\n", + " self.disp(bar_style='danger', check_delay=False)\n", + "AttributeError: 'tqdm' object has no attribute 'disp'\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/tqdm/std.py\", line 1148, in __del__\n", + " self.close()\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/tqdm/notebook.py\", line 279, in close\n", + " self.disp(bar_style='danger', check_delay=False)\n", + "AttributeError: 'tqdm' object has no attribute 'disp'\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Llama model completely failed: \n", + "Will use OpenAI only mode.\n" + ] + } + ], + "source": [ + "import torch\n", + "import pandas as pd\n", + "import random\n", + "from io import StringIO\n", + "from openai import OpenAI\n", + "import gradio as gr\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n", + "from dotenv import load_dotenv\n", + "import os\n", + "\n", + "load_dotenv(override=True)\n", + "openai = OpenAI()\n", + "\n", + "LLAMA = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n", + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "\n", + "try:\n", + " tokenizer = AutoTokenizer.from_pretrained(LLAMA)\n", + " tokenizer.pad_token = tokenizer.eos_token\n", + " \n", + " if torch.cuda.is_available():\n", + " print(\"🚀 CUDA available - loading with quantization...\")\n", + " quant_config = BitsAndBytesConfig(\n", + " load_in_4bit=True,\n", + " bnb_4bit_use_double_quant=True,\n", + " bnb_4bit_compute_dtype=torch.bfloat16,\n", + " bnb_4bit_quant_type=\"nf4\"\n", + " )\n", + " model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map=\"auto\", quantization_config=quant_config)\n", + " else:\n", + " print(\"💻 CPU mode - loading without quantization...\")\n", + " model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map=\"cpu\", torch_dtype=torch.float16)\n", + " \n", + " print(\"Llama model loaded successfully!\")\n", + "except Exception as e:\n", + " print(f\"Llama model failed to load: {e}\")\n", + " print(\"Trying alternative loading method...\")\n", + " try:\n", + " tokenizer = AutoTokenizer.from_pretrained(LLAMA)\n", + " tokenizer.pad_token = tokenizer.eos_token\n", + " model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map=\"cpu\", torch_dtype=torch.float32)\n", + " print(\"Llama model loaded in CPU mode!\")\n", + " except Exception as e2:\n", + " print(f\"Llama model completely failed: {e2}\")\n", + " print(\"Will use OpenAI only mode.\")\n", + " model = None\n", + " tokenizer = None\n" + ] + }, + { + "cell_type": "code", + 
"execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def generate_with_openai(dataset_type, num_records, region):\n", + " prompts = {\n", + " \"employees\": f\"Generate {num_records} synthetic employee records with {region} addresses. Include: employee_id, first_name, last_name, email, phone, department, salary, hire_date, address, city, state, country.\",\n", + " \"customers\": f\"Generate {num_records} synthetic customer records with {region} addresses. Include: customer_id, first_name, last_name, email, phone, company, address, city, state, country, registration_date.\",\n", + " \"products\": f\"Generate {num_records} synthetic product records. Include: product_id, name, category, price, description, brand, stock_quantity, supplier, created_date.\",\n", + " \"transactions\": f\"Generate {num_records} synthetic transaction records. Include: transaction_id, customer_id, product_id, amount, quantity, transaction_date, payment_method, status.\"\n", + " }\n", + " \n", + " response = openai.chat.completions.create(\n", + " model=\"gpt-4o-mini\",\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": \"You are a data generation expert. Create realistic, diverse synthetic data in CSV format.\"},\n", + " {\"role\": \"user\", \"content\": prompts[dataset_type]}\n", + " ]\n", + " )\n", + " \n", + " return clean_csv_response(response.choices[0].message.content)\n", + "\n", + "def generate_with_llama(dataset_type, num_records, region):\n", + " if model is None or tokenizer is None:\n", + " return \"❌ Llama model not available. Please use OpenAI option.\"\n", + " \n", + " prompts = {\n", + " \"employees\": f\"Create {num_records} employee records with {region} addresses: employee_id, first_name, last_name, email, phone, department, salary, hire_date, address, city, state, country. Format as CSV.\",\n", + " \"customers\": f\"Create {num_records} customer records with {region} addresses: customer_id, first_name, last_name, email, phone, company, address, city, state, country, registration_date. Format as CSV.\",\n", + " \"products\": f\"Create {num_records} product records: product_id, name, category, price, description, brand, stock_quantity, supplier, created_date. Format as CSV.\",\n", + " \"transactions\": f\"Create {num_records} transaction records: transaction_id, customer_id, product_id, amount, quantity, transaction_date, payment_method, status. 
Format as CSV.\"\n",
+ "    }\n",
+ "    \n",
+ "    try:\n",
+ "        # Tokenize the prompt and generate; slice off the prompt tokens so only newly generated text is decoded\n",
+ "        inputs = tokenizer(prompts[dataset_type], return_tensors=\"pt\").to(device)\n",
+ "        \n",
+ "        with torch.no_grad():\n",
+ "            outputs = model.generate(\n",
+ "                **inputs,\n",
+ "                max_new_tokens=2048,\n",
+ "                temperature=0.7,\n",
+ "                do_sample=True,\n",
+ "                pad_token_id=tokenizer.eos_token_id\n",
+ "            )\n",
+ "        \n",
+ "        generated_tokens = outputs[0][inputs[\"input_ids\"].shape[1]:]\n",
+ "        response = tokenizer.decode(generated_tokens, skip_special_tokens=True)\n",
+ "        return clean_csv_response(response)\n",
+ "    except Exception as e:\n",
+ "        return f\"❌ Error generating with Llama: {str(e)}\"\n",
+ "\n",
+ "def clean_csv_response(response):\n",
+ "    # Strip markdown code fences and any leading language tag (e.g. ```csv ... ```)\n",
+ "    response = response.strip()\n",
+ "    if \"```\" in response:\n",
+ "        parts = response.split(\"```\")\n",
+ "        if len(parts) > 1:\n",
+ "            response = parts[1]\n",
+ "    response = response.strip()\n",
+ "    if response.lower().startswith(\"csv\"):\n",
+ "        response = response[3:].lstrip()\n",
+ "    return response\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import tempfile\n",
+ "\n",
+ "def generate_dataset(dataset_type, num_records, region, model_choice):\n",
+ "    try:\n",
+ "        if model_choice == \"OpenAI GPT-4o-mini\":\n",
+ "            csv_data = generate_with_openai(dataset_type, num_records, region)\n",
+ "        else:\n",
+ "            csv_data = generate_with_llama(dataset_type, num_records, region)\n",
+ "        \n",
+ "        df = pd.read_csv(StringIO(csv_data))\n",
+ "        return df, csv_data, f\"✅ Generated {len(df)} records successfully!\"\n",
+ "    except Exception as e:\n",
+ "        return pd.DataFrame(), \"\", f\"❌ Error: {str(e)}\"\n",
+ "\n",
+ "def download_csv(csv_data):\n",
+ "    # gr.DownloadButton expects a file path, not raw CSV text, so write the data to a temp file\n",
+ "    if not csv_data:\n",
+ "        return None\n",
+ "    with tempfile.NamedTemporaryFile(mode=\"w\", suffix=\".csv\", delete=False, encoding=\"utf-8\", newline=\"\") as f:\n",
+ "        f.write(csv_data)\n",
+ "        return f.name\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "* Running on local URL: http://127.0.0.1:7863\n",
+ "* Running on public URL: https://aaf0c65f7daaafbd21.gradio.live\n",
+ "\n",
+ "This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Traceback (most recent call last):\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/gradio/queueing.py\", line 759, in process_events\n", + " response = await route_utils.call_process_api(\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " ...<5 lines>...\n", + " )\n", + " ^\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/gradio/route_utils.py\", line 354, in call_process_api\n", + " output = await app.get_blocks().process_api(\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " ...<11 lines>...\n", + " )\n", + " ^\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/gradio/blocks.py\", line 2127, in process_api\n", + " data = await self.postprocess_data(block_fn, result[\"prediction\"], state)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/gradio/blocks.py\", line 1910, in postprocess_data\n", + " await processing_utils.async_move_files_to_cache(\n", + " ...<3 lines>...\n", + " )\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/gradio/processing_utils.py\", line 594, in async_move_files_to_cache\n", + " return await client_utils.async_traverse(\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " data, _move_to_cache, client_utils.is_file_obj_with_meta\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " )\n", + " ^\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/gradio_client/utils.py\", line 1197, in async_traverse\n", + " return await func(json_obj)\n", + " ^^^^^^^^^^^^^^^^^^^^\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/gradio/processing_utils.py\", line 560, in _move_to_cache\n", + " elif utils.is_static_file(payload):\n", + " ~~~~~~~~~~~~~~~~~~~~^^^^^^^^^\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/gradio/utils.py\", line 1191, in is_static_file\n", + " return _is_static_file(file_path, _StaticFiles.all_paths)\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/gradio/utils.py\", line 1204, in _is_static_file\n", + " if not file_path.exists():\n", + " ~~~~~~~~~~~~~~~~^^\n", + " File \"/opt/miniconda3/lib/python3.13/pathlib/_abc.py\", line 450, in exists\n", + " self.stat(follow_symlinks=follow_symlinks)\n", + " ~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/opt/miniconda3/lib/python3.13/pathlib/_local.py\", line 515, in stat\n", + " return os.stat(self, follow_symlinks=follow_symlinks)\n", + " ~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + "OSError: [Errno 63] File name too long: 'csv\\ntransaction_id,customer_id,product_id,amount,quantity,transaction_date,payment_method,status\\n1,CUST001,PROD1001,29.99,1,2023-01-15,Credit Card,Completed\\n2,CUST002,PROD1002,15.49,2,2023-01-18,Debit Card,Completed\\n3,CUST003,PROD1003,65.00,1,2023-02-01,PayPal,Pending\\n4,CUST001,PROD1004,10.99,3,2023-02-10,Credit Card,Completed\\n5,CUST004,PROD1005,45.50,1,2023-02-20,Cash,Completed\\n6,CUST005,PROD1006,89.99,1,2023-03-02,Debit Card,Completed\\n7,CUST002,PROD1007,24.99,2,2023-03-14,Credit Card,Cancelled\\n8,CUST003,PROD1008,12.50,4,2023-03-20,PayPal,Completed\\n9,CUST006,PROD1009,150.00,1,2023-04-01,Credit Card,Completed\\n10,CUST007,PROD1010,30.00,2,2023-04-10,Debit 
Card,Pending\\n11,CUST008,PROD1011,5.99,10,2023-04-12,Cash,Completed\\n12,CUST001,PROD1012,70.00,1,2023-05-05,Credit Card,Completed\\n13,CUST009,PROD1013,100.00,1,2023-05-15,PayPal,Completed\\n14,CUST004,PROD1014,45.00,1,2023-05-25,Credit Card,Returned\\n15,CUST002,PROD1015,7.50,5,2023-06-10,Debit Card,Completed\\n16,CUST005,PROD1016,22.00,3,2023-06-12,Cash,Completed\\n17,CUST006,PROD1017,120.00,1,2023-06-20,Credit Card,Pending\\n18,CUST008,PROD1018,80.00,1,2023-07-01,PayPal,Completed\\n19,CUST007,PROD1019,60.00,2,2023-07-05,Credit Card,Completed\\n20,CUST003,PROD1020,15.00,3,2023-07-15,Debit Card,Completed\\n'\n", + "Traceback (most recent call last):\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/gradio/queueing.py\", line 759, in process_events\n", + " response = await route_utils.call_process_api(\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " ...<5 lines>...\n", + " )\n", + " ^\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/gradio/route_utils.py\", line 354, in call_process_api\n", + " output = await app.get_blocks().process_api(\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " ...<11 lines>...\n", + " )\n", + " ^\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/gradio/blocks.py\", line 2127, in process_api\n", + " data = await self.postprocess_data(block_fn, result[\"prediction\"], state)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/gradio/blocks.py\", line 1910, in postprocess_data\n", + " await processing_utils.async_move_files_to_cache(\n", + " ...<3 lines>...\n", + " )\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/gradio/processing_utils.py\", line 594, in async_move_files_to_cache\n", + " return await client_utils.async_traverse(\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " data, _move_to_cache, client_utils.is_file_obj_with_meta\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " )\n", + " ^\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/gradio_client/utils.py\", line 1197, in async_traverse\n", + " return await func(json_obj)\n", + " ^^^^^^^^^^^^^^^^^^^^\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/gradio/processing_utils.py\", line 560, in _move_to_cache\n", + " elif utils.is_static_file(payload):\n", + " ~~~~~~~~~~~~~~~~~~~~^^^^^^^^^\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/gradio/utils.py\", line 1191, in is_static_file\n", + " return _is_static_file(file_path, _StaticFiles.all_paths)\n", + " File \"/opt/miniconda3/lib/python3.13/site-packages/gradio/utils.py\", line 1204, in _is_static_file\n", + " if not file_path.exists():\n", + " ~~~~~~~~~~~~~~~~^^\n", + " File \"/opt/miniconda3/lib/python3.13/pathlib/_abc.py\", line 450, in exists\n", + " self.stat(follow_symlinks=follow_symlinks)\n", + " ~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/opt/miniconda3/lib/python3.13/pathlib/_local.py\", line 515, in stat\n", + " return os.stat(self, follow_symlinks=follow_symlinks)\n", + " ~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + "OSError: [Errno 63] File name too long: 'csv\\nproduct_id,name,category,price,description,brand,stock_quantity,supplier,created_date\\nP001,Wireless Earbuds,Electronics,79.99,\"Noise-cancelling wireless earbuds with touch controls.\",\"SoundWave\",250,\"TechSupply Co.\",2023-08-15\\nP002,Men\\'s Running Shoes,Sportswear,89.99,\"Lightweight and breathable running shoes designed for 
comfort.\",\"FitRun\",150,\"SportyDeals\",2023-09-05\\nP003,4K Ultra HD TV,Electronics,499.99,\"55-inch 4K Ultra HD Smart LED TV with HDR.\",\"VisionMax\",80,\"HomeTech Distributors\",2023-08-20\\nP004,Coffee Maker,Home Appliances,49.99,\"Programmable coffee maker with 12-cup capacity.\",\"BrewMaster\",200,\"Kitchen Supply Inc.\",2023-07-30\\nP005,Water Bottle,Sports Equipment,19.99,\"Insulated stainless steel water bottle, keeps drinks cold for 24 hours.\",\"HydroCool\",500,\"EcoBottles\",2023-09-10\\nP006,Ergonomic Office Chair,Furniture,199.99,\"Comfortable ergonomic chair with lumbar support and adjustable height.\",\"Home Comforts\",75,\"OfficeWorks\",2023-08-28\\nP007,Smart Watch,Electronics,249.99,\"Smart watch with fitness tracking and heart rate monitor.\",\"FitTrack\",120,\"GizmoGadgets\",2023-09-12\\nP008,Yoga Mat,Sports Equipment,29.99,\"Non-slip yoga mat with extra cushioning.\",\"Zen Yoga\",350,\"Wellness Store\",2023-09-15\\nP009,Air Fryer,Home Appliances,89.99,\"Compact air fryer with multiple cooking presets.\",\"CrispyCook\",145,\"KitchenPro\",2023-08-02\\nP010,Wireless Mouse,Electronics,29.99,\"Ergonomic wireless mouse with customizable buttons.\",\"ClickTech\",300,\"Gadget World\",2023-07-25\\nP011,Spice Rack Organization Set,Home Decor,39.99,\"Rotating spice rack with 12 glass jars included.\",\"HomeChef\",210,\"OrganizeIt Co.\",2023-08-17\\nP012,Dumbbell Set,Sports Equipment,99.99,\"Adjustable dumbbell set ranging from 5 to 30 lbs.\",\"StrengthTech\",100,\"Fit Equipment\",2023-09-01\\nP013,Kids\\' Backpack,Accessories,34.99,\"Durable backpack with multiple compartments for school.\",\"KidStyle\",175,\"Backpack Haven\",2023-08-23\\nP014,Digital Camera,Electronics,399.99,\"Compact digital camera with 20 MP and full HD video.\",\"SnapShot\",60,\"Camera Boutique\",2023-09-09\\nP015,Portable Bluetooth Speaker,Electronics,59.99,\"Water-resistant Bluetooth speaker with 12 hours of playtime.\",\"SoundBox\",130,\"Audio Plus\",2023-09-14\\nP016,Electric Toothbrush,Health & Personal Care,59.99,\"Rechargeable electric toothbrush with timer and pressure sensor.\",\"DentalCare\",400,\"HealthFirst Supplies\",2023-08-30\\nP017,Tote Bag,Accessories,24.99,\"Stylish and spacious tote bag for everyday use.\",\"Chic Designs\",300,\"Fashion Hub\",2023-09-06\\nP018,Sneaker Cleaner Kit,Accessories,15.99,\"Complete shoe cleaning kit for all types of sneakers.\",\"FreshFeet\",500,\"CleanKicks\",2023-09-03\\nP019,Camping Tent,Outdoor,129.99,\"Easy setup camping tent for 4 people, weather-resistant.\",\"Outdoors Pro\",85,\"Adventure Outfitters\",2023-08-12\\nP020,LED Desk Lamp,Home Decor,39.99,\"Adjustable LED desk lamp with multiple brightness settings.\",\"BrightEase\",170,\"HomeLight Solutions\",2023-09-08\\n'\n" + ] + } + ], + "source": [ + "with gr.Blocks(\n", + " theme=gr.themes.Soft(\n", + " primary_hue=\"blue\",\n", + " neutral_hue=\"gray\",\n", + " font=[\"Inter\", \"ui-sans-serif\", \"system-ui\"]\n", + " ),\n", + " css=\"\"\"\n", + " .gradio-container { max-width: 1200px !important; margin: auto !important; }\n", + " .header { text-align: center; margin-bottom: 2em; }\n", + " .header h1 { color: #1f2937; font-size: 2.5em; margin-bottom: 0.5em; }\n", + " .header p { color: #6b7280; font-size: 1.1em; }\n", + " .generate-btn { background: linear-gradient(135deg, #3b82f6 0%, #1d4ed8 100%) !important; }\n", + " .generate-btn:hover { transform: translateY(-2px) !important; box-shadow: 0 8px 25px rgba(59, 130, 246, 0.3) !important; }\n", + " .stats-card { background: linear-gradient(135deg, 
#f8fafc 0%, #e2e8f0 100%); border-radius: 12px; padding: 1.5em; margin: 1em 0; }\n", + " \"\"\"\n", + ") as demo:\n", + " \n", + " gr.HTML(\"\"\"\n", + "
\n", + "

Synthetic Dataset Generator

\n", + "

Generate realistic synthetic datasets using AI models for testing and development

\n", + "
\n", + " \"\"\")\n", + " \n", + " with gr.Row():\n", + " with gr.Column(scale=1):\n", + " gr.Markdown(\"### Configuration\")\n", + " \n", + " dataset_type = gr.Dropdown(\n", + " choices=[\"employees\", \"customers\", \"products\", \"transactions\"],\n", + " value=\"employees\",\n", + " label=\"Dataset Type\",\n", + " info=\"Choose the type of data to generate\"\n", + " )\n", + " \n", + " num_records = gr.Slider(\n", + " minimum=5, maximum=100, step=5, value=20,\n", + " label=\"Number of Records\",\n", + " info=\"How many records to generate\"\n", + " )\n", + " \n", + " region = gr.Dropdown(\n", + " choices=[\"US Only\", \"International\", \"Mixed\", \"Europe\", \"Asia\"],\n", + " value=\"US Only\",\n", + " label=\"Geographic Region\",\n", + " info=\"Location for addresses and phone numbers\"\n", + " )\n", + " \n", + " model_choice = gr.Radio(\n", + " choices=[\"OpenAI GPT-4o-mini\", \"Llama 3.1 8B\"],\n", + " value=\"OpenAI GPT-4o-mini\",\n", + " label=\"AI Model\",\n", + " info=\"Choose the AI model for generation\"\n", + " )\n", + " \n", + " generate_btn = gr.Button(\n", + " \"Generate Dataset\",\n", + " variant=\"primary\",\n", + " elem_classes=\"generate-btn\",\n", + " size=\"lg\"\n", + " )\n", + " \n", + " with gr.Column(scale=2):\n", + " gr.Markdown(\"### Generated Dataset\")\n", + " \n", + " status = gr.Markdown(\"Ready to generate your dataset!\")\n", + " \n", + " dataframe_output = gr.Dataframe(\n", + " value=pd.DataFrame(),\n", + " label=\"Dataset Preview\",\n", + " wrap=True\n", + " )\n", + " \n", + " with gr.Row():\n", + " csv_output = gr.Textbox(\n", + " value=\"\",\n", + " label=\"CSV Data\",\n", + " lines=10,\n", + " max_lines=15\n", + " )\n", + " \n", + " download_btn = gr.DownloadButton(\n", + " \"Download CSV\",\n", + " elem_id=\"download-btn\"\n", + " )\n", + " \n", + " generate_btn.click(\n", + " generate_dataset,\n", + " inputs=[dataset_type, num_records, region, model_choice],\n", + " outputs=[dataframe_output, csv_output, status]\n", + " )\n", + " \n", + " csv_output.change(\n", + " download_csv,\n", + " inputs=[csv_output],\n", + " outputs=[download_btn]\n", + " )\n", + "\n", + "demo.launch(share=True, inbrowser=True)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}