LLM_Engineering_OLD/week3/community-contributions/week3_Exercise_survey_Dataset_Generation.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a8dbb4e8",
   "metadata": {},
   "source": [
    "# 🧪 Survey Synthetic Dataset Generator — Week 3 Task"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "8d86f629",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Base libraries ready. Pandera available: True\n"
     ]
    }
   ],
   "source": [
    "\n",
    "import os, re, json, time, uuid, math, random\n",
    "from datetime import datetime, timedelta\n",
    "from typing import List, Dict, Any\n",
    "import numpy as np, pandas as pd\n",
    "import pandera.pandas as pa\n",
    "random.seed(7); np.random.seed(7)\n",
    "print(\"✅ Base libraries ready. Pandera available:\", pa is not None)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "f196ae73",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def extract_strict_json(text: str):\n",
    "    \"\"\"Improved JSON extraction with multiple fallback strategies\"\"\"\n",
    "    if text is None:\n",
    "        raise ValueError(\"Empty model output.\")\n",
    "    \n",
    "    t = text.strip()\n",
    "    \n",
    "    # Strategy 1: Direct JSON parsing\n",
    "    try:\n",
    "        obj = json.loads(t)\n",
    "        if isinstance(obj, list):\n",
    "            return obj\n",
    "        elif isinstance(obj, dict):\n",
    "            for key in (\"rows\",\"data\",\"items\",\"records\",\"results\"):\n",
    "                if key in obj and isinstance(obj[key], list):\n",
    "                    return obj[key]\n",
    "            if all(isinstance(k, str) and k.isdigit() for k in obj.keys()):\n",
    "                return [obj[k] for k in sorted(obj.keys(), key=int)]\n",
    "    except json.JSONDecodeError:\n",
    "        pass\n",
    "    \n",
    "    # Strategy 2: Extract JSON from code blocks\n",
    "    if t.startswith(\"```\"):\n",
    "        t = re.sub(r\"^```(?:json)?\\s*|\\s*```$\", \"\", t, flags=re.IGNORECASE|re.MULTILINE).strip()\n",
    "    \n",
    "    # Strategy 3: Find JSON array in text\n",
    "    start, end = t.find('['), t.rfind(']')\n",
    "    if start == -1 or end == -1 or end <= start:\n",
    "        raise ValueError(\"No JSON array found in model output.\")\n",
    "    \n",
    "    t = t[start:end+1]\n",
    "    \n",
    "    # Strategy 4: Fix common JSON issues\n",
    "    t = re.sub(r\",\\s*([\\]}])\", r\"\\1\", t)  # Remove trailing commas\n",
    "    t = re.sub(r\"\\bNaN\\b|\\bInfinity\\b|\\b-Infinity\\b\", \"null\", t)  # Replace NaN/Infinity\n",
    "    t = t.replace(\"\\u00a0\", \" \").replace(\"\\u200b\", \"\")  # Remove invisible characters\n",
    "    \n",
    "    try:\n",
    "        return json.loads(t)\n",
    "    except json.JSONDecodeError as e:\n",
    "        raise ValueError(f\"Could not parse JSON: {str(e)}. Text: {t[:200]}...\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3670fa0d",
   "metadata": {},
   "source": [
    "## 1) Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "d16bd03a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded config for 800 rows and 18 fields.\n"
     ]
    }
   ],
   "source": [
    "\n",
    "CFG = {\n",
    "    \"rows\": 800,\n",
    "    \"datetime_range\": {\"start\": \"2024-01-01\", \"end\": \"2025-10-01\", \"fmt\": \"%Y-%m-%d %H:%M:%S\"},\n",
    "    \"fields\": [\n",
    "        {\"name\": \"response_id\", \"type\": \"uuid4\"},\n",
    "        {\"name\": \"respondent_id\", \"type\": \"int\", \"min\": 10000, \"max\": 99999},\n",
    "        {\"name\": \"submitted_at\", \"type\": \"datetime\"},\n",
    "        {\"name\": \"country\", \"type\": \"enum\", \"values\": [\"KE\",\"UG\",\"TZ\",\"RW\",\"NG\",\"ZA\"], \"probs\": [0.50,0.10,0.12,0.05,0.15,0.08]},\n",
    "        {\"name\": \"language\", \"type\": \"enum\", \"values\": [\"en\",\"sw\"], \"probs\": [0.85,0.15]},\n",
    "        {\"name\": \"device\", \"type\": \"enum\", \"values\": [\"android\",\"ios\",\"web\"], \"probs\": [0.60,0.25,0.15]},\n",
    "        {\"name\": \"age\", \"type\": \"int\", \"min\": 18, \"max\": 70},\n",
    "        {\"name\": \"gender\", \"type\": \"enum\", \"values\": [\"female\",\"male\",\"nonbinary\",\"prefer_not_to_say\"], \"probs\": [0.49,0.49,0.01,0.01]},\n",
    "        {\"name\": \"education\", \"type\": \"enum\", \"values\": [\"primary\",\"secondary\",\"diploma\",\"bachelor\",\"postgraduate\"], \"probs\": [0.08,0.32,0.18,0.30,0.12]},\n",
    "        {\"name\": \"income_band\", \"type\": \"enum\", \"values\": [\"low\",\"lower_mid\",\"upper_mid\",\"high\"], \"probs\": [0.28,0.42,0.23,0.07]},\n",
    "        {\"name\": \"completion_seconds\", \"type\": \"float\", \"min\": 60, \"max\": 1800, \"distribution\": \"lognormal\"},\n",
    "        {\"name\": \"attention_passed\", \"type\": \"bool\"},\n",
    "        {\"name\": \"q_quality\", \"type\": \"int\", \"min\": 1, \"max\": 5},\n",
    "        {\"name\": \"q_value\", \"type\": \"int\", \"min\": 1, \"max\": 5},\n",
    "        {\"name\": \"q_ease\", \"type\": \"int\", \"min\": 1, \"max\": 5},\n",
    "        {\"name\": \"q_support\", \"type\": \"int\", \"min\": 1, \"max\": 5},\n",
    "        {\"name\": \"nps\", \"type\": \"int\", \"min\": 0, \"max\": 10},\n",
    "        {\"name\": \"is_detractor\", \"type\": \"bool\"}\n",
    "    ]\n",
    "}\n",
    "print(\"Loaded config for\", CFG[\"rows\"], \"rows and\", len(CFG[\"fields\"]), \"fields.\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7da1f429",
   "metadata": {},
   "source": [
    "## 2) Helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "d2f5fdff",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def sample_enum(values, probs=None, size=None):\n",
    "    values = list(values)\n",
    "    if probs is None:\n",
    "        probs = [1.0 / len(values)] * len(values)\n",
    "    return np.random.choice(values, p=probs, size=size)\n",
    "\n",
    "def sample_numeric(field_cfg, size=1):\n",
    "    t = field_cfg[\"type\"]\n",
    "    if t == \"int\":\n",
    "        lo, hi = int(field_cfg[\"min\"]), int(field_cfg[\"max\"])\n",
    "        dist = field_cfg.get(\"distribution\", \"uniform\")\n",
    "        if dist == \"uniform\":\n",
    "            return np.random.randint(lo, hi + 1, size=size)\n",
    "        elif dist == \"normal\":\n",
    "            mu = (lo + hi) / 2.0\n",
    "            sigma = (hi - lo) / 6.0\n",
    "            out = np.random.normal(mu, sigma, size=size)\n",
    "            return np.clip(out, lo, hi).astype(int)\n",
    "        else:\n",
    "            return np.random.randint(lo, hi + 1, size=size)\n",
    "    elif t == \"float\":\n",
    "        lo, hi = float(field_cfg[\"min\"]), float(field_cfg[\"max\"])\n",
    "        dist = field_cfg.get(\"distribution\", \"uniform\")\n",
    "        if dist == \"uniform\":\n",
    "            return np.random.uniform(lo, hi, size=size)\n",
    "        elif dist == \"normal\":\n",
    "            mu = (lo + hi) / 2.0\n",
    "            sigma = (hi - lo) / 6.0\n",
    "            return np.clip(np.random.normal(mu, sigma, size=size), lo, hi)\n",
    "        elif dist == \"lognormal\":\n",
    "            mu = math.log(max(1e-3, (lo + hi) / 2.0))\n",
    "            sigma = 0.75\n",
    "            out = np.random.lognormal(mu, sigma, size=size)\n",
    "            return np.clip(out, lo, hi)\n",
    "        else:\n",
    "            return np.random.uniform(lo, hi, size=size)\n",
    "    else:\n",
    "        raise ValueError(\"Unsupported numeric type\")\n",
    "\n",
    "def sample_datetime(start: str, end: str, size=1, fmt=\"%Y-%m-%d %H:%M:%S\"):\n",
    "    s = datetime.fromisoformat(start)\n",
    "    e = datetime.fromisoformat(end)\n",
    "    total = int((e - s).total_seconds())\n",
    "    r = np.random.randint(0, total, size=size)\n",
    "    return [(s + timedelta(seconds=int(x))).strftime(fmt) for x in r]\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5f24111a",
   "metadata": {},
   "source": [
    "## 3) Rule-based Generator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "cd61330d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>response_id</th>\n",
       "      <th>respondent_id</th>\n",
       "      <th>submitted_at</th>\n",
       "      <th>country</th>\n",
       "      <th>language</th>\n",
       "      <th>device</th>\n",
       "      <th>age</th>\n",
       "      <th>gender</th>\n",
       "      <th>education</th>\n",
       "      <th>income_band</th>\n",
       "      <th>completion_seconds</th>\n",
       "      <th>attention_passed</th>\n",
       "      <th>q_quality</th>\n",
       "      <th>q_value</th>\n",
       "      <th>q_ease</th>\n",
       "      <th>q_support</th>\n",
       "      <th>nps</th>\n",
       "      <th>is_detractor</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>f099c1b6-a4ae-4fb0-ba98-89a81008c424</td>\n",
       "      <td>71615</td>\n",
       "      <td>2024-04-13 19:02:44</td>\n",
       "      <td>ZA</td>\n",
       "      <td>en</td>\n",
       "      <td>web</td>\n",
       "      <td>47</td>\n",
       "      <td>male</td>\n",
       "      <td>secondary</td>\n",
       "      <td>low</td>\n",
       "      <td>897.995012</td>\n",
       "      <td>True</td>\n",
       "      <td>5</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>f2e20ad1-1ed1-4e33-8beb-5dd0ba23715b</td>\n",
       "      <td>68564</td>\n",
       "      <td>2024-03-05 23:30:30</td>\n",
       "      <td>KE</td>\n",
       "      <td>en</td>\n",
       "      <td>android</td>\n",
       "      <td>67</td>\n",
       "      <td>female</td>\n",
       "      <td>bachelor</td>\n",
       "      <td>lower_mid</td>\n",
       "      <td>935.607966</td>\n",
       "      <td>True</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>a9345f69-be75-46b9-8cd3-a276ce0a66bd</td>\n",
       "      <td>59689</td>\n",
       "      <td>2024-11-10 03:38:07</td>\n",
       "      <td>RW</td>\n",
       "      <td>sw</td>\n",
       "      <td>android</td>\n",
       "      <td>23</td>\n",
       "      <td>male</td>\n",
       "      <td>bachelor</td>\n",
       "      <td>low</td>\n",
       "      <td>1431.517701</td>\n",
       "      <td>True</td>\n",
       "      <td>5</td>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "      <td>7</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>b4fa8625-d153-4465-ad73-1c4a48eed2f1</td>\n",
       "      <td>20742</td>\n",
       "      <td>2024-11-19 17:40:58</td>\n",
       "      <td>KE</td>\n",
       "      <td>en</td>\n",
       "      <td>ios</td>\n",
       "      <td>68</td>\n",
       "      <td>female</td>\n",
       "      <td>secondary</td>\n",
       "      <td>upper_mid</td>\n",
       "      <td>448.519416</td>\n",
       "      <td>True</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "      <td>3</td>\n",
       "      <td>10</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>e0ad4bbc-b576-4913-8786-302f06b5e9f7</td>\n",
       "      <td>63459</td>\n",
       "      <td>2024-07-28 04:23:37</td>\n",
       "      <td>KE</td>\n",
       "      <td>en</td>\n",
       "      <td>ios</td>\n",
       "      <td>34</td>\n",
       "      <td>male</td>\n",
       "      <td>secondary</td>\n",
       "      <td>low</td>\n",
       "      <td>1179.970734</td>\n",
       "      <td>True</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                            response_id  respondent_id         submitted_at  \\\n",
       "0  f099c1b6-a4ae-4fb0-ba98-89a81008c424          71615  2024-04-13 19:02:44   \n",
       "1  f2e20ad1-1ed1-4e33-8beb-5dd0ba23715b          68564  2024-03-05 23:30:30   \n",
       "2  a9345f69-be75-46b9-8cd3-a276ce0a66bd          59689  2024-11-10 03:38:07   \n",
       "3  b4fa8625-d153-4465-ad73-1c4a48eed2f1          20742  2024-11-19 17:40:58   \n",
       "4  e0ad4bbc-b576-4913-8786-302f06b5e9f7          63459  2024-07-28 04:23:37   \n",
       "\n",
       "  country language   device  age  gender  education income_band  \\\n",
       "0      ZA       en      web   47    male  secondary         low   \n",
       "1      KE       en  android   67  female   bachelor   lower_mid   \n",
       "2      RW       sw  android   23    male   bachelor         low   \n",
       "3      KE       en      ios   68  female  secondary   upper_mid   \n",
       "4      KE       en      ios   34    male  secondary         low   \n",
       "\n",
       "   completion_seconds  attention_passed  q_quality  q_value  q_ease  \\\n",
       "0          897.995012              True          5        3       1   \n",
       "1          935.607966              True          1        5       2   \n",
       "2         1431.517701              True          5        2       5   \n",
       "3          448.519416              True          5        5       5   \n",
       "4         1179.970734              True          3        1       3   \n",
       "\n",
       "   q_support  nps  is_detractor  \n",
       "0          3    4          True  \n",
       "1          3    5         False  \n",
       "2          5    7         False  \n",
       "3          3   10         False  \n",
       "4          3    5         False  "
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "def generate_rule_based(CFG: Dict[str, Any]) -> pd.DataFrame:\n",
    "    n = CFG[\"rows\"]\n",
    "    dt_cfg = CFG.get(\"datetime_range\", {\"start\":\"2024-01-01\",\"end\":\"2025-10-01\",\"fmt\":\"%Y-%m-%d %H:%M:%S\"})\n",
    "    data = {}\n",
    "    for f in CFG[\"fields\"]:\n",
    "        name, t = f[\"name\"], f[\"type\"]\n",
    "        if t == \"uuid4\":\n",
    "            data[name] = [str(uuid.uuid4()) for _ in range(n)]\n",
    "        elif t in (\"int\",\"float\"):\n",
    "            data[name] = sample_numeric(f, size=n)\n",
    "        elif t == \"enum\":\n",
    "            data[name] = sample_enum(f[\"values\"], f.get(\"probs\"), size=n)\n",
    "        elif t == \"datetime\":\n",
    "            data[name] = sample_datetime(dt_cfg[\"start\"], dt_cfg[\"end\"], size=n, fmt=dt_cfg[\"fmt\"])\n",
    "        elif t == \"bool\":\n",
    "            data[name] = np.random.rand(n) < 0.9  # 90% True\n",
    "        else:\n",
    "            data[name] = [None]*n\n",
    "    df = pd.DataFrame(data)\n",
    "\n",
    "    # Derive NPS roughly from likert questions\n",
    "    if set([\"q_quality\",\"q_value\",\"q_ease\",\"q_support\"]).issubset(df.columns):\n",
    "        likert_avg = df[[\"q_quality\",\"q_value\",\"q_ease\",\"q_support\"]].mean(axis=1)\n",
    "        df[\"nps\"] = np.clip(np.round((likert_avg - 1.0) * (10.0/4.0) + np.random.normal(0, 1.2, size=n)), 0, 10).astype(int)\n",
    "\n",
    "    # Heuristic target: is_detractor more likely when completion high & attention failed\n",
    "    if \"is_detractor\" in df.columns:\n",
    "        base = 0.25\n",
    "        comp = df.get(\"completion_seconds\", pd.Series(np.zeros(n)))\n",
    "        attn = pd.Series(df.get(\"attention_passed\", np.ones(n))).astype(bool)\n",
    "        boost = (comp > 900).astype(int) + (~attn).astype(int)\n",
    "        p = np.clip(base + 0.15*boost, 0.01, 0.95)\n",
    "        df[\"is_detractor\"] = np.random.rand(n) < p\n",
    "\n",
    "    return df\n",
    "\n",
    "df_rule = generate_rule_based(CFG)\n",
    "df_rule.head()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dd9eff20",
   "metadata": {},
   "source": [
    "## 4) Validation (Pandera optional)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "9a4ef86a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Validation error: {\n",
      "    \"SCHEMA\": {\n",
      "        \"WRONG_DATATYPE\": [\n",
      "            {\n",
      "                \"schema\": null,\n",
      "                \"column\": \"respondent_id\",\n",
      "                \"check\": \"dtype('int64')\",\n",
      "                \"error\": \"expected series 'respondent_id' to have type int64, got int32\"\n",
      "            },\n",
      "            {\n",
      "                \"schema\": null,\n",
      "                \"column\": \"age\",\n",
      "                \"check\": \"dtype('int64')\",\n",
      "                \"error\": \"expected series 'age' to have type int64, got int32\"\n",
      "            },\n",
      "            {\n",
      "                \"schema\": null,\n",
      "                \"column\": \"q_quality\",\n",
      "                \"check\": \"dtype('int64')\",\n",
      "                \"error\": \"expected series 'q_quality' to have type int64, got int32\"\n",
      "            },\n",
      "            {\n",
      "                \"schema\": null,\n",
      "                \"column\": \"q_value\",\n",
      "                \"check\": \"dtype('int64')\",\n",
      "                \"error\": \"expected series 'q_value' to have type int64, got int32\"\n",
      "            },\n",
      "            {\n",
      "                \"schema\": null,\n",
      "                \"column\": \"q_ease\",\n",
      "                \"check\": \"dtype('int64')\",\n",
      "                \"error\": \"expected series 'q_ease' to have type int64, got int32\"\n",
      "            },\n",
      "            {\n",
      "                \"schema\": null,\n",
      "                \"column\": \"q_support\",\n",
      "                \"check\": \"dtype('int64')\",\n",
      "                \"error\": \"expected series 'q_support' to have type int64, got int32\"\n",
      "            }\n",
      "        ]\n",
      "    }\n",
      "}\n",
      "{'engine': 'pandera', 'valid_rows': 800, 'invalid_rows': 0, 'notes': 'Non-strict mode.'}\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>response_id</th>\n",
       "      <th>respondent_id</th>\n",
       "      <th>submitted_at</th>\n",
       "      <th>country</th>\n",
       "      <th>language</th>\n",
       "      <th>device</th>\n",
       "      <th>age</th>\n",
       "      <th>gender</th>\n",
       "      <th>education</th>\n",
       "      <th>income_band</th>\n",
       "      <th>completion_seconds</th>\n",
       "      <th>attention_passed</th>\n",
       "      <th>q_quality</th>\n",
       "      <th>q_value</th>\n",
       "      <th>q_ease</th>\n",
       "      <th>q_support</th>\n",
       "      <th>nps</th>\n",
       "      <th>is_detractor</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>f099c1b6-a4ae-4fb0-ba98-89a81008c424</td>\n",
       "      <td>71615</td>\n",
       "      <td>2024-04-13 19:02:44</td>\n",
       "      <td>ZA</td>\n",
       "      <td>en</td>\n",
       "      <td>web</td>\n",
       "      <td>47</td>\n",
       "      <td>male</td>\n",
       "      <td>secondary</td>\n",
       "      <td>low</td>\n",
       "      <td>897.995012</td>\n",
       "      <td>True</td>\n",
       "      <td>5</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>f2e20ad1-1ed1-4e33-8beb-5dd0ba23715b</td>\n",
       "      <td>68564</td>\n",
       "      <td>2024-03-05 23:30:30</td>\n",
       "      <td>KE</td>\n",
       "      <td>en</td>\n",
       "      <td>android</td>\n",
       "      <td>67</td>\n",
       "      <td>female</td>\n",
       "      <td>bachelor</td>\n",
       "      <td>lower_mid</td>\n",
       "      <td>935.607966</td>\n",
       "      <td>True</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>a9345f69-be75-46b9-8cd3-a276ce0a66bd</td>\n",
       "      <td>59689</td>\n",
       "      <td>2024-11-10 03:38:07</td>\n",
       "      <td>RW</td>\n",
       "      <td>sw</td>\n",
       "      <td>android</td>\n",
       "      <td>23</td>\n",
       "      <td>male</td>\n",
       "      <td>bachelor</td>\n",
       "      <td>low</td>\n",
       "      <td>1431.517701</td>\n",
       "      <td>True</td>\n",
       "      <td>5</td>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "      <td>7</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>b4fa8625-d153-4465-ad73-1c4a48eed2f1</td>\n",
       "      <td>20742</td>\n",
       "      <td>2024-11-19 17:40:58</td>\n",
       "      <td>KE</td>\n",
       "      <td>en</td>\n",
       "      <td>ios</td>\n",
       "      <td>68</td>\n",
       "      <td>female</td>\n",
       "      <td>secondary</td>\n",
       "      <td>upper_mid</td>\n",
       "      <td>448.519416</td>\n",
       "      <td>True</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "      <td>3</td>\n",
       "      <td>10</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>e0ad4bbc-b576-4913-8786-302f06b5e9f7</td>\n",
       "      <td>63459</td>\n",
       "      <td>2024-07-28 04:23:37</td>\n",
       "      <td>KE</td>\n",
       "      <td>en</td>\n",
       "      <td>ios</td>\n",
       "      <td>34</td>\n",
       "      <td>male</td>\n",
       "      <td>secondary</td>\n",
       "      <td>low</td>\n",
       "      <td>1179.970734</td>\n",
       "      <td>True</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                            response_id  respondent_id         submitted_at  \\\n",
       "0  f099c1b6-a4ae-4fb0-ba98-89a81008c424          71615  2024-04-13 19:02:44   \n",
       "1  f2e20ad1-1ed1-4e33-8beb-5dd0ba23715b          68564  2024-03-05 23:30:30   \n",
       "2  a9345f69-be75-46b9-8cd3-a276ce0a66bd          59689  2024-11-10 03:38:07   \n",
       "3  b4fa8625-d153-4465-ad73-1c4a48eed2f1          20742  2024-11-19 17:40:58   \n",
       "4  e0ad4bbc-b576-4913-8786-302f06b5e9f7          63459  2024-07-28 04:23:37   \n",
       "\n",
       "  country language   device  age  gender  education income_band  \\\n",
       "0      ZA       en      web   47    male  secondary         low   \n",
       "1      KE       en  android   67  female   bachelor   lower_mid   \n",
       "2      RW       sw  android   23    male   bachelor         low   \n",
       "3      KE       en      ios   68  female  secondary   upper_mid   \n",
       "4      KE       en      ios   34    male  secondary         low   \n",
       "\n",
       "   completion_seconds  attention_passed  q_quality  q_value  q_ease  \\\n",
       "0          897.995012              True          5        3       1   \n",
       "1          935.607966              True          1        5       2   \n",
       "2         1431.517701              True          5        2       5   \n",
       "3          448.519416              True          5        5       5   \n",
       "4         1179.970734              True          3        1       3   \n",
       "\n",
       "   q_support  nps  is_detractor  \n",
       "0          3    4          True  \n",
       "1          3    5         False  \n",
       "2          5    7         False  \n",
       "3          3   10         False  \n",
       "4          3    5         False  "
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "def build_pandera_schema(CFG):\n",
    "    if pa is None:\n",
    "        return None\n",
    "    cols = {}\n",
    "    for f in CFG[\"fields\"]:\n",
    "        t, name = f[\"type\"], f[\"name\"]\n",
    "        if t == \"int\": cols[name] = pa.Column(int)\n",
    "        elif t == \"float\": cols[name] = pa.Column(float)\n",
    "        elif t == \"enum\": cols[name] = pa.Column(object)\n",
    "        elif t == \"datetime\": cols[name] = pa.Column(object)\n",
    "        elif t == \"uuid4\": cols[name] = pa.Column(object)\n",
    "        elif t == \"bool\": cols[name] = pa.Column(bool)\n",
    "        else: cols[name] = pa.Column(object)\n",
    "    return pa.DataFrameSchema(cols) if pa is not None else None\n",
    "\n",
    "def validate_df(df, CFG):\n",
    "    schema = build_pandera_schema(CFG)\n",
    "    if schema is None:\n",
    "        return df, {\"engine\":\"basic\",\"valid_rows\": len(df), \"invalid_rows\": 0}\n",
    "    try:\n",
    "        v = schema.validate(df, lazy=True)\n",
    "        return v, {\"engine\":\"pandera\",\"valid_rows\": len(v), \"invalid_rows\": 0}\n",
    "    except Exception as e:\n",
    "        print(\"Validation error:\", e)\n",
    "        return df, {\"engine\":\"pandera\",\"valid_rows\": len(df), \"invalid_rows\": 0, \"notes\": \"Non-strict mode.\"}\n",
    "\n",
    "validated_rule, report_rule = validate_df(df_rule, CFG)\n",
    "print(report_rule)\n",
    "validated_rule.head()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d5f1d93a",
   "metadata": {},
   "source": [
    "## 5) Save"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "73626b4c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved: data/survey_rule_20251023T004106Z.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Joshua\\AppData\\Local\\Temp\\ipykernel_27572\\1233117399.py:3: DeprecationWarning: datetime.datetime.utcnow() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.now(datetime.UTC).\n",
      "  ts = datetime.utcnow().strftime(\"%Y%m%dT%H%M%SZ\")\n"
     ]
    }
   ],
   "source": [
    "\n",
    "from pathlib import Path\n",
    "out = Path(\"data\"); out.mkdir(exist_ok=True)\n",
    "ts = datetime.utcnow().strftime(\"%Y%m%dT%H%M%SZ\")\n",
    "csv_path = out / f\"survey_rule_{ts}.csv\"\n",
    "validated_rule.to_csv(csv_path, index=False)\n",
    "print(\"Saved:\", csv_path.as_posix())\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "87c89b51",
   "metadata": {},
   "source": [
    "## 6) Optional: LLM Generator (JSON mode, retry & strict parsing)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "24e94771",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fixed LLM Generation Functions\n",
    "def create_survey_prompt(CFG, n_rows=50):\n",
    "    \"\"\"Create a clear, structured prompt for survey data generation\"\"\"\n",
    "    fields_desc = []\n",
    "    for field in CFG['fields']:\n",
    "        name = field['name']\n",
    "        field_type = field['type']\n",
    "        \n",
    "        if field_type == 'int':\n",
    "            min_val = field.get('min', 0)\n",
    "            max_val = field.get('max', 100)\n",
    "            fields_desc.append(f\"  - {name}: integer between {min_val} and {max_val}\")\n",
    "        elif field_type == 'float':\n",
    "            min_val = field.get('min', 0.0)\n",
    "            max_val = field.get('max', 100.0)\n",
    "            fields_desc.append(f\"  - {name}: float between {min_val} and {max_val}\")\n",
    "        elif field_type == 'enum':\n",
    "            values = field.get('values', [])\n",
    "            fields_desc.append(f\"  - {name}: one of {values}\")\n",
    "        elif field_type == 'bool':\n",
    "            fields_desc.append(f\"  - {name}: boolean (true/false)\")\n",
    "        elif field_type == 'uuid4':\n",
    "            fields_desc.append(f\"  - {name}: UUID string\")\n",
    "        elif field_type == 'datetime':\n",
    "            fmt = field.get('fmt', '%Y-%m-%d %H:%M:%S')\n",
    "            fields_desc.append(f\"  - {name}: datetime string in format {fmt}\")\n",
    "        else:\n",
    "            fields_desc.append(f\"  - {name}: {field_type}\")\n",
    "    \n",
    "    prompt = f\"\"\"Generate {n_rows} rows of realistic survey response data.\n",
    "\n",
    "Schema:\n",
    "{chr(10).join(fields_desc)}\n",
    "\n",
    "CRITICAL REQUIREMENTS:\n",
    "- Return a JSON object with a \"responses\" key containing an array\n",
    "- Each object in the array must have all required fields\n",
    "- Use realistic, diverse values for survey responses\n",
    "- No trailing commas\n",
    "- No comments or explanations\n",
    "\n",
    "Output format: JSON object with \"responses\" array containing exactly {n_rows} objects.\n",
    "\n",
    "Example structure:\n",
    "{{\n",
    "  \"responses\": [\n",
    "    {{\n",
    "      \"response_id\": \"uuid-string\",\n",
    "      \"respondent_id\": 12345,\n",
    "      \"submitted_at\": \"2024-01-01 12:00:00\",\n",
    "      \"country\": \"KE\",\n",
    "      \"language\": \"en\",\n",
    "      \"device\": \"android\",\n",
    "      \"age\": 25,\n",
    "      \"gender\": \"female\",\n",
    "      \"education\": \"bachelor\",\n",
    "      \"income_band\": \"upper_mid\",\n",
    "      \"completion_seconds\": 300.5,\n",
    "      \"attention_passed\": true,\n",
    "      \"q_quality\": 4,\n",
    "      \"q_value\": 3,\n",
    "      \"q_ease\": 5,\n",
    "      \"q_support\": 4,\n",
    "      \"nps\": 8,\n",
    "      \"is_detractor\": false\n",
    "    }},\n",
    "    ...\n",
    "  ]\n",
    "}}\n",
    "\n",
    "IMPORTANT: Return ONLY the JSON object with \"responses\" key, nothing else.\"\"\"\n",
    "    \n",
    "    return prompt\n",
    "\n",
    "def repair_truncated_json(content):\n",
    "    \"\"\"Attempt to repair truncated JSON responses\"\"\"\n",
    "    content = content.strip()\n",
    "    \n",
    "    # If it starts with { but doesn't end with }, try to close it\n",
    "    if content.startswith('{') and not content.endswith('}'):\n",
    "        # Find the last complete object in the responses array\n",
    "        responses_start = content.find('\"responses\": [')\n",
    "        if responses_start != -1:\n",
    "            # Find the last complete object\n",
    "            brace_count = 0\n",
    "            last_complete_pos = -1\n",
    "            in_string = False\n",
    "            escape_next = False\n",
    "            \n",
    "            for i, char in enumerate(content[responses_start:], responses_start):\n",
    "                if escape_next:\n",
    "                    escape_next = False\n",
    "                    continue\n",
    "                    \n",
    "                if char == '\\\\':\n",
    "                    escape_next = True\n",
    "                    continue\n",
    "                    \n",
    "                if char == '\"' and not escape_next:\n",
    "                    in_string = not in_string\n",
    "                    continue\n",
    "                    \n",
    "                if not in_string:\n",
    "                    if char == '{':\n",
    "                        brace_count += 1\n",
    "                    elif char == '}':\n",
    "                        brace_count -= 1\n",
    "                        if brace_count == 0:\n",
    "                            last_complete_pos = i\n",
    "                            break\n",
    "            \n",
    "            if last_complete_pos != -1:\n",
    "                # Truncate at the last complete object and close the JSON\n",
    "                repaired = content[:last_complete_pos + 1] + '\\n  ]\\n}'\n",
    "                print(f\"🔧 Repaired JSON: truncated at position {last_complete_pos}\")\n",
    "                return repaired\n",
    "    \n",
    "    return content\n",
    "\n",
    "def fixed_llm_generate_batch(CFG, n_rows=50):\n",
    "    \"\"\"Fixed LLM generation with better prompt and error handling\"\"\"\n",
    "    if not os.getenv('OPENAI_API_KEY'):\n",
    "        print(\"No OpenAI API key, using rule-based fallback\")\n",
    "        tmp = dict(CFG); tmp['rows'] = n_rows\n",
    "        return generate_rule_based(tmp)\n",
    "    \n",
    "    try:\n",
    "        from openai import OpenAI\n",
    "        client = OpenAI()\n",
    "        \n",
    "        prompt = create_survey_prompt(CFG, n_rows)\n",
    "        \n",
    "        print(f\"🔄 Generating {n_rows} survey responses with LLM...\")\n",
    "        \n",
    "        # Calculate appropriate max_tokens based on batch size\n",
    "        # Roughly 200-300 tokens per row, with some buffer\n",
    "        estimated_tokens = n_rows * 300 + 500  # Buffer for JSON structure\n",
    "        max_tokens = min(max(estimated_tokens, 2000), 8000)  # Between 2k-8k tokens\n",
    "        \n",
    "        print(f\"📊 Using max_tokens: {max_tokens} (estimated: {estimated_tokens})\")\n",
    "        \n",
    "        response = client.chat.completions.create(\n",
    "            model='gpt-4o-mini',\n",
    "            messages=[\n",
    "                {'role': 'system', 'content': 'You are a data generation expert. Generate realistic survey data in JSON format. Always return complete, valid JSON.'},\n",
    "                {'role': 'user', 'content': prompt}\n",
    "            ],\n",
    "            temperature=0.3,\n",
    "            max_tokens=max_tokens,\n",
    "            response_format={'type': 'json_object'}\n",
    "        )\n",
    "        \n",
    "        content = response.choices[0].message.content\n",
    "        print(f\"📝 Raw response length: {len(content)} characters\")\n",
    "        \n",
    "        # Check if response appears truncated\n",
    "        if not content.strip().endswith('}') and not content.strip().endswith(']'):\n",
    "            print(\"⚠️ Response appears truncated, attempting repair...\")\n",
    "            content = repair_truncated_json(content)\n",
    "        \n",
    "        # Try to extract JSON with improved logic\n",
    "        try:\n",
    "            data = json.loads(content)\n",
    "            print(f\"🔍 Parsed JSON type: {type(data)}\")\n",
    "            \n",
    "            if isinstance(data, list):\n",
    "                df = pd.DataFrame(data)\n",
    "                print(f\"📊 Direct array: {len(df)} rows\")\n",
    "            elif isinstance(data, dict):\n",
    "                # Check for common keys that might contain the data\n",
    "                for key in ['responses', 'rows', 'data', 'items', 'records', 'results', 'survey_responses']:\n",
    "                    if key in data and isinstance(data[key], list):\n",
    "                        df = pd.DataFrame(data[key])\n",
    "                        print(f\"📊 Found data in '{key}': {len(df)} rows\")\n",
    "                        break\n",
    "                else:\n",
    "                    # If no standard key found, check if all values are lists/objects\n",
    "                    list_keys = [k for k, v in data.items() if isinstance(v, list) and len(v) > 0]\n",
    "                    if list_keys:\n",
    "                        # Use the first list key found\n",
    "                        key = list_keys[0]\n",
    "                        df = pd.DataFrame(data[key])\n",
    "                        print(f\"📊 Found data in '{key}': {len(df)} rows\")\n",
    "                    else:\n",
    "                        # Try to convert the dict values to a list\n",
    "                        if all(isinstance(v, dict) for v in data.values()):\n",
    "                            df = pd.DataFrame(list(data.values()))\n",
    "                            print(f\"📊 Converted dict values: {len(df)} rows\")\n",
    "                        else:\n",
    "                            raise ValueError(f\"Unexpected JSON structure: {list(data.keys())}\")\n",
    "            else:\n",
    "                raise ValueError(f\"Unexpected JSON type: {type(data)}\")\n",
    "            \n",
    "            if len(df) == n_rows:\n",
    "                print(f\"✅ Successfully generated {len(df)} survey responses\")\n",
    "                return df\n",
    "            else:\n",
    "                print(f\"⚠️ Generated {len(df)} rows, expected {n_rows}\")\n",
    "                if len(df) > 0:\n",
    "                    return df\n",
    "                else:\n",
    "                    raise ValueError(\"No data generated\")\n",
    "                    \n",
    "        except json.JSONDecodeError as e:\n",
    "            print(f\"❌ JSON parsing failed: {str(e)}\")\n",
    "            # Try the improved extract_strict_json function\n",
    "            try:\n",
    "                data = extract_strict_json(content)\n",
    "                df = pd.DataFrame(data)\n",
    "                print(f\"✅ Recovered with strict parsing: {len(df)} rows\")\n",
    "                return df\n",
    "            except Exception as e2:\n",
    "                print(f\"❌ Strict parsing also failed: {str(e2)}\")\n",
    "                # Print a sample of the content for debugging\n",
    "                print(f\"🔍 Content sample: {content[:500]}...\")\n",
    "                raise e2\n",
    "                \n",
    "    except Exception as e:\n",
    "        print(f'❌ LLM error, fallback to rule-based mock: {str(e)}')\n",
    "        tmp = dict(CFG); tmp['rows'] = n_rows\n",
    "        return generate_rule_based(tmp)\n",
    "\n",
    "def fixed_generate_llm(CFG, total_rows=200, batch_size=50):\n",
    "    \"\"\"Fixed LLM generation with adaptive batch processing\"\"\"\n",
    "    print(f\"🚀 Generating {total_rows} survey responses with adaptive batching\")\n",
    "    \n",
    "    # Adaptive batch sizing based on total rows\n",
    "    if total_rows <= 20:\n",
    "        optimal_batch_size = min(batch_size, total_rows)\n",
    "    elif total_rows <= 50:\n",
    "        optimal_batch_size = min(15, batch_size)\n",
    "    elif total_rows <= 100:\n",
    "        optimal_batch_size = min(10, batch_size)\n",
    "    else:\n",
    "        optimal_batch_size = min(8, batch_size)\n",
    "    \n",
    "    print(f\"📊 Using optimal batch size: {optimal_batch_size}\")\n",
    "    \n",
    "    all_dataframes = []\n",
    "    remaining = total_rows\n",
    "    \n",
    "    while remaining > 0:\n",
    "        current_batch_size = min(optimal_batch_size, remaining)\n",
    "        print(f\"\\n📦 Processing batch: {current_batch_size} rows (remaining: {remaining})\")\n",
    "        \n",
    "        try:\n",
    "            batch_df = fixed_llm_generate_batch(CFG, current_batch_size)\n",
    "            all_dataframes.append(batch_df)\n",
    "            remaining -= len(batch_df)\n",
    "            \n",
    "            # Small delay between batches to avoid rate limits\n",
    "            if remaining > 0:\n",
    "                time.sleep(1.5)\n",
    "                \n",
    "        except Exception as e:\n",
    "            print(f\"❌ Batch failed: {str(e)}\")\n",
    "            print(f\"🔄 Retrying with smaller batch size...\")\n",
    "            \n",
    "            # Try with smaller batch size\n",
    "            smaller_batch = max(1, current_batch_size // 2)\n",
    "            if smaller_batch < current_batch_size:\n",
    "                try:\n",
    "                    print(f\"🔄 Retrying with {smaller_batch} rows...\")\n",
    "                    batch_df = fixed_llm_generate_batch(CFG, smaller_batch)\n",
    "                    all_dataframes.append(batch_df)\n",
    "                    remaining -= len(batch_df)\n",
    "                    continue\n",
    "                except Exception as e2:\n",
    "                    print(f\"❌ Retry also failed: {str(e2)}\")\n",
    "            \n",
    "            print(f\"Using rule-based fallback for remaining {remaining} rows\")\n",
    "            fallback_df = generate_rule_based(CFG, remaining)\n",
    "            all_dataframes.append(fallback_df)\n",
    "            break\n",
    "    \n",
    "    if all_dataframes:\n",
    "        result = pd.concat(all_dataframes, ignore_index=True)\n",
    "        print(f\"✅ Generated total: {len(result)} survey responses\")\n",
    "        return result\n",
    "    else:\n",
    "        print(\"❌ No data generated\")\n",
    "        return pd.DataFrame()\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1af410e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "🧪 Testing LLM generation...\n",
      "🔄 Generating 10 survey responses with LLM...\n",
      "📊 Using max_tokens: 3500 (estimated: 3500)\n",
      "📝 Raw response length: 5236 characters\n",
      "🔍 Parsed JSON type: <class 'dict'>\n",
      "📊 Found data in 'responses': 10 rows\n",
      "✅ Successfully generated 10 survey responses\n",
      "\n",
      "📊 Generated dataset shape: (10, 18)\n",
      "\n",
      "📋 First few rows:\n",
      "                            response_id  respondent_id         submitted_at  \\\n",
      "0  f3e9b9d1-4e9e-4f8a-9b5c-7e3cbb1c4e5e          10234  2023-10-01 14:23:45   \n",
      "1  a1c5f6d3-1f5b-4e8a-8c7a-5e2c3f4b8e1b          20456  2023-10-01 15:10:12   \n",
      "2  c2b3e4f5-5d6e-4b8a-9f3c-8e1a2f9b4e3c          30567  2023-10-01 16:45:30   \n",
      "3  d4e5f6b7-6e8f-4b9a-8c7d-9e2f3c4b5e6f          40678  2023-10-01 17:30:00   \n",
      "4  e5f6a7b8-7f9a-4c0a-9e2f-1e3c4b5e6f7a          50789  2023-10-01 18:15:15   \n",
      "\n",
      "  country language   device  age     gender     education income_band  \\\n",
      "0      KE       en  android   29     female      bachelor   upper_mid   \n",
      "1      UG       sw      web   34       male     secondary   lower_mid   \n",
      "2      TZ       en      ios   42  nonbinary       diploma        high   \n",
      "3      RW       sw  android   27     female      bachelor   upper_mid   \n",
      "4      NG       en      web   36       male  postgraduate        high   \n",
      "\n",
      "   completion_seconds  attention_passed  q_quality  q_value  q_ease  \\\n",
      "0               450.0              True          4        5       4   \n",
      "1               600.5              True          3        4       3   \n",
      "2               720.0              True          5        5       5   \n",
      "3               390.0              True          4        4       4   \n",
      "4               800.0              True          5        5       5   \n",
      "\n",
      "   q_support  nps  is_detractor  \n",
      "0          5    9         False  \n",
      "1          4    7         False  \n",
      "2          5   10         False  \n",
      "3          4    8         False  \n",
      "4          5    9         False  \n",
      "\n",
      "📈 Data types:\n",
      "response_id            object\n",
      "respondent_id           int64\n",
      "submitted_at           object\n",
      "country                object\n",
      "language               object\n",
      "device                 object\n",
      "age                     int64\n",
      "gender                 object\n",
      "education              object\n",
      "income_band            object\n",
      "completion_seconds    float64\n",
      "attention_passed         bool\n",
      "q_quality               int64\n",
      "q_value                 int64\n",
      "q_ease                  int64\n",
      "q_support               int64\n",
      "nps                     int64\n",
      "is_detractor             bool\n",
      "dtype: object\n"
     ]
    }
   ],
   "source": [
    "# Test the fixed LLM generation\n",
    "print(\"🧪 Testing LLM generation...\")\n",
    "\n",
    "# Test with small dataset first\n",
    "test_df = fixed_llm_generate_batch(CFG, 10)\n",
    "print(f\"\\n📊 Generated dataset shape: {test_df.shape}\")\n",
    "print(f\"\\n📋 First few rows:\")\n",
    "print(test_df.head())\n",
    "print(f\"\\n📈 Data types:\")\n",
    "print(test_df.dtypes)\n",
    "\n",
    "# Debug function to see what the LLM is actually returning\n",
    "def debug_llm_response(CFG, n_rows=5):\n",
    "    \"\"\"Debug function to see raw LLM response\"\"\"\n",
    "    if not os.getenv('OPENAI_API_KEY'):\n",
    "        print(\"No OpenAI API key available for debugging\")\n",
    "        return\n",
    "    \n",
    "    try:\n",
    "        from openai import OpenAI\n",
    "        client = OpenAI()\n",
    "        \n",
    "        prompt = create_survey_prompt(CFG, n_rows)\n",
    "        \n",
    "        print(f\"\\n🔍 DEBUG: Testing with {n_rows} rows\")\n",
    "        print(f\"📝 Prompt length: {len(prompt)} characters\")\n",
    "        \n",
    "        response = client.chat.completions.create(\n",
    "            model='gpt-4o-mini',\n",
    "            messages=[\n",
    "                {'role': 'system', 'content': 'You are a data generation expert. Generate realistic survey data in JSON format.'},\n",
    "                {'role': 'user', 'content': prompt}\n",
    "            ],\n",
    "            temperature=0.3,\n",
    "            max_tokens=2000,\n",
    "            response_format={'type': 'json_object'}\n",
    "        )\n",
    "        \n",
    "        content = response.choices[0].message.content\n",
    "        print(f\"📝 Raw response length: {len(content)} characters\")\n",
    "        print(f\"🔍 First 200 characters: {content[:200]}\")\n",
    "        print(f\"🔍 Last 200 characters: {content[-200:]}\")\n",
    "        \n",
    "        # Try to parse\n",
    "        try:\n",
    "            data = json.loads(content)\n",
    "            print(f\"✅ JSON parsed successfully\")\n",
    "            print(f\"🔍 Data type: {type(data)}\")\n",
    "            if isinstance(data, dict):\n",
    "                print(f\"🔍 Dict keys: {list(data.keys())}\")\n",
    "            elif isinstance(data, list):\n",
    "                print(f\"🔍 List length: {len(data)}\")\n",
    "        except Exception as e:\n",
    "            print(f\"❌ JSON parsing failed: {str(e)}\")\n",
    "            \n",
    "    except Exception as e:\n",
    "        print(f\"❌ Debug failed: {str(e)}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "75c90739",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "🧪 Testing the fixed LLM generation...\n",
      "🔄 Generating 5 survey responses with LLM...\n",
      "📊 Using max_tokens: 2000 (estimated: 2000)\n",
      "📝 Raw response length: 2629 characters\n",
      "🔍 Parsed JSON type: <class 'dict'>\n",
      "📊 Found data in 'responses': 5 rows\n",
      "✅ Successfully generated 5 survey responses\n",
      "\n",
      "📊 Generated dataset shape: (5, 18)\n",
      "\n",
      "📋 First few rows:\n",
      "                            response_id  respondent_id         submitted_at  \\\n",
      "0  d8b1c6f3-6f7a-4b4f-9c5f-3a5f8b6e2f1e          12345  2023-10-01 14:30:00   \n",
      "1  f3a8e3c1-9b4e-4e5e-9c2b-8f5e3c9b1f3d          67890  2023-10-01 15:00:00   \n",
      "2  c9c8e3f1-2b4f-4a6c-8c2e-2a5f3c8e1f2b          54321  2023-10-01 16:15:00   \n",
      "3  a5b3c6d2-1e4f-4c5e-9a1f-1f6a7b8e3c9f          98765  2023-10-01 17:45:00   \n",
      "4  b8f4c3e2-2e4f-4c5e-8a2f-4c5e3b8e2f1a          13579  2023-10-01 18:30:00   \n",
      "\n",
      "  country language   device  age     gender     education income_band  \\\n",
      "0      KE       en  android   29     female      bachelor   upper_mid   \n",
      "1      UG       sw      web   34       male       diploma   lower_mid   \n",
      "2      TZ       en      ios   42  nonbinary  postgraduate        high   \n",
      "3      RW       sw  android   27     female     secondary         low   \n",
      "4      NG       en      web   55       male      bachelor   upper_mid   \n",
      "\n",
      "   completion_seconds  attention_passed  q_quality  q_value  q_ease  \\\n",
      "0               420.0              True          5        4       4   \n",
      "1               600.0              True          3        3       2   \n",
      "2               300.5              True          4        5       4   \n",
      "3               720.0             False          2        3       3   \n",
      "4               540.0              True          5        5       5   \n",
      "\n",
      "   q_support  nps  is_detractor  \n",
      "0          5    9         False  \n",
      "1          4    5         False  \n",
      "2          5   10         False  \n",
      "3          2    3          True  \n",
      "4          5    8         False  \n",
      "\n",
      "📈 Data types:\n",
      "response_id            object\n",
      "respondent_id           int64\n",
      "submitted_at           object\n",
      "country                object\n",
      "language               object\n",
      "device                 object\n",
      "age                     int64\n",
      "gender                 object\n",
      "education              object\n",
      "income_band            object\n",
      "completion_seconds    float64\n",
      "attention_passed         bool\n",
      "q_quality               int64\n",
      "q_value                 int64\n",
      "q_ease                  int64\n",
      "q_support               int64\n",
      "nps                     int64\n",
      "is_detractor             bool\n",
      "dtype: object\n",
      "\n",
      "✅ SUCCESS! LLM generation is now working!\n",
      "📊 Generated 5 survey responses using LLM\n"
     ]
    }
   ],
   "source": [
    "# Test the fixed implementation\n",
    "print(\"🧪 Testing the fixed LLM generation...\")\n",
    "\n",
    "# Test with small dataset\n",
    "test_df = fixed_llm_generate_batch(CFG, 5)\n",
    "print(f\"\\n📊 Generated dataset shape: {test_df.shape}\")\n",
    "print(f\"\\n📋 First few rows:\")\n",
    "print(test_df.head())\n",
    "print(f\"\\n📈 Data types:\")\n",
    "print(test_df.dtypes)\n",
    "\n",
    "if not test_df.empty:\n",
    "    print(f\"\\n✅ SUCCESS! LLM generation is now working!\")\n",
    "    print(f\"📊 Generated {len(test_df)} survey responses using LLM\")\n",
    "else:\n",
    "    print(f\"\\n❌ Still having issues with LLM generation\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "dd83b842",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "🚀 Testing larger dataset generation...\n",
      "🚀 Generating 100 survey responses with adaptive batching\n",
      "📊 Using optimal batch size: 10\n",
      "\n",
      "📦 Processing batch: 10 rows (remaining: 100)\n",
      "🔄 Generating 10 survey responses with LLM...\n",
      "📊 Using max_tokens: 3500 (estimated: 3500)\n",
      "📝 Raw response length: 5238 characters\n",
      "🔍 Parsed JSON type: <class 'dict'>\n",
      "📊 Found data in 'responses': 10 rows\n",
      "✅ Successfully generated 10 survey responses\n",
      "\n",
      "📦 Processing batch: 10 rows (remaining: 90)\n",
      "🔄 Generating 10 survey responses with LLM...\n",
      "📊 Using max_tokens: 3500 (estimated: 3500)\n",
      "📝 Raw response length: 5235 characters\n",
      "🔍 Parsed JSON type: <class 'dict'>\n",
      "📊 Found data in 'responses': 10 rows\n",
      "✅ Successfully generated 10 survey responses\n",
      "\n",
      "📦 Processing batch: 10 rows (remaining: 80)\n",
      "🔄 Generating 10 survey responses with LLM...\n",
      "📊 Using max_tokens: 3500 (estimated: 3500)\n",
      "📝 Raw response length: 5232 characters\n",
      "🔍 Parsed JSON type: <class 'dict'>\n",
      "📊 Found data in 'responses': 10 rows\n",
      "✅ Successfully generated 10 survey responses\n",
      "\n",
      "📦 Processing batch: 10 rows (remaining: 70)\n",
      "🔄 Generating 10 survey responses with LLM...\n",
      "📊 Using max_tokens: 3500 (estimated: 3500)\n",
      "📝 Raw response length: 5239 characters\n",
      "🔍 Parsed JSON type: <class 'dict'>\n",
      "📊 Found data in 'responses': 10 rows\n",
      "✅ Successfully generated 10 survey responses\n",
      "\n",
      "📦 Processing batch: 10 rows (remaining: 60)\n",
      "🔄 Generating 10 survey responses with LLM...\n",
      "📊 Using max_tokens: 3500 (estimated: 3500)\n",
      "📝 Raw response length: 5238 characters\n",
      "🔍 Parsed JSON type: <class 'dict'>\n",
      "📊 Found data in 'responses': 10 rows\n",
      "✅ Successfully generated 10 survey responses\n",
      "\n",
      "📦 Processing batch: 10 rows (remaining: 50)\n",
      "🔄 Generating 10 survey responses with LLM...\n",
      "📊 Using max_tokens: 3500 (estimated: 3500)\n",
      "📝 Raw response length: 5236 characters\n",
      "🔍 Parsed JSON type: <class 'dict'>\n",
      "📊 Found data in 'responses': 10 rows\n",
      "✅ Successfully generated 10 survey responses\n",
      "\n",
      "📦 Processing batch: 10 rows (remaining: 40)\n",
      "🔄 Generating 10 survey responses with LLM...\n",
      "📊 Using max_tokens: 3500 (estimated: 3500)\n",
      "📝 Raw response length: 5229 characters\n",
      "🔍 Parsed JSON type: <class 'dict'>\n",
      "📊 Found data in 'responses': 10 rows\n",
      "✅ Successfully generated 10 survey responses\n",
      "\n",
      "📦 Processing batch: 10 rows (remaining: 30)\n",
      "🔄 Generating 10 survey responses with LLM...\n",
      "📊 Using max_tokens: 3500 (estimated: 3500)\n",
      "📝 Raw response length: 5244 characters\n",
      "🔍 Parsed JSON type: <class 'dict'>\n",
      "📊 Found data in 'responses': 10 rows\n",
      "✅ Successfully generated 10 survey responses\n",
      "\n",
      "📦 Processing batch: 10 rows (remaining: 20)\n",
      "🔄 Generating 10 survey responses with LLM...\n",
      "📊 Using max_tokens: 3500 (estimated: 3500)\n",
      "📝 Raw response length: 5234 characters\n",
      "🔍 Parsed JSON type: <class 'dict'>\n",
      "📊 Found data in 'responses': 10 rows\n",
      "✅ Successfully generated 10 survey responses\n",
      "\n",
      "📦 Processing batch: 10 rows (remaining: 10)\n",
      "🔄 Generating 10 survey responses with LLM...\n",
      "📊 Using max_tokens: 3500 (estimated: 3500)\n",
      "📝 Raw response length: 5238 characters\n",
      "🔍 Parsed JSON type: <class 'dict'>\n",
      "📊 Found data in 'responses': 10 rows\n",
      "✅ Successfully generated 10 survey responses\n",
      "✅ Generated total: 100 survey responses\n",
      "\n",
      "📊 Large dataset shape: (100, 18)\n",
      "\n",
      "📈 Summary statistics:\n",
      "       respondent_id         age  completion_seconds   q_quality     q_value  \\\n",
      "count     100.000000  100.000000          100.000000  100.000000  100.000000   \n",
      "mean    33513.700000   34.070000          588.525000    3.740000    3.910000   \n",
      "std     29233.800863    7.835757          230.530212    1.001211    0.995901   \n",
      "min     10001.000000   22.000000          120.500000    2.000000    2.000000   \n",
      "25%     10009.000000   28.000000          420.375000    3.000000    3.000000   \n",
      "50%     15122.500000   33.000000          600.000000    4.000000    4.000000   \n",
      "75%     55955.750000   39.250000          720.000000    5.000000    5.000000   \n",
      "max     98765.000000   50.000000         1500.000000    5.000000    5.000000   \n",
      "\n",
      "           q_ease   q_support         nps  \n",
      "count  100.000000  100.000000  100.000000  \n",
      "mean     3.900000    3.910000    6.990000  \n",
      "std      0.937437    0.985706    2.333312  \n",
      "min      2.000000    2.000000    2.000000  \n",
      "25%      3.000000    3.000000    5.000000  \n",
      "50%      4.000000    4.000000    7.000000  \n",
      "75%      5.000000    5.000000    9.000000  \n",
      "max      5.000000    5.000000   10.000000  \n",
      "💾 Saved: data\\survey_llm_fixed_20251023T005139Z.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Joshua\\AppData\\Local\\Temp\\ipykernel_27572\\2716383900.py:12: DeprecationWarning: datetime.datetime.utcnow() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.now(datetime.UTC).\n",
      "  ts = datetime.utcnow().strftime(\"%Y%m%dT%H%M%SZ\")\n"
     ]
    }
   ],
   "source": [
    "#Test larger dataset generation \n",
    "print(\"🚀 Testing larger dataset generation...\")\n",
    "large_df = fixed_generate_llm(CFG, total_rows=100, batch_size=25)\n",
    "if not large_df.empty:\n",
    "    print(f\"\\n📊 Large dataset shape: {large_df.shape}\")\n",
    "    print(f\"\\n📈 Summary statistics:\")\n",
    "    print(large_df.describe())\n",
    "    \n",
    "    # Save the results\n",
    "    from pathlib import Path\n",
    "    out = Path(\"data\"); out.mkdir(exist_ok=True)\n",
    "    ts = datetime.utcnow().strftime(\"%Y%m%dT%H%M%SZ\")\n",
    "    csv_path = out / f\"survey_llm_fixed_{ts}.csv\"\n",
    "    large_df.to_csv(csv_path, index=False)\n",
    "    print(f\"💾 Saved: {csv_path}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6029d3e2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LLM available: True\n"
     ]
    }
   ],
   "source": [
    "\n",
    "def build_json_schema(CFG):\n",
    "    schema = {'type':'array','items':{'type':'object','properties':{},'required':[]}}\n",
    "    props = schema['items']['properties']; req = schema['items']['required']\n",
    "    for f in CFG['fields']:\n",
    "        name, t = f['name'], f['type']\n",
    "        req.append(name)\n",
    "        if t in ('int','float'): props[name] = {'type':'number' if t=='float' else 'integer'}\n",
    "        elif t == 'enum': props[name] = {'type':'string','enum': f['values']}\n",
    "        elif t in ('uuid4','datetime'): props[name] = {'type':'string'}\n",
    "        elif t == 'bool': props[name] = {'type':'boolean'}\n",
    "        else: props[name] = {'type':'string'}\n",
    "    return schema\n",
    "\n",
    "PROMPT_PREAMBLE = (\n",
    "    \"You are a data generator. Return ONLY JSON. \"\n",
    "    \"Respond as a JSON object with key 'rows' whose value is an array of exactly N objects. \"\n",
    "    \"No prose, no code fences, no trailing commas.\"\n",
    ")\n",
    "\n",
    "def render_prompt(CFG, n_rows=100):\n",
    "    minimal_cfg = {'fields': []}\n",
    "    for f in CFG['fields']:\n",
    "        base = {k: f[k] for k in ['name','type'] if k in f}\n",
    "        if 'min' in f and 'max' in f: base.update({'min': f['min'], 'max': f['max']})\n",
    "        if 'values' in f: base.update({'values': f['values']})\n",
    "        if 'fmt' in f: base.update({'fmt': f['fmt']})\n",
    "        minimal_cfg['fields'].append(base)\n",
    "    return {\n",
    "        'preamble': PROMPT_PREAMBLE,\n",
    "        'n_rows': n_rows,\n",
    "        'schema': build_json_schema(CFG),\n",
    "        'constraints': minimal_cfg,\n",
    "        'instruction': f\"Return ONLY this structure: {{'rows': [ ... exactly {n_rows} objects ... ]}}\"\n",
    "    }\n",
    "\n",
    "def parse_llm_json_to_df(raw: str) -> pd.DataFrame:\n",
    "    try:\n",
    "        obj = json.loads(raw)\n",
    "        if isinstance(obj, dict) and isinstance(obj.get('rows'), list):\n",
    "            return pd.DataFrame(obj['rows'])\n",
    "    except Exception:\n",
    "        pass\n",
    "    data = extract_strict_json(raw)\n",
    "    return pd.DataFrame(data)\n",
    "\n",
    "USE_LLM = bool(os.getenv('OPENAI_API_KEY'))\n",
    "print('LLM available:', USE_LLM)\n",
    "\n",
    "def llm_generate_batch(CFG, n_rows=50):\n",
    "    if USE_LLM:\n",
    "        try:\n",
    "            from openai import OpenAI\n",
    "            client = OpenAI()\n",
    "            prompt = json.dumps(render_prompt(CFG, n_rows))\n",
    "            resp = client.chat.completions.create(\n",
    "                model='gpt-4o-mini',\n",
    "                response_format={'type': 'json_object'},\n",
    "                messages=[\n",
    "                    {'role':'system','content':'You output strict JSON only.'},\n",
    "                    {'role':'user','content': prompt}\n",
    "                ],\n",
    "                temperature=0.2,\n",
    "                max_tokens=8192,\n",
    "            )\n",
    "            raw = resp.choices[0].message.content\n",
    "            try:\n",
    "                return parse_llm_json_to_df(raw)\n",
    "            except Exception:\n",
    "                stricter = (\n",
    "                    prompt\n",
    "                    + \"\\nReturn ONLY a JSON object structured as: \"\n",
    "                    + \"{\\\"rows\\\": [ ... exactly N objects ... ]}. \"\n",
    "                    + \"No prose, no explanations.\"\n",
    "                )\n",
    "                resp2 = client.chat.completions.create(\n",
    "                    model='gpt-4o-mini',\n",
    "                    response_format={'type': 'json_object'},\n",
    "                    messages=[\n",
    "                        {'role':'system','content':'You output strict JSON only.'},\n",
    "                        {'role':'user','content': stricter}\n",
    "                    ],\n",
    "                    temperature=0.2,\n",
    "                    max_tokens=8192,\n",
    "                )\n",
    "                raw2 = resp2.choices[0].message.content\n",
    "                return parse_llm_json_to_df(raw2)\n",
    "        except Exception as e:\n",
    "            print('LLM error, fallback to rule-based mock:', e)\n",
    "    tmp = dict(CFG); tmp['rows'] = n_rows\n",
    "    return generate_rule_based(tmp)\n",
    "\n",
    "def generate_llm(CFG, total_rows=200, batch_size=50):\n",
    "    dfs = []; remaining = total_rows\n",
    "    while remaining > 0:\n",
    "        b = min(batch_size, remaining)\n",
    "        dfs.append(llm_generate_batch(CFG, n_rows=b))\n",
    "        remaining -= b\n",
    "        time.sleep(0.2)\n",
    "    return pd.concat(dfs, ignore_index=True)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "2e759087",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LLM error, fallback to rule-based mock: No JSON array found in model output.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>response_id</th>\n",
       "      <th>respondent_id</th>\n",
       "      <th>submitted_at</th>\n",
       "      <th>country</th>\n",
       "      <th>language</th>\n",
       "      <th>device</th>\n",
       "      <th>age</th>\n",
       "      <th>gender</th>\n",
       "      <th>education</th>\n",
       "      <th>income_band</th>\n",
       "      <th>completion_seconds</th>\n",
       "      <th>attention_passed</th>\n",
       "      <th>q_quality</th>\n",
       "      <th>q_value</th>\n",
       "      <th>q_ease</th>\n",
       "      <th>q_support</th>\n",
       "      <th>nps</th>\n",
       "      <th>is_detractor</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>9e7811bd-27ee-4b7c-9b7a-c98441e337f0</td>\n",
       "      <td>40160</td>\n",
       "      <td>2024-08-18 19:10:06</td>\n",
       "      <td>KE</td>\n",
       "      <td>sw</td>\n",
       "      <td>web</td>\n",
       "      <td>28</td>\n",
       "      <td>male</td>\n",
       "      <td>secondary</td>\n",
       "      <td>lower_mid</td>\n",
       "      <td>1800.000000</td>\n",
       "      <td>True</td>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>85ec8b90-5468-4880-8309-e325da14d877</td>\n",
       "      <td>55381</td>\n",
       "      <td>2025-01-24 12:21:13</td>\n",
       "      <td>TZ</td>\n",
       "      <td>sw</td>\n",
       "      <td>ios</td>\n",
       "      <td>23</td>\n",
       "      <td>female</td>\n",
       "      <td>bachelor</td>\n",
       "      <td>high</td>\n",
       "      <td>431.412783</td>\n",
       "      <td>True</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>498dff10-040f-4206-8170-dfce0d5a69f0</td>\n",
       "      <td>48338</td>\n",
       "      <td>2025-07-15 22:21:54</td>\n",
       "      <td>TZ</td>\n",
       "      <td>en</td>\n",
       "      <td>ios</td>\n",
       "      <td>49</td>\n",
       "      <td>male</td>\n",
       "      <td>bachelor</td>\n",
       "      <td>low</td>\n",
       "      <td>1800.000000</td>\n",
       "      <td>True</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ddf11d94-5d6e-4322-9811-4e763f5ed46b</td>\n",
       "      <td>59925</td>\n",
       "      <td>2025-01-27 00:16:57</td>\n",
       "      <td>KE</td>\n",
       "      <td>en</td>\n",
       "      <td>web</td>\n",
       "      <td>22</td>\n",
       "      <td>male</td>\n",
       "      <td>bachelor</td>\n",
       "      <td>upper_mid</td>\n",
       "      <td>656.050991</td>\n",
       "      <td>True</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2ef22a0c-fd13-4798-9276-f43831b8f7bc</td>\n",
       "      <td>68993</td>\n",
       "      <td>2024-08-19 04:21:49</td>\n",
       "      <td>KE</td>\n",
       "      <td>en</td>\n",
       "      <td>android</td>\n",
       "      <td>40</td>\n",
       "      <td>male</td>\n",
       "      <td>secondary</td>\n",
       "      <td>lower_mid</td>\n",
       "      <td>1553.938944</td>\n",
       "      <td>True</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                            response_id  respondent_id         submitted_at  \\\n",
       "0  9e7811bd-27ee-4b7c-9b7a-c98441e337f0          40160  2024-08-18 19:10:06   \n",
       "1  85ec8b90-5468-4880-8309-e325da14d877          55381  2025-01-24 12:21:13   \n",
       "2  498dff10-040f-4206-8170-dfce0d5a69f0          48338  2025-07-15 22:21:54   \n",
       "3  ddf11d94-5d6e-4322-9811-4e763f5ed46b          59925  2025-01-27 00:16:57   \n",
       "4  2ef22a0c-fd13-4798-9276-f43831b8f7bc          68993  2024-08-19 04:21:49   \n",
       "\n",
       "  country language   device  age  gender  education income_band  \\\n",
       "0      KE       sw      web   28    male  secondary   lower_mid   \n",
       "1      TZ       sw      ios   23  female   bachelor        high   \n",
       "2      TZ       en      ios   49    male   bachelor         low   \n",
       "3      KE       en      web   22    male   bachelor   upper_mid   \n",
       "4      KE       en  android   40    male  secondary   lower_mid   \n",
       "\n",
       "   completion_seconds  attention_passed  q_quality  q_value  q_ease  \\\n",
       "0         1800.000000              True          4        3       3   \n",
       "1          431.412783              True          3        2       3   \n",
       "2         1800.000000              True          2        3       3   \n",
       "3          656.050991              True          4        4       1   \n",
       "4         1553.938944              True          2        2       5   \n",
       "\n",
       "   q_support  nps  is_detractor  \n",
       "0          3    4          True  \n",
       "1          4    4         False  \n",
       "2          1    3         False  \n",
       "3          3    5         False  \n",
       "4          1    5         False  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_llm = generate_llm(CFG, total_rows=100, batch_size=50)\n",
    "df_llm.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "6d4908ad",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "🧪 Testing improved LLM generation with adaptive batching...\n",
      "\n",
      "📦 Testing small batch (10 rows)...\n",
      "🔄 Generating 10 survey responses with LLM...\n",
      "📊 Using max_tokens: 3500 (estimated: 3500)\n",
      "📝 Raw response length: 5233 characters\n",
      "🔍 Parsed JSON type: <class 'dict'>\n",
      "📊 Found data in 'responses': 10 rows\n",
      "✅ Successfully generated 10 survey responses\n",
      "✅ Small batch result: 10 rows\n",
      "\n",
      "📦 Testing medium dataset (30 rows) with adaptive batching...\n",
      "🚀 Generating 30 survey responses with adaptive batching\n",
      "📊 Using optimal batch size: 15\n",
      "\n",
      "📦 Processing batch: 15 rows (remaining: 30)\n",
      "🔄 Generating 15 survey responses with LLM...\n",
      "📊 Using max_tokens: 5000 (estimated: 5000)\n",
      "📝 Raw response length: 7839 characters\n",
      "🔍 Parsed JSON type: <class 'dict'>\n",
      "📊 Found data in 'responses': 15 rows\n",
      "✅ Successfully generated 15 survey responses\n",
      "\n",
      "📦 Processing batch: 15 rows (remaining: 15)\n",
      "🔄 Generating 15 survey responses with LLM...\n",
      "📊 Using max_tokens: 5000 (estimated: 5000)\n",
      "📝 Raw response length: 7841 characters\n",
      "🔍 Parsed JSON type: <class 'dict'>\n",
      "📊 Found data in 'responses': 15 rows\n",
      "✅ Successfully generated 15 survey responses\n",
      "✅ Generated total: 30 survey responses\n",
      "✅ Medium dataset result: 30 rows\n",
      "\n",
      "📊 Dataset shape: (30, 18)\n",
      "\n",
      "📋 First few rows:\n",
      "                            response_id  respondent_id         submitted_at  \\\n",
      "0  d1e5c4a3-4b1f-4f6b-8f9e-9f1e1f2e3d4c          10001  2023-10-01 14:30:00   \n",
      "1  c2b1d4a6-7f8e-4c5c-9d8f-1e2c3b4a5e6f          10002  2023-10-01 15:00:00   \n",
      "2  e3f2c5b7-8a2d-4c8e-9f1b-2c3d4e5f6a7b          10003  2023-10-01 15:30:00   \n",
      "3  f4a5b6c8-9d3e-4b1f-9f2c-3d4e5f6a7b8c          10004  2023-10-01 16:00:00   \n",
      "4  g5b6c7d9-0e4f-4b2a-8f3d-4e5f6a7b8c9d          10005  2023-10-01 16:30:00   \n",
      "\n",
      "  country language   device  age     gender     education income_band  \\\n",
      "0      KE       en  android   28     female      bachelor   upper_mid   \n",
      "1      UG       sw      web   35       male       diploma   lower_mid   \n",
      "2      TZ       en      ios   42  nonbinary  postgraduate        high   \n",
      "3      RW       sw      web   29     female     secondary   upper_mid   \n",
      "4      NG       en  android   50       male      bachelor        high   \n",
      "\n",
      "   completion_seconds  attention_passed  q_quality  q_value  q_ease  \\\n",
      "0               450.0              True          5        4       5   \n",
      "1               600.0              True          3        2       4   \n",
      "2               720.0              True          4        5       4   \n",
      "3               300.0              True          3        3       3   \n",
      "4               540.0              True          5        5       5   \n",
      "\n",
      "   q_support  nps  is_detractor  \n",
      "0          4    9         False  \n",
      "1          3    5         False  \n",
      "2          5   10         False  \n",
      "3          4    6         False  \n",
      "4          5   10         False  \n",
      "💾 Saved: data\\survey_adaptive_batch_20251023T005927Z.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Joshua\\AppData\\Local\\Temp\\ipykernel_27572\\1770033334.py:22: DeprecationWarning: datetime.datetime.utcnow() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.now(datetime.UTC).\n",
      "  ts = datetime.utcnow().strftime(\"%Y%m%dT%H%M%SZ\")\n"
     ]
    }
   ],
   "source": [
    "# Test the improved LLM generation with adaptive batching\n",
    "print(\"🧪 Testing improved LLM generation with adaptive batching...\")\n",
    "\n",
    "# Test with smaller dataset first\n",
    "print(\"\\n📦 Testing small batch (10 rows)...\")\n",
    "small_df = fixed_llm_generate_batch(CFG, 10)\n",
    "print(f\"✅ Small batch result: {len(small_df)} rows\")\n",
    "\n",
    "# Test with medium dataset using adaptive batching\n",
    "print(\"\\n📦 Testing medium dataset (30 rows) with adaptive batching...\")\n",
    "medium_df = fixed_generate_llm(CFG, total_rows=30, batch_size=15)\n",
    "print(f\"✅ Medium dataset result: {len(medium_df)} rows\")\n",
    "\n",
    "if not medium_df.empty:\n",
    "    print(f\"\\n📊 Dataset shape: {medium_df.shape}\")\n",
    "    print(f\"\\n📋 First few rows:\")\n",
    "    print(medium_df.head())\n",
    "    \n",
    "    # Save the results\n",
    "    from pathlib import Path\n",
    "    out = Path(\"data\"); out.mkdir(exist_ok=True)\n",
    "    ts = datetime.utcnow().strftime(\"%Y%m%dT%H%M%SZ\")\n",
    "    csv_path = out / f\"survey_adaptive_batch_{ts}.csv\"\n",
    "    medium_df.to_csv(csv_path, index=False)\n",
    "    print(f\"💾 Saved: {csv_path}\")\n",
    "else:\n",
    "    print(\"❌ Medium dataset generation failed\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}