diff --git a/week3/community-contributions/week3_Exercise_survey_Dataset_Generation.ipynb b/week3/community-contributions/week3_Exercise_survey_Dataset_Generation.ipynb
index a11fa96..a4474af 100644
--- a/week3/community-contributions/week3_Exercise_survey_Dataset_Generation.ipynb
+++ b/week3/community-contributions/week3_Exercise_survey_Dataset_Generation.ipynb
@@ -10,18 +10,10 @@
},
{
"cell_type": "code",
- "execution_count": 34,
+ "execution_count": null,
"id": "8d86f629",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "โ
Base libraries ready. Pandera available: True\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"\n",
"import os, re, json, time, uuid, math, random\n",
@@ -35,7 +27,7 @@
},
{
"cell_type": "code",
- "execution_count": 23,
+ "execution_count": null,
"id": "f196ae73",
"metadata": {},
"outputs": [],
@@ -94,18 +86,10 @@
},
{
"cell_type": "code",
- "execution_count": 36,
+ "execution_count": null,
"id": "d16bd03a",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Loaded config for 800 rows and 18 fields.\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"\n",
"CFG = {\n",
@@ -145,7 +129,7 @@
},
{
"cell_type": "code",
- "execution_count": 37,
+ "execution_count": null,
"id": "d2f5fdff",
"metadata": {},
"outputs": [],
@@ -208,196 +192,10 @@
},
{
"cell_type": "code",
- "execution_count": 38,
+ "execution_count": null,
"id": "cd61330d",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " response_id | \n",
- " respondent_id | \n",
- " submitted_at | \n",
- " country | \n",
- " language | \n",
- " device | \n",
- " age | \n",
- " gender | \n",
- " education | \n",
- " income_band | \n",
- " completion_seconds | \n",
- " attention_passed | \n",
- " q_quality | \n",
- " q_value | \n",
- " q_ease | \n",
- " q_support | \n",
- " nps | \n",
- " is_detractor | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " f099c1b6-a4ae-4fb0-ba98-89a81008c424 | \n",
- " 71615 | \n",
- " 2024-04-13 19:02:44 | \n",
- " ZA | \n",
- " en | \n",
- " web | \n",
- " 47 | \n",
- " male | \n",
- " secondary | \n",
- " low | \n",
- " 897.995012 | \n",
- " True | \n",
- " 5 | \n",
- " 3 | \n",
- " 1 | \n",
- " 3 | \n",
- " 4 | \n",
- " True | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " f2e20ad1-1ed1-4e33-8beb-5dd0ba23715b | \n",
- " 68564 | \n",
- " 2024-03-05 23:30:30 | \n",
- " KE | \n",
- " en | \n",
- " android | \n",
- " 67 | \n",
- " female | \n",
- " bachelor | \n",
- " lower_mid | \n",
- " 935.607966 | \n",
- " True | \n",
- " 1 | \n",
- " 5 | \n",
- " 2 | \n",
- " 3 | \n",
- " 5 | \n",
- " False | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " a9345f69-be75-46b9-8cd3-a276ce0a66bd | \n",
- " 59689 | \n",
- " 2024-11-10 03:38:07 | \n",
- " RW | \n",
- " sw | \n",
- " android | \n",
- " 23 | \n",
- " male | \n",
- " bachelor | \n",
- " low | \n",
- " 1431.517701 | \n",
- " True | \n",
- " 5 | \n",
- " 2 | \n",
- " 5 | \n",
- " 5 | \n",
- " 7 | \n",
- " False | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " b4fa8625-d153-4465-ad73-1c4a48eed2f1 | \n",
- " 20742 | \n",
- " 2024-11-19 17:40:58 | \n",
- " KE | \n",
- " en | \n",
- " ios | \n",
- " 68 | \n",
- " female | \n",
- " secondary | \n",
- " upper_mid | \n",
- " 448.519416 | \n",
- " True | \n",
- " 5 | \n",
- " 5 | \n",
- " 5 | \n",
- " 3 | \n",
- " 10 | \n",
- " False | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " e0ad4bbc-b576-4913-8786-302f06b5e9f7 | \n",
- " 63459 | \n",
- " 2024-07-28 04:23:37 | \n",
- " KE | \n",
- " en | \n",
- " ios | \n",
- " 34 | \n",
- " male | \n",
- " secondary | \n",
- " low | \n",
- " 1179.970734 | \n",
- " True | \n",
- " 3 | \n",
- " 1 | \n",
- " 3 | \n",
- " 3 | \n",
- " 5 | \n",
- " False | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " response_id respondent_id submitted_at \\\n",
- "0 f099c1b6-a4ae-4fb0-ba98-89a81008c424 71615 2024-04-13 19:02:44 \n",
- "1 f2e20ad1-1ed1-4e33-8beb-5dd0ba23715b 68564 2024-03-05 23:30:30 \n",
- "2 a9345f69-be75-46b9-8cd3-a276ce0a66bd 59689 2024-11-10 03:38:07 \n",
- "3 b4fa8625-d153-4465-ad73-1c4a48eed2f1 20742 2024-11-19 17:40:58 \n",
- "4 e0ad4bbc-b576-4913-8786-302f06b5e9f7 63459 2024-07-28 04:23:37 \n",
- "\n",
- " country language device age gender education income_band \\\n",
- "0 ZA en web 47 male secondary low \n",
- "1 KE en android 67 female bachelor lower_mid \n",
- "2 RW sw android 23 male bachelor low \n",
- "3 KE en ios 68 female secondary upper_mid \n",
- "4 KE en ios 34 male secondary low \n",
- "\n",
- " completion_seconds attention_passed q_quality q_value q_ease \\\n",
- "0 897.995012 True 5 3 1 \n",
- "1 935.607966 True 1 5 2 \n",
- "2 1431.517701 True 5 2 5 \n",
- "3 448.519416 True 5 5 5 \n",
- "4 1179.970734 True 3 1 3 \n",
- "\n",
- " q_support nps is_detractor \n",
- "0 3 4 True \n",
- "1 3 5 False \n",
- "2 5 7 False \n",
- "3 3 10 False \n",
- "4 3 5 False "
- ]
- },
- "execution_count": 38,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"\n",
"def generate_rule_based(CFG: Dict[str, Any]) -> pd.DataFrame:\n",
@@ -450,245 +248,10 @@
},
{
"cell_type": "code",
- "execution_count": 39,
+ "execution_count": null,
"id": "9a4ef86a",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Validation error: {\n",
- " \"SCHEMA\": {\n",
- " \"WRONG_DATATYPE\": [\n",
- " {\n",
- " \"schema\": null,\n",
- " \"column\": \"respondent_id\",\n",
- " \"check\": \"dtype('int64')\",\n",
- " \"error\": \"expected series 'respondent_id' to have type int64, got int32\"\n",
- " },\n",
- " {\n",
- " \"schema\": null,\n",
- " \"column\": \"age\",\n",
- " \"check\": \"dtype('int64')\",\n",
- " \"error\": \"expected series 'age' to have type int64, got int32\"\n",
- " },\n",
- " {\n",
- " \"schema\": null,\n",
- " \"column\": \"q_quality\",\n",
- " \"check\": \"dtype('int64')\",\n",
- " \"error\": \"expected series 'q_quality' to have type int64, got int32\"\n",
- " },\n",
- " {\n",
- " \"schema\": null,\n",
- " \"column\": \"q_value\",\n",
- " \"check\": \"dtype('int64')\",\n",
- " \"error\": \"expected series 'q_value' to have type int64, got int32\"\n",
- " },\n",
- " {\n",
- " \"schema\": null,\n",
- " \"column\": \"q_ease\",\n",
- " \"check\": \"dtype('int64')\",\n",
- " \"error\": \"expected series 'q_ease' to have type int64, got int32\"\n",
- " },\n",
- " {\n",
- " \"schema\": null,\n",
- " \"column\": \"q_support\",\n",
- " \"check\": \"dtype('int64')\",\n",
- " \"error\": \"expected series 'q_support' to have type int64, got int32\"\n",
- " }\n",
- " ]\n",
- " }\n",
- "}\n",
- "{'engine': 'pandera', 'valid_rows': 800, 'invalid_rows': 0, 'notes': 'Non-strict mode.'}\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " response_id | \n",
- " respondent_id | \n",
- " submitted_at | \n",
- " country | \n",
- " language | \n",
- " device | \n",
- " age | \n",
- " gender | \n",
- " education | \n",
- " income_band | \n",
- " completion_seconds | \n",
- " attention_passed | \n",
- " q_quality | \n",
- " q_value | \n",
- " q_ease | \n",
- " q_support | \n",
- " nps | \n",
- " is_detractor | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " f099c1b6-a4ae-4fb0-ba98-89a81008c424 | \n",
- " 71615 | \n",
- " 2024-04-13 19:02:44 | \n",
- " ZA | \n",
- " en | \n",
- " web | \n",
- " 47 | \n",
- " male | \n",
- " secondary | \n",
- " low | \n",
- " 897.995012 | \n",
- " True | \n",
- " 5 | \n",
- " 3 | \n",
- " 1 | \n",
- " 3 | \n",
- " 4 | \n",
- " True | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " f2e20ad1-1ed1-4e33-8beb-5dd0ba23715b | \n",
- " 68564 | \n",
- " 2024-03-05 23:30:30 | \n",
- " KE | \n",
- " en | \n",
- " android | \n",
- " 67 | \n",
- " female | \n",
- " bachelor | \n",
- " lower_mid | \n",
- " 935.607966 | \n",
- " True | \n",
- " 1 | \n",
- " 5 | \n",
- " 2 | \n",
- " 3 | \n",
- " 5 | \n",
- " False | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " a9345f69-be75-46b9-8cd3-a276ce0a66bd | \n",
- " 59689 | \n",
- " 2024-11-10 03:38:07 | \n",
- " RW | \n",
- " sw | \n",
- " android | \n",
- " 23 | \n",
- " male | \n",
- " bachelor | \n",
- " low | \n",
- " 1431.517701 | \n",
- " True | \n",
- " 5 | \n",
- " 2 | \n",
- " 5 | \n",
- " 5 | \n",
- " 7 | \n",
- " False | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " b4fa8625-d153-4465-ad73-1c4a48eed2f1 | \n",
- " 20742 | \n",
- " 2024-11-19 17:40:58 | \n",
- " KE | \n",
- " en | \n",
- " ios | \n",
- " 68 | \n",
- " female | \n",
- " secondary | \n",
- " upper_mid | \n",
- " 448.519416 | \n",
- " True | \n",
- " 5 | \n",
- " 5 | \n",
- " 5 | \n",
- " 3 | \n",
- " 10 | \n",
- " False | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " e0ad4bbc-b576-4913-8786-302f06b5e9f7 | \n",
- " 63459 | \n",
- " 2024-07-28 04:23:37 | \n",
- " KE | \n",
- " en | \n",
- " ios | \n",
- " 34 | \n",
- " male | \n",
- " secondary | \n",
- " low | \n",
- " 1179.970734 | \n",
- " True | \n",
- " 3 | \n",
- " 1 | \n",
- " 3 | \n",
- " 3 | \n",
- " 5 | \n",
- " False | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " response_id respondent_id submitted_at \\\n",
- "0 f099c1b6-a4ae-4fb0-ba98-89a81008c424 71615 2024-04-13 19:02:44 \n",
- "1 f2e20ad1-1ed1-4e33-8beb-5dd0ba23715b 68564 2024-03-05 23:30:30 \n",
- "2 a9345f69-be75-46b9-8cd3-a276ce0a66bd 59689 2024-11-10 03:38:07 \n",
- "3 b4fa8625-d153-4465-ad73-1c4a48eed2f1 20742 2024-11-19 17:40:58 \n",
- "4 e0ad4bbc-b576-4913-8786-302f06b5e9f7 63459 2024-07-28 04:23:37 \n",
- "\n",
- " country language device age gender education income_band \\\n",
- "0 ZA en web 47 male secondary low \n",
- "1 KE en android 67 female bachelor lower_mid \n",
- "2 RW sw android 23 male bachelor low \n",
- "3 KE en ios 68 female secondary upper_mid \n",
- "4 KE en ios 34 male secondary low \n",
- "\n",
- " completion_seconds attention_passed q_quality q_value q_ease \\\n",
- "0 897.995012 True 5 3 1 \n",
- "1 935.607966 True 1 5 2 \n",
- "2 1431.517701 True 5 2 5 \n",
- "3 448.519416 True 5 5 5 \n",
- "4 1179.970734 True 3 1 3 \n",
- "\n",
- " q_support nps is_detractor \n",
- "0 3 4 True \n",
- "1 3 5 False \n",
- "2 5 7 False \n",
- "3 3 10 False \n",
- "4 3 5 False "
- ]
- },
- "execution_count": 39,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"\n",
"def build_pandera_schema(CFG):\n",
@@ -732,26 +295,10 @@
},
{
"cell_type": "code",
- "execution_count": 40,
+ "execution_count": null,
"id": "73626b4c",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Saved: data/survey_rule_20251023T004106Z.csv\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "C:\\Users\\Joshua\\AppData\\Local\\Temp\\ipykernel_27572\\1233117399.py:3: DeprecationWarning: datetime.datetime.utcnow() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.now(datetime.UTC).\n",
- " ts = datetime.utcnow().strftime(\"%Y%m%dT%H%M%SZ\")\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"\n",
"from pathlib import Path\n",
@@ -772,7 +319,7 @@
},
{
"cell_type": "code",
- "execution_count": 41,
+ "execution_count": null,
"id": "24e94771",
"metadata": {},
"outputs": [],
@@ -1067,73 +614,7 @@
"execution_count": null,
"id": "e1af410e",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "๐งช Testing LLM generation...\n",
- "๐ Generating 10 survey responses with LLM...\n",
- "๐ Using max_tokens: 3500 (estimated: 3500)\n",
- "๐ Raw response length: 5236 characters\n",
- "๐ Parsed JSON type: \n",
- "๐ Found data in 'responses': 10 rows\n",
- "โ
Successfully generated 10 survey responses\n",
- "\n",
- "๐ Generated dataset shape: (10, 18)\n",
- "\n",
- "๐ First few rows:\n",
- " response_id respondent_id submitted_at \\\n",
- "0 f3e9b9d1-4e9e-4f8a-9b5c-7e3cbb1c4e5e 10234 2023-10-01 14:23:45 \n",
- "1 a1c5f6d3-1f5b-4e8a-8c7a-5e2c3f4b8e1b 20456 2023-10-01 15:10:12 \n",
- "2 c2b3e4f5-5d6e-4b8a-9f3c-8e1a2f9b4e3c 30567 2023-10-01 16:45:30 \n",
- "3 d4e5f6b7-6e8f-4b9a-8c7d-9e2f3c4b5e6f 40678 2023-10-01 17:30:00 \n",
- "4 e5f6a7b8-7f9a-4c0a-9e2f-1e3c4b5e6f7a 50789 2023-10-01 18:15:15 \n",
- "\n",
- " country language device age gender education income_band \\\n",
- "0 KE en android 29 female bachelor upper_mid \n",
- "1 UG sw web 34 male secondary lower_mid \n",
- "2 TZ en ios 42 nonbinary diploma high \n",
- "3 RW sw android 27 female bachelor upper_mid \n",
- "4 NG en web 36 male postgraduate high \n",
- "\n",
- " completion_seconds attention_passed q_quality q_value q_ease \\\n",
- "0 450.0 True 4 5 4 \n",
- "1 600.5 True 3 4 3 \n",
- "2 720.0 True 5 5 5 \n",
- "3 390.0 True 4 4 4 \n",
- "4 800.0 True 5 5 5 \n",
- "\n",
- " q_support nps is_detractor \n",
- "0 5 9 False \n",
- "1 4 7 False \n",
- "2 5 10 False \n",
- "3 4 8 False \n",
- "4 5 9 False \n",
- "\n",
- "๐ Data types:\n",
- "response_id object\n",
- "respondent_id int64\n",
- "submitted_at object\n",
- "country object\n",
- "language object\n",
- "device object\n",
- "age int64\n",
- "gender object\n",
- "education object\n",
- "income_band object\n",
- "completion_seconds float64\n",
- "attention_passed bool\n",
- "q_quality int64\n",
- "q_value int64\n",
- "q_ease int64\n",
- "q_support int64\n",
- "nps int64\n",
- "is_detractor bool\n",
- "dtype: object\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# Test the fixed LLM generation\n",
"print(\"๐งช Testing LLM generation...\")\n",
@@ -1196,79 +677,10 @@
},
{
"cell_type": "code",
- "execution_count": 43,
+ "execution_count": null,
"id": "75c90739",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "๐งช Testing the fixed LLM generation...\n",
- "๐ Generating 5 survey responses with LLM...\n",
- "๐ Using max_tokens: 2000 (estimated: 2000)\n",
- "๐ Raw response length: 2629 characters\n",
- "๐ Parsed JSON type: \n",
- "๐ Found data in 'responses': 5 rows\n",
- "โ
Successfully generated 5 survey responses\n",
- "\n",
- "๐ Generated dataset shape: (5, 18)\n",
- "\n",
- "๐ First few rows:\n",
- " response_id respondent_id submitted_at \\\n",
- "0 d8b1c6f3-6f7a-4b4f-9c5f-3a5f8b6e2f1e 12345 2023-10-01 14:30:00 \n",
- "1 f3a8e3c1-9b4e-4e5e-9c2b-8f5e3c9b1f3d 67890 2023-10-01 15:00:00 \n",
- "2 c9c8e3f1-2b4f-4a6c-8c2e-2a5f3c8e1f2b 54321 2023-10-01 16:15:00 \n",
- "3 a5b3c6d2-1e4f-4c5e-9a1f-1f6a7b8e3c9f 98765 2023-10-01 17:45:00 \n",
- "4 b8f4c3e2-2e4f-4c5e-8a2f-4c5e3b8e2f1a 13579 2023-10-01 18:30:00 \n",
- "\n",
- " country language device age gender education income_band \\\n",
- "0 KE en android 29 female bachelor upper_mid \n",
- "1 UG sw web 34 male diploma lower_mid \n",
- "2 TZ en ios 42 nonbinary postgraduate high \n",
- "3 RW sw android 27 female secondary low \n",
- "4 NG en web 55 male bachelor upper_mid \n",
- "\n",
- " completion_seconds attention_passed q_quality q_value q_ease \\\n",
- "0 420.0 True 5 4 4 \n",
- "1 600.0 True 3 3 2 \n",
- "2 300.5 True 4 5 4 \n",
- "3 720.0 False 2 3 3 \n",
- "4 540.0 True 5 5 5 \n",
- "\n",
- " q_support nps is_detractor \n",
- "0 5 9 False \n",
- "1 4 5 False \n",
- "2 5 10 False \n",
- "3 2 3 True \n",
- "4 5 8 False \n",
- "\n",
- "๐ Data types:\n",
- "response_id object\n",
- "respondent_id int64\n",
- "submitted_at object\n",
- "country object\n",
- "language object\n",
- "device object\n",
- "age int64\n",
- "gender object\n",
- "education object\n",
- "income_band object\n",
- "completion_seconds float64\n",
- "attention_passed bool\n",
- "q_quality int64\n",
- "q_value int64\n",
- "q_ease int64\n",
- "q_support int64\n",
- "nps int64\n",
- "is_detractor bool\n",
- "dtype: object\n",
- "\n",
- "โ
SUCCESS! LLM generation is now working!\n",
- "๐ Generated 5 survey responses using LLM\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# Test the fixed implementation\n",
"print(\"๐งช Testing the fixed LLM generation...\")\n",
@@ -1290,133 +702,10 @@
},
{
"cell_type": "code",
- "execution_count": 44,
+ "execution_count": null,
"id": "dd83b842",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "๐ Testing larger dataset generation...\n",
- "๐ Generating 100 survey responses with adaptive batching\n",
- "๐ Using optimal batch size: 10\n",
- "\n",
- "๐ฆ Processing batch: 10 rows (remaining: 100)\n",
- "๐ Generating 10 survey responses with LLM...\n",
- "๐ Using max_tokens: 3500 (estimated: 3500)\n",
- "๐ Raw response length: 5238 characters\n",
- "๐ Parsed JSON type: \n",
- "๐ Found data in 'responses': 10 rows\n",
- "โ
Successfully generated 10 survey responses\n",
- "\n",
- "๐ฆ Processing batch: 10 rows (remaining: 90)\n",
- "๐ Generating 10 survey responses with LLM...\n",
- "๐ Using max_tokens: 3500 (estimated: 3500)\n",
- "๐ Raw response length: 5235 characters\n",
- "๐ Parsed JSON type: \n",
- "๐ Found data in 'responses': 10 rows\n",
- "โ
Successfully generated 10 survey responses\n",
- "\n",
- "๐ฆ Processing batch: 10 rows (remaining: 80)\n",
- "๐ Generating 10 survey responses with LLM...\n",
- "๐ Using max_tokens: 3500 (estimated: 3500)\n",
- "๐ Raw response length: 5232 characters\n",
- "๐ Parsed JSON type: \n",
- "๐ Found data in 'responses': 10 rows\n",
- "โ
Successfully generated 10 survey responses\n",
- "\n",
- "๐ฆ Processing batch: 10 rows (remaining: 70)\n",
- "๐ Generating 10 survey responses with LLM...\n",
- "๐ Using max_tokens: 3500 (estimated: 3500)\n",
- "๐ Raw response length: 5239 characters\n",
- "๐ Parsed JSON type: \n",
- "๐ Found data in 'responses': 10 rows\n",
- "โ
Successfully generated 10 survey responses\n",
- "\n",
- "๐ฆ Processing batch: 10 rows (remaining: 60)\n",
- "๐ Generating 10 survey responses with LLM...\n",
- "๐ Using max_tokens: 3500 (estimated: 3500)\n",
- "๐ Raw response length: 5238 characters\n",
- "๐ Parsed JSON type: \n",
- "๐ Found data in 'responses': 10 rows\n",
- "โ
Successfully generated 10 survey responses\n",
- "\n",
- "๐ฆ Processing batch: 10 rows (remaining: 50)\n",
- "๐ Generating 10 survey responses with LLM...\n",
- "๐ Using max_tokens: 3500 (estimated: 3500)\n",
- "๐ Raw response length: 5236 characters\n",
- "๐ Parsed JSON type: \n",
- "๐ Found data in 'responses': 10 rows\n",
- "โ
Successfully generated 10 survey responses\n",
- "\n",
- "๐ฆ Processing batch: 10 rows (remaining: 40)\n",
- "๐ Generating 10 survey responses with LLM...\n",
- "๐ Using max_tokens: 3500 (estimated: 3500)\n",
- "๐ Raw response length: 5229 characters\n",
- "๐ Parsed JSON type: \n",
- "๐ Found data in 'responses': 10 rows\n",
- "โ
Successfully generated 10 survey responses\n",
- "\n",
- "๐ฆ Processing batch: 10 rows (remaining: 30)\n",
- "๐ Generating 10 survey responses with LLM...\n",
- "๐ Using max_tokens: 3500 (estimated: 3500)\n",
- "๐ Raw response length: 5244 characters\n",
- "๐ Parsed JSON type: \n",
- "๐ Found data in 'responses': 10 rows\n",
- "โ
Successfully generated 10 survey responses\n",
- "\n",
- "๐ฆ Processing batch: 10 rows (remaining: 20)\n",
- "๐ Generating 10 survey responses with LLM...\n",
- "๐ Using max_tokens: 3500 (estimated: 3500)\n",
- "๐ Raw response length: 5234 characters\n",
- "๐ Parsed JSON type: \n",
- "๐ Found data in 'responses': 10 rows\n",
- "โ
Successfully generated 10 survey responses\n",
- "\n",
- "๐ฆ Processing batch: 10 rows (remaining: 10)\n",
- "๐ Generating 10 survey responses with LLM...\n",
- "๐ Using max_tokens: 3500 (estimated: 3500)\n",
- "๐ Raw response length: 5238 characters\n",
- "๐ Parsed JSON type: \n",
- "๐ Found data in 'responses': 10 rows\n",
- "โ
Successfully generated 10 survey responses\n",
- "โ
Generated total: 100 survey responses\n",
- "\n",
- "๐ Large dataset shape: (100, 18)\n",
- "\n",
- "๐ Summary statistics:\n",
- " respondent_id age completion_seconds q_quality q_value \\\n",
- "count 100.000000 100.000000 100.000000 100.000000 100.000000 \n",
- "mean 33513.700000 34.070000 588.525000 3.740000 3.910000 \n",
- "std 29233.800863 7.835757 230.530212 1.001211 0.995901 \n",
- "min 10001.000000 22.000000 120.500000 2.000000 2.000000 \n",
- "25% 10009.000000 28.000000 420.375000 3.000000 3.000000 \n",
- "50% 15122.500000 33.000000 600.000000 4.000000 4.000000 \n",
- "75% 55955.750000 39.250000 720.000000 5.000000 5.000000 \n",
- "max 98765.000000 50.000000 1500.000000 5.000000 5.000000 \n",
- "\n",
- " q_ease q_support nps \n",
- "count 100.000000 100.000000 100.000000 \n",
- "mean 3.900000 3.910000 6.990000 \n",
- "std 0.937437 0.985706 2.333312 \n",
- "min 2.000000 2.000000 2.000000 \n",
- "25% 3.000000 3.000000 5.000000 \n",
- "50% 4.000000 4.000000 7.000000 \n",
- "75% 5.000000 5.000000 9.000000 \n",
- "max 5.000000 5.000000 10.000000 \n",
- "๐พ Saved: data\\survey_llm_fixed_20251023T005139Z.csv\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "C:\\Users\\Joshua\\AppData\\Local\\Temp\\ipykernel_27572\\2716383900.py:12: DeprecationWarning: datetime.datetime.utcnow() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.now(datetime.UTC).\n",
- " ts = datetime.utcnow().strftime(\"%Y%m%dT%H%M%SZ\")\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"#Test larger dataset generation \n",
"print(\"๐ Testing larger dataset generation...\")\n",
@@ -1440,15 +729,7 @@
"execution_count": null,
"id": "6029d3e2",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "LLM available: True\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"\n",
"def build_json_schema(CFG):\n",
@@ -1555,203 +836,10 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": null,
"id": "2e759087",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "LLM error, fallback to rule-based mock: No JSON array found in model output.\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " response_id | \n",
- " respondent_id | \n",
- " submitted_at | \n",
- " country | \n",
- " language | \n",
- " device | \n",
- " age | \n",
- " gender | \n",
- " education | \n",
- " income_band | \n",
- " completion_seconds | \n",
- " attention_passed | \n",
- " q_quality | \n",
- " q_value | \n",
- " q_ease | \n",
- " q_support | \n",
- " nps | \n",
- " is_detractor | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 9e7811bd-27ee-4b7c-9b7a-c98441e337f0 | \n",
- " 40160 | \n",
- " 2024-08-18 19:10:06 | \n",
- " KE | \n",
- " sw | \n",
- " web | \n",
- " 28 | \n",
- " male | \n",
- " secondary | \n",
- " lower_mid | \n",
- " 1800.000000 | \n",
- " True | \n",
- " 4 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 4 | \n",
- " True | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 85ec8b90-5468-4880-8309-e325da14d877 | \n",
- " 55381 | \n",
- " 2025-01-24 12:21:13 | \n",
- " TZ | \n",
- " sw | \n",
- " ios | \n",
- " 23 | \n",
- " female | \n",
- " bachelor | \n",
- " high | \n",
- " 431.412783 | \n",
- " True | \n",
- " 3 | \n",
- " 2 | \n",
- " 3 | \n",
- " 4 | \n",
- " 4 | \n",
- " False | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 498dff10-040f-4206-8170-dfce0d5a69f0 | \n",
- " 48338 | \n",
- " 2025-07-15 22:21:54 | \n",
- " TZ | \n",
- " en | \n",
- " ios | \n",
- " 49 | \n",
- " male | \n",
- " bachelor | \n",
- " low | \n",
- " 1800.000000 | \n",
- " True | \n",
- " 2 | \n",
- " 3 | \n",
- " 3 | \n",
- " 1 | \n",
- " 3 | \n",
- " False | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " ddf11d94-5d6e-4322-9811-4e763f5ed46b | \n",
- " 59925 | \n",
- " 2025-01-27 00:16:57 | \n",
- " KE | \n",
- " en | \n",
- " web | \n",
- " 22 | \n",
- " male | \n",
- " bachelor | \n",
- " upper_mid | \n",
- " 656.050991 | \n",
- " True | \n",
- " 4 | \n",
- " 4 | \n",
- " 1 | \n",
- " 3 | \n",
- " 5 | \n",
- " False | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 2ef22a0c-fd13-4798-9276-f43831b8f7bc | \n",
- " 68993 | \n",
- " 2024-08-19 04:21:49 | \n",
- " KE | \n",
- " en | \n",
- " android | \n",
- " 40 | \n",
- " male | \n",
- " secondary | \n",
- " lower_mid | \n",
- " 1553.938944 | \n",
- " True | \n",
- " 2 | \n",
- " 2 | \n",
- " 5 | \n",
- " 1 | \n",
- " 5 | \n",
- " False | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " response_id respondent_id submitted_at \\\n",
- "0 9e7811bd-27ee-4b7c-9b7a-c98441e337f0 40160 2024-08-18 19:10:06 \n",
- "1 85ec8b90-5468-4880-8309-e325da14d877 55381 2025-01-24 12:21:13 \n",
- "2 498dff10-040f-4206-8170-dfce0d5a69f0 48338 2025-07-15 22:21:54 \n",
- "3 ddf11d94-5d6e-4322-9811-4e763f5ed46b 59925 2025-01-27 00:16:57 \n",
- "4 2ef22a0c-fd13-4798-9276-f43831b8f7bc 68993 2024-08-19 04:21:49 \n",
- "\n",
- " country language device age gender education income_band \\\n",
- "0 KE sw web 28 male secondary lower_mid \n",
- "1 TZ sw ios 23 female bachelor high \n",
- "2 TZ en ios 49 male bachelor low \n",
- "3 KE en web 22 male bachelor upper_mid \n",
- "4 KE en android 40 male secondary lower_mid \n",
- "\n",
- " completion_seconds attention_passed q_quality q_value q_ease \\\n",
- "0 1800.000000 True 4 3 3 \n",
- "1 431.412783 True 3 2 3 \n",
- "2 1800.000000 True 2 3 3 \n",
- "3 656.050991 True 4 4 1 \n",
- "4 1553.938944 True 2 2 5 \n",
- "\n",
- " q_support nps is_detractor \n",
- "0 3 4 True \n",
- "1 4 4 False \n",
- "2 1 3 False \n",
- "3 3 5 False \n",
- "4 1 5 False "
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"df_llm = generate_llm(CFG, total_rows=100, batch_size=50)\n",
"df_llm.head()"
@@ -1759,89 +847,10 @@
},
{
"cell_type": "code",
- "execution_count": 46,
+ "execution_count": null,
"id": "6d4908ad",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "๐งช Testing improved LLM generation with adaptive batching...\n",
- "\n",
- "๐ฆ Testing small batch (10 rows)...\n",
- "๐ Generating 10 survey responses with LLM...\n",
- "๐ Using max_tokens: 3500 (estimated: 3500)\n",
- "๐ Raw response length: 5233 characters\n",
- "๐ Parsed JSON type: \n",
- "๐ Found data in 'responses': 10 rows\n",
- "โ
Successfully generated 10 survey responses\n",
- "โ
Small batch result: 10 rows\n",
- "\n",
- "๐ฆ Testing medium dataset (30 rows) with adaptive batching...\n",
- "๐ Generating 30 survey responses with adaptive batching\n",
- "๐ Using optimal batch size: 15\n",
- "\n",
- "๐ฆ Processing batch: 15 rows (remaining: 30)\n",
- "๐ Generating 15 survey responses with LLM...\n",
- "๐ Using max_tokens: 5000 (estimated: 5000)\n",
- "๐ Raw response length: 7839 characters\n",
- "๐ Parsed JSON type: \n",
- "๐ Found data in 'responses': 15 rows\n",
- "โ
Successfully generated 15 survey responses\n",
- "\n",
- "๐ฆ Processing batch: 15 rows (remaining: 15)\n",
- "๐ Generating 15 survey responses with LLM...\n",
- "๐ Using max_tokens: 5000 (estimated: 5000)\n",
- "๐ Raw response length: 7841 characters\n",
- "๐ Parsed JSON type: \n",
- "๐ Found data in 'responses': 15 rows\n",
- "โ
Successfully generated 15 survey responses\n",
- "โ
Generated total: 30 survey responses\n",
- "โ
Medium dataset result: 30 rows\n",
- "\n",
- "๐ Dataset shape: (30, 18)\n",
- "\n",
- "๐ First few rows:\n",
- " response_id respondent_id submitted_at \\\n",
- "0 d1e5c4a3-4b1f-4f6b-8f9e-9f1e1f2e3d4c 10001 2023-10-01 14:30:00 \n",
- "1 c2b1d4a6-7f8e-4c5c-9d8f-1e2c3b4a5e6f 10002 2023-10-01 15:00:00 \n",
- "2 e3f2c5b7-8a2d-4c8e-9f1b-2c3d4e5f6a7b 10003 2023-10-01 15:30:00 \n",
- "3 f4a5b6c8-9d3e-4b1f-9f2c-3d4e5f6a7b8c 10004 2023-10-01 16:00:00 \n",
- "4 g5b6c7d9-0e4f-4b2a-8f3d-4e5f6a7b8c9d 10005 2023-10-01 16:30:00 \n",
- "\n",
- " country language device age gender education income_band \\\n",
- "0 KE en android 28 female bachelor upper_mid \n",
- "1 UG sw web 35 male diploma lower_mid \n",
- "2 TZ en ios 42 nonbinary postgraduate high \n",
- "3 RW sw web 29 female secondary upper_mid \n",
- "4 NG en android 50 male bachelor high \n",
- "\n",
- " completion_seconds attention_passed q_quality q_value q_ease \\\n",
- "0 450.0 True 5 4 5 \n",
- "1 600.0 True 3 2 4 \n",
- "2 720.0 True 4 5 4 \n",
- "3 300.0 True 3 3 3 \n",
- "4 540.0 True 5 5 5 \n",
- "\n",
- " q_support nps is_detractor \n",
- "0 4 9 False \n",
- "1 3 5 False \n",
- "2 5 10 False \n",
- "3 4 6 False \n",
- "4 5 10 False \n",
- "๐พ Saved: data\\survey_adaptive_batch_20251023T005927Z.csv\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "C:\\Users\\Joshua\\AppData\\Local\\Temp\\ipykernel_27572\\1770033334.py:22: DeprecationWarning: datetime.datetime.utcnow() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.now(datetime.UTC).\n",
- " ts = datetime.utcnow().strftime(\"%Y%m%dT%H%M%SZ\")\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# Test the improved LLM generation with adaptive batching\n",
"print(\"๐งช Testing improved LLM generation with adaptive batching...\")\n",