From 0ceb9136d5a2248495e9ecf1619cb9613da0e9b2 Mon Sep 17 00:00:00 2001 From: The Top Dev Date: Thu, 23 Oct 2025 08:48:44 +0300 Subject: [PATCH] Cleared output for the Synthentic survey data generator --- ...3_Exercise_survey_Dataset_Generation.ipynb | 1037 +---------------- 1 file changed, 23 insertions(+), 1014 deletions(-) diff --git a/week3/community-contributions/week3_Exercise_survey_Dataset_Generation.ipynb b/week3/community-contributions/week3_Exercise_survey_Dataset_Generation.ipynb index a11fa96..a4474af 100644 --- a/week3/community-contributions/week3_Exercise_survey_Dataset_Generation.ipynb +++ b/week3/community-contributions/week3_Exercise_survey_Dataset_Generation.ipynb @@ -10,18 +10,10 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "id": "8d86f629", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "โœ… Base libraries ready. Pandera available: True\n" - ] - } - ], + "outputs": [], "source": [ "\n", "import os, re, json, time, uuid, math, random\n", @@ -35,7 +27,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "id": "f196ae73", "metadata": {}, "outputs": [], @@ -94,18 +86,10 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": null, "id": "d16bd03a", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loaded config for 800 rows and 18 fields.\n" - ] - } - ], + "outputs": [], "source": [ "\n", "CFG = {\n", @@ -145,7 +129,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": null, "id": "d2f5fdff", "metadata": {}, "outputs": [], @@ -208,196 +192,10 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": null, "id": "cd61330d", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
response_idrespondent_idsubmitted_atcountrylanguagedeviceagegendereducationincome_bandcompletion_secondsattention_passedq_qualityq_valueq_easeq_supportnpsis_detractor
0f099c1b6-a4ae-4fb0-ba98-89a81008c424716152024-04-13 19:02:44ZAenweb47malesecondarylow897.995012True53134True
1f2e20ad1-1ed1-4e33-8beb-5dd0ba23715b685642024-03-05 23:30:30KEenandroid67femalebachelorlower_mid935.607966True15235False
2a9345f69-be75-46b9-8cd3-a276ce0a66bd596892024-11-10 03:38:07RWswandroid23malebachelorlow1431.517701True52557False
3b4fa8625-d153-4465-ad73-1c4a48eed2f1207422024-11-19 17:40:58KEenios68femalesecondaryupper_mid448.519416True555310False
4e0ad4bbc-b576-4913-8786-302f06b5e9f7634592024-07-28 04:23:37KEenios34malesecondarylow1179.970734True31335False
\n", - "
" - ], - "text/plain": [ - " response_id respondent_id submitted_at \\\n", - "0 f099c1b6-a4ae-4fb0-ba98-89a81008c424 71615 2024-04-13 19:02:44 \n", - "1 f2e20ad1-1ed1-4e33-8beb-5dd0ba23715b 68564 2024-03-05 23:30:30 \n", - "2 a9345f69-be75-46b9-8cd3-a276ce0a66bd 59689 2024-11-10 03:38:07 \n", - "3 b4fa8625-d153-4465-ad73-1c4a48eed2f1 20742 2024-11-19 17:40:58 \n", - "4 e0ad4bbc-b576-4913-8786-302f06b5e9f7 63459 2024-07-28 04:23:37 \n", - "\n", - " country language device age gender education income_band \\\n", - "0 ZA en web 47 male secondary low \n", - "1 KE en android 67 female bachelor lower_mid \n", - "2 RW sw android 23 male bachelor low \n", - "3 KE en ios 68 female secondary upper_mid \n", - "4 KE en ios 34 male secondary low \n", - "\n", - " completion_seconds attention_passed q_quality q_value q_ease \\\n", - "0 897.995012 True 5 3 1 \n", - "1 935.607966 True 1 5 2 \n", - "2 1431.517701 True 5 2 5 \n", - "3 448.519416 True 5 5 5 \n", - "4 1179.970734 True 3 1 3 \n", - "\n", - " q_support nps is_detractor \n", - "0 3 4 True \n", - "1 3 5 False \n", - "2 5 7 False \n", - "3 3 10 False \n", - "4 3 5 False " - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "\n", "def generate_rule_based(CFG: Dict[str, Any]) -> pd.DataFrame:\n", @@ -450,245 +248,10 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": null, "id": "9a4ef86a", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Validation error: {\n", - " \"SCHEMA\": {\n", - " \"WRONG_DATATYPE\": [\n", - " {\n", - " \"schema\": null,\n", - " \"column\": \"respondent_id\",\n", - " \"check\": \"dtype('int64')\",\n", - " \"error\": \"expected series 'respondent_id' to have type int64, got int32\"\n", - " },\n", - " {\n", - " \"schema\": null,\n", - " \"column\": \"age\",\n", - " \"check\": \"dtype('int64')\",\n", - " \"error\": \"expected series 'age' to have type int64, got int32\"\n", - " },\n", - " {\n", - " \"schema\": null,\n", - " \"column\": \"q_quality\",\n", - " \"check\": \"dtype('int64')\",\n", - " \"error\": \"expected series 'q_quality' to have type int64, got int32\"\n", - " },\n", - " {\n", - " \"schema\": null,\n", - " \"column\": \"q_value\",\n", - " \"check\": \"dtype('int64')\",\n", - " \"error\": \"expected series 'q_value' to have type int64, got int32\"\n", - " },\n", - " {\n", - " \"schema\": null,\n", - " \"column\": \"q_ease\",\n", - " \"check\": \"dtype('int64')\",\n", - " \"error\": \"expected series 'q_ease' to have type int64, got int32\"\n", - " },\n", - " {\n", - " \"schema\": null,\n", - " \"column\": \"q_support\",\n", - " \"check\": \"dtype('int64')\",\n", - " \"error\": \"expected series 'q_support' to have type int64, got int32\"\n", - " }\n", - " ]\n", - " }\n", - "}\n", - "{'engine': 'pandera', 'valid_rows': 800, 'invalid_rows': 0, 'notes': 'Non-strict mode.'}\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
response_idrespondent_idsubmitted_atcountrylanguagedeviceagegendereducationincome_bandcompletion_secondsattention_passedq_qualityq_valueq_easeq_supportnpsis_detractor
0f099c1b6-a4ae-4fb0-ba98-89a81008c424716152024-04-13 19:02:44ZAenweb47malesecondarylow897.995012True53134True
1f2e20ad1-1ed1-4e33-8beb-5dd0ba23715b685642024-03-05 23:30:30KEenandroid67femalebachelorlower_mid935.607966True15235False
2a9345f69-be75-46b9-8cd3-a276ce0a66bd596892024-11-10 03:38:07RWswandroid23malebachelorlow1431.517701True52557False
3b4fa8625-d153-4465-ad73-1c4a48eed2f1207422024-11-19 17:40:58KEenios68femalesecondaryupper_mid448.519416True555310False
4e0ad4bbc-b576-4913-8786-302f06b5e9f7634592024-07-28 04:23:37KEenios34malesecondarylow1179.970734True31335False
\n", - "
" - ], - "text/plain": [ - " response_id respondent_id submitted_at \\\n", - "0 f099c1b6-a4ae-4fb0-ba98-89a81008c424 71615 2024-04-13 19:02:44 \n", - "1 f2e20ad1-1ed1-4e33-8beb-5dd0ba23715b 68564 2024-03-05 23:30:30 \n", - "2 a9345f69-be75-46b9-8cd3-a276ce0a66bd 59689 2024-11-10 03:38:07 \n", - "3 b4fa8625-d153-4465-ad73-1c4a48eed2f1 20742 2024-11-19 17:40:58 \n", - "4 e0ad4bbc-b576-4913-8786-302f06b5e9f7 63459 2024-07-28 04:23:37 \n", - "\n", - " country language device age gender education income_band \\\n", - "0 ZA en web 47 male secondary low \n", - "1 KE en android 67 female bachelor lower_mid \n", - "2 RW sw android 23 male bachelor low \n", - "3 KE en ios 68 female secondary upper_mid \n", - "4 KE en ios 34 male secondary low \n", - "\n", - " completion_seconds attention_passed q_quality q_value q_ease \\\n", - "0 897.995012 True 5 3 1 \n", - "1 935.607966 True 1 5 2 \n", - "2 1431.517701 True 5 2 5 \n", - "3 448.519416 True 5 5 5 \n", - "4 1179.970734 True 3 1 3 \n", - "\n", - " q_support nps is_detractor \n", - "0 3 4 True \n", - "1 3 5 False \n", - "2 5 7 False \n", - "3 3 10 False \n", - "4 3 5 False " - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "\n", "def build_pandera_schema(CFG):\n", @@ -732,26 +295,10 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": null, "id": "73626b4c", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Saved: data/survey_rule_20251023T004106Z.csv\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\Joshua\\AppData\\Local\\Temp\\ipykernel_27572\\1233117399.py:3: DeprecationWarning: datetime.datetime.utcnow() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.now(datetime.UTC).\n", - " ts = datetime.utcnow().strftime(\"%Y%m%dT%H%M%SZ\")\n" - ] - } - ], + "outputs": [], "source": [ "\n", "from pathlib import Path\n", @@ -772,7 +319,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": null, "id": "24e94771", "metadata": {}, "outputs": [], @@ -1067,73 +614,7 @@ "execution_count": null, "id": "e1af410e", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "๐Ÿงช Testing LLM generation...\n", - "๐Ÿ”„ Generating 10 survey responses with LLM...\n", - "๐Ÿ“Š Using max_tokens: 3500 (estimated: 3500)\n", - "๐Ÿ“ Raw response length: 5236 characters\n", - "๐Ÿ” Parsed JSON type: \n", - "๐Ÿ“Š Found data in 'responses': 10 rows\n", - "โœ… Successfully generated 10 survey responses\n", - "\n", - "๐Ÿ“Š Generated dataset shape: (10, 18)\n", - "\n", - "๐Ÿ“‹ First few rows:\n", - " response_id respondent_id submitted_at \\\n", - "0 f3e9b9d1-4e9e-4f8a-9b5c-7e3cbb1c4e5e 10234 2023-10-01 14:23:45 \n", - "1 a1c5f6d3-1f5b-4e8a-8c7a-5e2c3f4b8e1b 20456 2023-10-01 15:10:12 \n", - "2 c2b3e4f5-5d6e-4b8a-9f3c-8e1a2f9b4e3c 30567 2023-10-01 16:45:30 \n", - "3 d4e5f6b7-6e8f-4b9a-8c7d-9e2f3c4b5e6f 40678 2023-10-01 17:30:00 \n", - "4 e5f6a7b8-7f9a-4c0a-9e2f-1e3c4b5e6f7a 50789 2023-10-01 18:15:15 \n", - "\n", - " country language device age gender education income_band \\\n", - "0 KE en android 29 female bachelor upper_mid \n", - "1 UG sw web 34 male secondary lower_mid \n", - "2 TZ en ios 42 nonbinary diploma high \n", - "3 RW sw android 27 female bachelor upper_mid \n", - "4 NG en web 36 male postgraduate high \n", - "\n", - " completion_seconds attention_passed q_quality q_value q_ease \\\n", - "0 450.0 True 4 5 4 \n", - "1 600.5 True 3 4 3 \n", - "2 720.0 True 5 5 5 \n", - "3 390.0 True 4 4 4 \n", - "4 800.0 True 5 5 5 \n", - "\n", - " q_support nps is_detractor \n", - "0 5 9 False \n", - "1 4 7 False \n", - "2 5 10 False \n", - "3 4 8 False \n", - "4 5 9 False \n", - "\n", - "๐Ÿ“ˆ Data types:\n", - "response_id object\n", - "respondent_id int64\n", - "submitted_at object\n", - "country object\n", - "language object\n", - "device object\n", - "age int64\n", - "gender object\n", - "education object\n", - "income_band object\n", - "completion_seconds float64\n", - "attention_passed bool\n", - "q_quality int64\n", - "q_value int64\n", - "q_ease int64\n", - "q_support int64\n", - "nps int64\n", - "is_detractor bool\n", - "dtype: object\n" - ] - } - ], + "outputs": [], "source": [ "# Test the fixed LLM generation\n", "print(\"๐Ÿงช Testing LLM generation...\")\n", @@ -1196,79 +677,10 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": null, "id": "75c90739", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "๐Ÿงช Testing the fixed LLM generation...\n", - "๐Ÿ”„ Generating 5 survey responses with LLM...\n", - "๐Ÿ“Š Using max_tokens: 2000 (estimated: 2000)\n", - "๐Ÿ“ Raw response length: 2629 characters\n", - "๐Ÿ” Parsed JSON type: \n", - "๐Ÿ“Š Found data in 'responses': 5 rows\n", - "โœ… Successfully generated 5 survey responses\n", - "\n", - "๐Ÿ“Š Generated dataset shape: (5, 18)\n", - "\n", - "๐Ÿ“‹ First few rows:\n", - " response_id respondent_id submitted_at \\\n", - "0 d8b1c6f3-6f7a-4b4f-9c5f-3a5f8b6e2f1e 12345 2023-10-01 14:30:00 \n", - "1 f3a8e3c1-9b4e-4e5e-9c2b-8f5e3c9b1f3d 67890 2023-10-01 15:00:00 \n", - "2 c9c8e3f1-2b4f-4a6c-8c2e-2a5f3c8e1f2b 54321 2023-10-01 16:15:00 \n", - "3 a5b3c6d2-1e4f-4c5e-9a1f-1f6a7b8e3c9f 98765 2023-10-01 17:45:00 \n", - "4 b8f4c3e2-2e4f-4c5e-8a2f-4c5e3b8e2f1a 13579 2023-10-01 18:30:00 \n", - "\n", - " country language device age gender education income_band \\\n", - "0 KE en android 29 female bachelor upper_mid \n", - "1 UG sw web 34 male diploma lower_mid \n", - "2 TZ en ios 42 nonbinary postgraduate high \n", - "3 RW sw android 27 female secondary low \n", - "4 NG en web 55 male bachelor upper_mid \n", - "\n", - " completion_seconds attention_passed q_quality q_value q_ease \\\n", - "0 420.0 True 5 4 4 \n", - "1 600.0 True 3 3 2 \n", - "2 300.5 True 4 5 4 \n", - "3 720.0 False 2 3 3 \n", - "4 540.0 True 5 5 5 \n", - "\n", - " q_support nps is_detractor \n", - "0 5 9 False \n", - "1 4 5 False \n", - "2 5 10 False \n", - "3 2 3 True \n", - "4 5 8 False \n", - "\n", - "๐Ÿ“ˆ Data types:\n", - "response_id object\n", - "respondent_id int64\n", - "submitted_at object\n", - "country object\n", - "language object\n", - "device object\n", - "age int64\n", - "gender object\n", - "education object\n", - "income_band object\n", - "completion_seconds float64\n", - "attention_passed bool\n", - "q_quality int64\n", - "q_value int64\n", - "q_ease int64\n", - "q_support int64\n", - "nps int64\n", - "is_detractor bool\n", - "dtype: object\n", - "\n", - "โœ… SUCCESS! LLM generation is now working!\n", - "๐Ÿ“Š Generated 5 survey responses using LLM\n" - ] - } - ], + "outputs": [], "source": [ "# Test the fixed implementation\n", "print(\"๐Ÿงช Testing the fixed LLM generation...\")\n", @@ -1290,133 +702,10 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": null, "id": "dd83b842", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "๐Ÿš€ Testing larger dataset generation...\n", - "๐Ÿš€ Generating 100 survey responses with adaptive batching\n", - "๐Ÿ“Š Using optimal batch size: 10\n", - "\n", - "๐Ÿ“ฆ Processing batch: 10 rows (remaining: 100)\n", - "๐Ÿ”„ Generating 10 survey responses with LLM...\n", - "๐Ÿ“Š Using max_tokens: 3500 (estimated: 3500)\n", - "๐Ÿ“ Raw response length: 5238 characters\n", - "๐Ÿ” Parsed JSON type: \n", - "๐Ÿ“Š Found data in 'responses': 10 rows\n", - "โœ… Successfully generated 10 survey responses\n", - "\n", - "๐Ÿ“ฆ Processing batch: 10 rows (remaining: 90)\n", - "๐Ÿ”„ Generating 10 survey responses with LLM...\n", - "๐Ÿ“Š Using max_tokens: 3500 (estimated: 3500)\n", - "๐Ÿ“ Raw response length: 5235 characters\n", - "๐Ÿ” Parsed JSON type: \n", - "๐Ÿ“Š Found data in 'responses': 10 rows\n", - "โœ… Successfully generated 10 survey responses\n", - "\n", - "๐Ÿ“ฆ Processing batch: 10 rows (remaining: 80)\n", - "๐Ÿ”„ Generating 10 survey responses with LLM...\n", - "๐Ÿ“Š Using max_tokens: 3500 (estimated: 3500)\n", - "๐Ÿ“ Raw response length: 5232 characters\n", - "๐Ÿ” Parsed JSON type: \n", - "๐Ÿ“Š Found data in 'responses': 10 rows\n", - "โœ… Successfully generated 10 survey responses\n", - "\n", - "๐Ÿ“ฆ Processing batch: 10 rows (remaining: 70)\n", - "๐Ÿ”„ Generating 10 survey responses with LLM...\n", - "๐Ÿ“Š Using max_tokens: 3500 (estimated: 3500)\n", - "๐Ÿ“ Raw response length: 5239 characters\n", - "๐Ÿ” Parsed JSON type: \n", - "๐Ÿ“Š Found data in 'responses': 10 rows\n", - "โœ… Successfully generated 10 survey responses\n", - "\n", - "๐Ÿ“ฆ Processing batch: 10 rows (remaining: 60)\n", - "๐Ÿ”„ Generating 10 survey responses with LLM...\n", - "๐Ÿ“Š Using max_tokens: 3500 (estimated: 3500)\n", - "๐Ÿ“ Raw response length: 5238 characters\n", - "๐Ÿ” Parsed JSON type: \n", - "๐Ÿ“Š Found data in 'responses': 10 rows\n", - "โœ… Successfully generated 10 survey responses\n", - "\n", - "๐Ÿ“ฆ Processing batch: 10 rows (remaining: 50)\n", - "๐Ÿ”„ Generating 10 survey responses with LLM...\n", - "๐Ÿ“Š Using max_tokens: 3500 (estimated: 3500)\n", - "๐Ÿ“ Raw response length: 5236 characters\n", - "๐Ÿ” Parsed JSON type: \n", - "๐Ÿ“Š Found data in 'responses': 10 rows\n", - "โœ… Successfully generated 10 survey responses\n", - "\n", - "๐Ÿ“ฆ Processing batch: 10 rows (remaining: 40)\n", - "๐Ÿ”„ Generating 10 survey responses with LLM...\n", - "๐Ÿ“Š Using max_tokens: 3500 (estimated: 3500)\n", - "๐Ÿ“ Raw response length: 5229 characters\n", - "๐Ÿ” Parsed JSON type: \n", - "๐Ÿ“Š Found data in 'responses': 10 rows\n", - "โœ… Successfully generated 10 survey responses\n", - "\n", - "๐Ÿ“ฆ Processing batch: 10 rows (remaining: 30)\n", - "๐Ÿ”„ Generating 10 survey responses with LLM...\n", - "๐Ÿ“Š Using max_tokens: 3500 (estimated: 3500)\n", - "๐Ÿ“ Raw response length: 5244 characters\n", - "๐Ÿ” Parsed JSON type: \n", - "๐Ÿ“Š Found data in 'responses': 10 rows\n", - "โœ… Successfully generated 10 survey responses\n", - "\n", - "๐Ÿ“ฆ Processing batch: 10 rows (remaining: 20)\n", - "๐Ÿ”„ Generating 10 survey responses with LLM...\n", - "๐Ÿ“Š Using max_tokens: 3500 (estimated: 3500)\n", - "๐Ÿ“ Raw response length: 5234 characters\n", - "๐Ÿ” Parsed JSON type: \n", - "๐Ÿ“Š Found data in 'responses': 10 rows\n", - "โœ… Successfully generated 10 survey responses\n", - "\n", - "๐Ÿ“ฆ Processing batch: 10 rows (remaining: 10)\n", - "๐Ÿ”„ Generating 10 survey responses with LLM...\n", - "๐Ÿ“Š Using max_tokens: 3500 (estimated: 3500)\n", - "๐Ÿ“ Raw response length: 5238 characters\n", - "๐Ÿ” Parsed JSON type: \n", - "๐Ÿ“Š Found data in 'responses': 10 rows\n", - "โœ… Successfully generated 10 survey responses\n", - "โœ… Generated total: 100 survey responses\n", - "\n", - "๐Ÿ“Š Large dataset shape: (100, 18)\n", - "\n", - "๐Ÿ“ˆ Summary statistics:\n", - " respondent_id age completion_seconds q_quality q_value \\\n", - "count 100.000000 100.000000 100.000000 100.000000 100.000000 \n", - "mean 33513.700000 34.070000 588.525000 3.740000 3.910000 \n", - "std 29233.800863 7.835757 230.530212 1.001211 0.995901 \n", - "min 10001.000000 22.000000 120.500000 2.000000 2.000000 \n", - "25% 10009.000000 28.000000 420.375000 3.000000 3.000000 \n", - "50% 15122.500000 33.000000 600.000000 4.000000 4.000000 \n", - "75% 55955.750000 39.250000 720.000000 5.000000 5.000000 \n", - "max 98765.000000 50.000000 1500.000000 5.000000 5.000000 \n", - "\n", - " q_ease q_support nps \n", - "count 100.000000 100.000000 100.000000 \n", - "mean 3.900000 3.910000 6.990000 \n", - "std 0.937437 0.985706 2.333312 \n", - "min 2.000000 2.000000 2.000000 \n", - "25% 3.000000 3.000000 5.000000 \n", - "50% 4.000000 4.000000 7.000000 \n", - "75% 5.000000 5.000000 9.000000 \n", - "max 5.000000 5.000000 10.000000 \n", - "๐Ÿ’พ Saved: data\\survey_llm_fixed_20251023T005139Z.csv\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\Joshua\\AppData\\Local\\Temp\\ipykernel_27572\\2716383900.py:12: DeprecationWarning: datetime.datetime.utcnow() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.now(datetime.UTC).\n", - " ts = datetime.utcnow().strftime(\"%Y%m%dT%H%M%SZ\")\n" - ] - } - ], + "outputs": [], "source": [ "#Test larger dataset generation \n", "print(\"๐Ÿš€ Testing larger dataset generation...\")\n", @@ -1440,15 +729,7 @@ "execution_count": null, "id": "6029d3e2", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "LLM available: True\n" - ] - } - ], + "outputs": [], "source": [ "\n", "def build_json_schema(CFG):\n", @@ -1555,203 +836,10 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "2e759087", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "LLM error, fallback to rule-based mock: No JSON array found in model output.\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
response_idrespondent_idsubmitted_atcountrylanguagedeviceagegendereducationincome_bandcompletion_secondsattention_passedq_qualityq_valueq_easeq_supportnpsis_detractor
09e7811bd-27ee-4b7c-9b7a-c98441e337f0401602024-08-18 19:10:06KEswweb28malesecondarylower_mid1800.000000True43334True
185ec8b90-5468-4880-8309-e325da14d877553812025-01-24 12:21:13TZswios23femalebachelorhigh431.412783True32344False
2498dff10-040f-4206-8170-dfce0d5a69f0483382025-07-15 22:21:54TZenios49malebachelorlow1800.000000True23313False
3ddf11d94-5d6e-4322-9811-4e763f5ed46b599252025-01-27 00:16:57KEenweb22malebachelorupper_mid656.050991True44135False
42ef22a0c-fd13-4798-9276-f43831b8f7bc689932024-08-19 04:21:49KEenandroid40malesecondarylower_mid1553.938944True22515False
\n", - "
" - ], - "text/plain": [ - " response_id respondent_id submitted_at \\\n", - "0 9e7811bd-27ee-4b7c-9b7a-c98441e337f0 40160 2024-08-18 19:10:06 \n", - "1 85ec8b90-5468-4880-8309-e325da14d877 55381 2025-01-24 12:21:13 \n", - "2 498dff10-040f-4206-8170-dfce0d5a69f0 48338 2025-07-15 22:21:54 \n", - "3 ddf11d94-5d6e-4322-9811-4e763f5ed46b 59925 2025-01-27 00:16:57 \n", - "4 2ef22a0c-fd13-4798-9276-f43831b8f7bc 68993 2024-08-19 04:21:49 \n", - "\n", - " country language device age gender education income_band \\\n", - "0 KE sw web 28 male secondary lower_mid \n", - "1 TZ sw ios 23 female bachelor high \n", - "2 TZ en ios 49 male bachelor low \n", - "3 KE en web 22 male bachelor upper_mid \n", - "4 KE en android 40 male secondary lower_mid \n", - "\n", - " completion_seconds attention_passed q_quality q_value q_ease \\\n", - "0 1800.000000 True 4 3 3 \n", - "1 431.412783 True 3 2 3 \n", - "2 1800.000000 True 2 3 3 \n", - "3 656.050991 True 4 4 1 \n", - "4 1553.938944 True 2 2 5 \n", - "\n", - " q_support nps is_detractor \n", - "0 3 4 True \n", - "1 4 4 False \n", - "2 1 3 False \n", - "3 3 5 False \n", - "4 1 5 False " - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df_llm = generate_llm(CFG, total_rows=100, batch_size=50)\n", "df_llm.head()" @@ -1759,89 +847,10 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": null, "id": "6d4908ad", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "๐Ÿงช Testing improved LLM generation with adaptive batching...\n", - "\n", - "๐Ÿ“ฆ Testing small batch (10 rows)...\n", - "๐Ÿ”„ Generating 10 survey responses with LLM...\n", - "๐Ÿ“Š Using max_tokens: 3500 (estimated: 3500)\n", - "๐Ÿ“ Raw response length: 5233 characters\n", - "๐Ÿ” Parsed JSON type: \n", - "๐Ÿ“Š Found data in 'responses': 10 rows\n", - "โœ… Successfully generated 10 survey responses\n", - "โœ… Small batch result: 10 rows\n", - "\n", - "๐Ÿ“ฆ Testing medium dataset (30 rows) with adaptive batching...\n", - "๐Ÿš€ Generating 30 survey responses with adaptive batching\n", - "๐Ÿ“Š Using optimal batch size: 15\n", - "\n", - "๐Ÿ“ฆ Processing batch: 15 rows (remaining: 30)\n", - "๐Ÿ”„ Generating 15 survey responses with LLM...\n", - "๐Ÿ“Š Using max_tokens: 5000 (estimated: 5000)\n", - "๐Ÿ“ Raw response length: 7839 characters\n", - "๐Ÿ” Parsed JSON type: \n", - "๐Ÿ“Š Found data in 'responses': 15 rows\n", - "โœ… Successfully generated 15 survey responses\n", - "\n", - "๐Ÿ“ฆ Processing batch: 15 rows (remaining: 15)\n", - "๐Ÿ”„ Generating 15 survey responses with LLM...\n", - "๐Ÿ“Š Using max_tokens: 5000 (estimated: 5000)\n", - "๐Ÿ“ Raw response length: 7841 characters\n", - "๐Ÿ” Parsed JSON type: \n", - "๐Ÿ“Š Found data in 'responses': 15 rows\n", - "โœ… Successfully generated 15 survey responses\n", - "โœ… Generated total: 30 survey responses\n", - "โœ… Medium dataset result: 30 rows\n", - "\n", - "๐Ÿ“Š Dataset shape: (30, 18)\n", - "\n", - "๐Ÿ“‹ First few rows:\n", - " response_id respondent_id submitted_at \\\n", - "0 d1e5c4a3-4b1f-4f6b-8f9e-9f1e1f2e3d4c 10001 2023-10-01 14:30:00 \n", - "1 c2b1d4a6-7f8e-4c5c-9d8f-1e2c3b4a5e6f 10002 2023-10-01 15:00:00 \n", - "2 e3f2c5b7-8a2d-4c8e-9f1b-2c3d4e5f6a7b 10003 2023-10-01 15:30:00 \n", - "3 f4a5b6c8-9d3e-4b1f-9f2c-3d4e5f6a7b8c 10004 2023-10-01 16:00:00 \n", - "4 g5b6c7d9-0e4f-4b2a-8f3d-4e5f6a7b8c9d 10005 2023-10-01 16:30:00 \n", - "\n", - " country language device age gender education income_band \\\n", - "0 KE en android 28 female bachelor upper_mid \n", - "1 UG sw web 35 male diploma lower_mid \n", - "2 TZ en ios 42 nonbinary postgraduate high \n", - "3 RW sw web 29 female secondary upper_mid \n", - "4 NG en android 50 male bachelor high \n", - "\n", - " completion_seconds attention_passed q_quality q_value q_ease \\\n", - "0 450.0 True 5 4 5 \n", - "1 600.0 True 3 2 4 \n", - "2 720.0 True 4 5 4 \n", - "3 300.0 True 3 3 3 \n", - "4 540.0 True 5 5 5 \n", - "\n", - " q_support nps is_detractor \n", - "0 4 9 False \n", - "1 3 5 False \n", - "2 5 10 False \n", - "3 4 6 False \n", - "4 5 10 False \n", - "๐Ÿ’พ Saved: data\\survey_adaptive_batch_20251023T005927Z.csv\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\Joshua\\AppData\\Local\\Temp\\ipykernel_27572\\1770033334.py:22: DeprecationWarning: datetime.datetime.utcnow() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.now(datetime.UTC).\n", - " ts = datetime.utcnow().strftime(\"%Y%m%dT%H%M%SZ\")\n" - ] - } - ], + "outputs": [], "source": [ "# Test the improved LLM generation with adaptive batching\n", "print(\"๐Ÿงช Testing improved LLM generation with adaptive batching...\")\n",