minor update on local machine

2025-10-27 11:42:28 +00:00
parent 7ad790be3d
commit e8dc9bd396
1 changed files with 119 additions and 136 deletions
--- a/week3/community-contributions/ranskills-week3-coherent-data-generator.ipynb
+++ b/week3/community-contributions/ranskills-week3-coherent-data-generator.ipynb
@@ -1,49 +1,30 @@
 {
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "collapsed_sections": [
        "tqSpfJGnme7y"
      ],
      "gpuType": "T4"
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "KbMea_UrO3Ke"
      },
      "source": [
        "# ✨ Coherent Data Generator\n",
        "\n",
-        "## In real life, data has meaning, relationships, etc. and this is where this tool shines.\n",
+        "## In real life, data has meaning, relationships, etc., and this is where this tool shines.\n",
        "\n",
-        "Dependencies between fields are detected and a coherent data is generated.\n",
+        "Dependencies between fields are detected, and coherent data is generated.\n",
        "Example:\n",
-        "When asked to generate data with **Ghana** cited as the context, fields like `name`, `food`, etc. will be Ghanaian. Fields such as phone number will have the appropriate prefix of `+233`, etc.\n",
+        "When asked to generate data with **Ghana** cited as the context, fields like `name`, `food`, etc., will be Ghanaian. Fields such as phone number will have the appropriate prefix of `+233`, etc.\n",
        "\n",
        "This is better than Faker.\n",
        "\n",
        "## Steps\n",
        "Schema -> Generate Data\n",
        "\n",
-        "Schema Sources:\n",
+        "Schema Sources: \n",
        "- Use the guided schema builder\n",
        "- Bring your own schema from an SQL Data Definition Language (DDL)\n",
        "- Prompting\n",
-        "- Providing a domain to an old-hat to definition features for a dataset"
+        "- Providing a domain to an old hat to define features for a dataset"
-      ],
+      ]
      "metadata": {
        "id": "KbMea_UrO3Ke"
      }
    },
    {
      "cell_type": "code",
@@ -65,6 +46,11 @@
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "DOBBN3P2GD2O"
      },
      "outputs": [],
      "source": [
        "model_id = \"Qwen/Qwen3-4B-Instruct-2507\"\n",
        "\n",
@@ -78,24 +64,24 @@
        "    dtype=\"auto\",\n",
        "    device_map=\"auto\"\n",
        ")"
-      ],
+      ]
      "metadata": {
        "id": "DOBBN3P2GD2O"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Schema Definitions"
      ],
      "metadata": {
        "id": "HSUebXa1O3MM"
-      }
+      },
      "source": [
        "## Schema Definitions"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "5LNM76OQjAw6"
       },
       "outputs": [],
       "source": [
         "# This is for future use where errors in SQL DDL statements can be fixed if the\n",
         "# user specifies that from the UI\n",
@@ -115,33 +101,33 @@
        "class Schema(BaseModel):\n",
        "    name: str = Field(..., description='Name of the schema')\n",
        "    fields: list[FieldDescriptor] = Field(..., description='List of fields in the schema')"
-      ],
+      ]
      "metadata": {
        "id": "5LNM76OQjAw6"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## LLM Interactions"
      ],
      "metadata": {
        "id": "6QjitfTBPa1E"
-      }
+      },
      "source": [
        "## LLM Interactions"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Generate Content from LLM"
      ],
      "metadata": {
        "id": "dXiRHok7Peir"
-      }
+      },
      "source": [
        "### Generate Content from LLM"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "daTUVG8_PmvM"
      },
      "outputs": [],
      "source": [
        "def generate(messages: list[dict[str, str]], temperature: float = 0.1) -> any:\n",
        "  text = tokenizer.apply_chat_template(\n",
@@ -161,24 +147,24 @@
        "  content = tokenizer.decode(output_ids, skip_special_tokens=True)\n",
        "\n",
        "  return content"
-      ],
+      ]
      "metadata": {
        "id": "daTUVG8_PmvM"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Generate Data Given A Valid Schema"
      ],
      "metadata": {
        "id": "sBHJKn8qQhM5"
-      }
+      },
      "source": [
        "### Generate Data Given A Valid Schema"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Fla8UQf4Qm5l"
      },
      "outputs": [],
      "source": [
        "def generate_data(schema: str, context: str = '', num_records: int = 5):\n",
        "  system_prompt = f'''\n",
@@ -211,24 +197,24 @@
        "  ]\n",
        "\n",
        "  return generate(messages)"
-      ],
+      ]
      "metadata": {
        "id": "Fla8UQf4Qm5l"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### SQL"
      ],
      "metadata": {
        "id": "izrClU6VPsZp"
-      }
+      },
      "source": [
        "### SQL"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "aQgY6EK0QPPd"
      },
      "outputs": [],
      "source": [
        "def sql_validator(ddl: str):\n",
        "  system_prompt = '''\n",
@@ -267,26 +253,26 @@
        "  ]\n",
        "\n",
        "  return generate(messages)"
-      ],
+      ]
      "metadata": {
        "id": "aQgY6EK0QPPd"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "4mgwDQyDQ1wv"
       },
       "source": [
         "### Data Scientist\n",
         "\n",
         "Just give it a domain and you will be amazed at the features it will give you."
-      ],
+      ]
      "metadata": {
        "id": "4mgwDQyDQ1wv"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "P36AMvBq8AST"
      },
      "outputs": [],
      "source": [
        "def create_domain_schema(domain: str):\n",
        "  system_prompt = f'''\n",
@@ -326,35 +312,35 @@
        "    created_at TIMESTAMP DEFAULT NOW()\n",
        ");\n",
        "'''"
-      ],
+      ]
      "metadata": {
        "id": "P36AMvBq8AST"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
-      "source": [
+      "execution_count": null,
         "print(f'{model.get_memory_footprint() / 1e9:,.2f} GB')"
      ],
      "metadata": {
        "id": "QuVyHOhjDtSH"
      },
-      "execution_count": null,
+      "outputs": [],
-      "outputs": []
+      "source": [
         "print(f'{model.get_memory_footprint() / 1e9:,.2f} GB')"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Export Functions"
      ],
      "metadata": {
        "id": "tqSpfJGnme7y"
-      }
+      },
      "source": [
        "## Export Functions"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "pAu5OPfUmMSm"
      },
      "outputs": [],
      "source": [
        "from enum import StrEnum\n",
        "\n",
@@ -451,24 +437,28 @@
        "      tmp.write(content)\n",
        "      tmp.flush()\n",
        "      return tmp.name"
-      ],
+      ]
      "metadata": {
        "id": "pAu5OPfUmMSm"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Gradio UI"
      ],
      "metadata": {
        "id": "Q0fZsCuso_YZ"
-      }
+      },
      "source": [
        "## Gradio UI"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "TJYUWecybDpP",
        "outputId": "e82d0a13-3ca3-4a01-d45c-78fc94ade9bc"
      },
      "outputs": [],
      "source": [
        "import gradio as gr\n",
        "from pydantic import BaseModel, Field\n",
@@ -718,34 +708,27 @@
        "    )\n",
        "\n",
        "\n",
-        "ui.launch(debug=True)"
+        "ui.launch(debug=True)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "TJYUWecybDpP",
        "outputId": "e82d0a13-3ca3-4a01-d45c-78fc94ade9bc"
      },
      "execution_count": 10,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Keyboard interruption in main thread... closing server.\n",
            "Killing tunnel 127.0.0.1:7860 <> https://5954eb89d994d7a5ee.gradio.live\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": []
          },
          "metadata": {},
          "execution_count": 10
        }
      ]
    }
-  ]
+  ],
-}
+  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "collapsed_sections": [
        "tqSpfJGnme7y"
      ],
      "gpuType": "T4",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
 }