diff --git a/week3/community-contributions/ranskills-week3-coherent-data-generator.ipynb b/week3/community-contributions/ranskills-week3-coherent-data-generator.ipynb index 6be1ba4..716fb62 100644 --- a/week3/community-contributions/ranskills-week3-coherent-data-generator.ipynb +++ b/week3/community-contributions/ranskills-week3-coherent-data-generator.ipynb @@ -1,49 +1,30 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [], - "collapsed_sections": [ - "tqSpfJGnme7y" - ], - "gpuType": "T4" - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - }, - "accelerator": "GPU" - }, "cells": [ { "cell_type": "markdown", + "metadata": { + "id": "KbMea_UrO3Ke" + }, "source": [ "# ✨ Coherent Data Generator\n", "\n", - "## In real life, data has meaning, relationships, etc. and this is where this tool shines.\n", + "## In real life, data has meaning, relationships, etc., and this is where this tool shines.\n", "\n", - "Dependencies between fields are detected and a coherent data is generated.\n", + "Dependencies between fields are detected, and coherent data is generated.\n", "Example:\n", - "When asked to generate data with **Ghana** cited as the context, fields like `name`, `food`, etc. will be Ghanaian. Fields such as phone number will have the appropriate prefix of `+233`, etc.\n", + "When asked to generate data with **Ghana** cited as the context, fields like `name`, `food`, etc., will be Ghanaian. 
Fields such as phone number will have the appropriate prefix of `+233`, etc.\n", "\n", "This is better than Faker.\n", "\n", "## Steps\n", "Schema -> Generate Data\n", "\n", - "Schema Sources:\n", + "Schema Sources: \n", "- Use the guided schema builder\n", "- Bring your own schema from an SQL Data Definition Language (DDL)\n", "- Prompting\n", - "- Providing a domain to an old-hat to definition features for a dataset" - ], - "metadata": { - "id": "KbMea_UrO3Ke" - } + "- Providing a domain to an old hat to define features for a dataset" + ] }, { "cell_type": "code", @@ -65,6 +46,11 @@ }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DOBBN3P2GD2O" + }, + "outputs": [], "source": [ "model_id = \"Qwen/Qwen3-4B-Instruct-2507\"\n", "\n", @@ -78,24 +64,24 @@ " dtype=\"auto\",\n", " device_map=\"auto\"\n", ")" - ], - "metadata": { - "id": "DOBBN3P2GD2O" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "## Schema Definitions" - ], "metadata": { "id": "HSUebXa1O3MM" - } + }, + "source": [ + "## Schema Definitions" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5LNM76OQjAw6" + }, + "outputs": [], "source": [ "# This is for future use where errors in SQL DDL statements can be fixed if the\n", "# specifies that from the UI\n", @@ -115,33 +101,33 @@ "class Schema(BaseModel):\n", " name: str = Field(..., description='Name of the schema')\n", " fields: list[FieldDescriptor] = Field(..., description='List of fields in the schema')" - ], - "metadata": { - "id": "5LNM76OQjAw6" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "## LLM Interactions" - ], "metadata": { "id": "6QjitfTBPa1E" - } + }, + "source": [ + "## LLM Interactions" + ] }, { "cell_type": "markdown", - "source": [ - "### Generate Content from LLM" - ], "metadata": { "id": "dXiRHok7Peir" - } + }, + "source": [ + "### Generate Content from LLM" + ] }, { 
"cell_type": "code", + "execution_count": null, + "metadata": { + "id": "daTUVG8_PmvM" + }, + "outputs": [], "source": [ "def generate(messages: list[dict[str, str]], temperature: float = 0.1) -> any:\n", " text = tokenizer.apply_chat_template(\n", @@ -161,24 +147,24 @@ " content = tokenizer.decode(output_ids, skip_special_tokens=True)\n", "\n", " return content" - ], - "metadata": { - "id": "daTUVG8_PmvM" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "### Generate Data Given A Valid Schema" - ], "metadata": { "id": "sBHJKn8qQhM5" - } + }, + "source": [ + "### Generate Data Given A Valid Schema" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Fla8UQf4Qm5l" + }, + "outputs": [], "source": [ "def generate_data(schema: str, context: str = '', num_records: int = 5):\n", " system_prompt = f'''\n", @@ -211,24 +197,24 @@ " ]\n", "\n", " return generate(messages)" - ], - "metadata": { - "id": "Fla8UQf4Qm5l" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "### SQL" - ], "metadata": { "id": "izrClU6VPsZp" - } + }, + "source": [ + "### SQL" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "aQgY6EK0QPPd" + }, + "outputs": [], "source": [ "def sql_validator(ddl: str):\n", " system_prompt = '''\n", @@ -267,26 +253,26 @@ " ]\n", "\n", " return generate(messages)" - ], - "metadata": { - "id": "aQgY6EK0QPPd" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", + "metadata": { + "id": "4mgwDQyDQ1wv" + }, "source": [ "### Data Scientist\n", "\n", - "Just give it a domain and you will be amazed the features will give you." + "Just give it a domain and you will be amazed by the features it will give you."
- ], - "metadata": { - "id": "4mgwDQyDQ1wv" - } + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "P36AMvBq8AST" + }, + "outputs": [], "source": [ "def create_domain_schema(domain: str):\n", " system_prompt = f'''\n", @@ -326,35 +312,35 @@ " created_at TIMESTAMP DEFAULT NOW()\n", ");\n", "'''" - ], - "metadata": { - "id": "P36AMvBq8AST" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "source": [ - "print(f'{model.get_memory_footprint() / 1e9:, .2f} GB')" - ], + "execution_count": null, "metadata": { "id": "QuVyHOhjDtSH" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "print(f'{model.get_memory_footprint() / 1e9:,.2f} GB')" + ] }, { "cell_type": "markdown", - "source": [ - "## Export Functions" - ], "metadata": { "id": "tqSpfJGnme7y" - } + }, + "source": [ + "## Export Functions" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pAu5OPfUmMSm" + }, + "outputs": [], "source": [ "from enum import StrEnum\n", "\n", @@ -451,24 +437,28 @@ " tmp.write(content)\n", " tmp.flush()\n", " return tmp.name" - ], - "metadata": { - "id": "pAu5OPfUmMSm" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "## Gradio UI" - ], "metadata": { "id": "Q0fZsCuso_YZ" - } + }, + "source": [ + "## Gradio UI" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "TJYUWecybDpP", + "outputId": "e82d0a13-3ca3-4a01-d45c-78fc94ade9bc" + }, + "outputs": [], "source": [ "import gradio as gr\n", "from pydantic import BaseModel, Field\n", @@ -718,34 +708,27 @@ " )\n", "\n", "\n", - "ui.launch(debug=True)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "TJYUWecybDpP", - "outputId": "e82d0a13-3ca3-4a01-d45c-78fc94ade9bc" - }, - "execution_count": 10, - "outputs": [ - { - "output_type": "stream", - "name":
"stdout", - "text": [ - "Keyboard interruption in main thread... closing server.\n", - "Killing tunnel 127.0.0.1:7860 <> https://5954eb89d994d7a5ee.gradio.live\n" - ] - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [] - }, - "metadata": {}, - "execution_count": 10 - } + "ui.launch(debug=True)\n" ] } - ] -} \ No newline at end of file + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [ + "tqSpfJGnme7y" + ], + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}