minor update on local machine
This commit is contained in:
@@ -1,49 +1,30 @@
|
|||||||
{
|
{
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 0,
|
|
||||||
"metadata": {
|
|
||||||
"colab": {
|
|
||||||
"provenance": [],
|
|
||||||
"collapsed_sections": [
|
|
||||||
"tqSpfJGnme7y"
|
|
||||||
],
|
|
||||||
"gpuType": "T4"
|
|
||||||
},
|
|
||||||
"kernelspec": {
|
|
||||||
"name": "python3",
|
|
||||||
"display_name": "Python 3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"name": "python"
|
|
||||||
},
|
|
||||||
"accelerator": "GPU"
|
|
||||||
},
|
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"id": "KbMea_UrO3Ke"
|
||||||
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"# ✨ Coherent Data Generator\n",
|
"# ✨ Coherent Data Generator\n",
|
||||||
"\n",
|
"\n",
|
||||||
"## In real life, data has meaning, relationships, etc. and this is where this tool shines.\n",
|
"## In real life, data has meaning, relationships, etc., and this is where this tool shines.\n",
|
||||||
"\n",
|
"\n",
|
||||||
"Dependencies between fields are detected and a coherent data is generated.\n",
|
"Dependencies between fields are detected, and coherent data is generated.\n",
|
||||||
"Example:\n",
|
"Example:\n",
|
||||||
"When asked to generate data with **Ghana** cited as the context, fields like `name`, `food`, etc. will be Ghanaian. Fields such as phone number will have the appropriate prefix of `+233`, etc.\n",
|
"When asked to generate data with **Ghana** cited as the context, fields like `name`, `food`, etc., will be Ghanaian. Fields such as phone number will have the appropriate prefix of `+233`, etc.\n",
|
||||||
"\n",
|
"\n",
|
||||||
"This is better than Faker.\n",
|
"This is better than Faker.\n",
|
||||||
"\n",
|
"\n",
|
||||||
"## Steps\n",
|
"## Steps\n",
|
||||||
"Schema -> Generate Data\n",
|
"Schema -> Generate Data\n",
|
||||||
"\n",
|
"\n",
|
||||||
"Schema Sources:\n",
|
"Schema Sources:\n",
|
||||||
"- Use the guided schema builder\n",
|
"- Use the guided schema builder\n",
|
||||||
"- Bring your own schema from an SQL Data Definition Language (DDL)\n",
|
"- Bring your own schema from an SQL Data Definition Language (DDL)\n",
|
||||||
"- Prompting\n",
|
"- Prompting\n",
|
||||||
"- Providing a domain to an old-hat to definition features for a dataset"
|
"- Providing a domain to an old hand to define features for a dataset"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "KbMea_UrO3Ke"
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
@@ -65,6 +46,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "DOBBN3P2GD2O"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"model_id = \"Qwen/Qwen3-4B-Instruct-2507\"\n",
|
"model_id = \"Qwen/Qwen3-4B-Instruct-2507\"\n",
|
||||||
"\n",
|
"\n",
|
||||||
@@ -78,24 +64,24 @@
|
|||||||
" dtype=\"auto\",\n",
|
" dtype=\"auto\",\n",
|
||||||
" device_map=\"auto\"\n",
|
" device_map=\"auto\"\n",
|
||||||
")"
|
")"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "DOBBN3P2GD2O"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"source": [
|
|
||||||
"## Schema Definitions"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "HSUebXa1O3MM"
|
"id": "HSUebXa1O3MM"
|
||||||
}
|
},
|
||||||
|
"source": [
|
||||||
|
"## Schema Definitions"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "5LNM76OQjAw6"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# This is for future use where errors in SQL DDL statements can be fixed if the\n",
|
"# This is for future use where errors in SQL DDL statements can be fixed if the\n",
|
||||||
"# specifies that from the UI\n",
|
"# user specifies that from the UI\n",
|
||||||
@@ -115,33 +101,33 @@
|
|||||||
"class Schema(BaseModel):\n",
|
"class Schema(BaseModel):\n",
|
||||||
" name: str = Field(..., description='Name of the schema')\n",
|
" name: str = Field(..., description='Name of the schema')\n",
|
||||||
" fields: list[FieldDescriptor] = Field(..., description='List of fields in the schema')"
|
" fields: list[FieldDescriptor] = Field(..., description='List of fields in the schema')"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "5LNM76OQjAw6"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"source": [
|
|
||||||
"## LLM Interactions"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "6QjitfTBPa1E"
|
"id": "6QjitfTBPa1E"
|
||||||
}
|
},
|
||||||
|
"source": [
|
||||||
|
"## LLM Interactions"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"source": [
|
|
||||||
"### Generate Content from LLM"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "dXiRHok7Peir"
|
"id": "dXiRHok7Peir"
|
||||||
}
|
},
|
||||||
|
"source": [
|
||||||
|
"### Generate Content from LLM"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "daTUVG8_PmvM"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def generate(messages: list[dict[str, str]], temperature: float = 0.1) -> any:\n",
|
"def generate(messages: list[dict[str, str]], temperature: float = 0.1) -> any:\n",
|
||||||
" text = tokenizer.apply_chat_template(\n",
|
" text = tokenizer.apply_chat_template(\n",
|
||||||
@@ -161,24 +147,24 @@
|
|||||||
" content = tokenizer.decode(output_ids, skip_special_tokens=True)\n",
|
" content = tokenizer.decode(output_ids, skip_special_tokens=True)\n",
|
||||||
"\n",
|
"\n",
|
||||||
" return content"
|
" return content"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "daTUVG8_PmvM"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"source": [
|
|
||||||
"### Generate Data Given A Valid Schema"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "sBHJKn8qQhM5"
|
"id": "sBHJKn8qQhM5"
|
||||||
}
|
},
|
||||||
|
"source": [
|
||||||
|
"### Generate Data Given A Valid Schema"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "Fla8UQf4Qm5l"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def generate_data(schema: str, context: str = '', num_records: int = 5):\n",
|
"def generate_data(schema: str, context: str = '', num_records: int = 5):\n",
|
||||||
" system_prompt = f'''\n",
|
" system_prompt = f'''\n",
|
||||||
@@ -211,24 +197,24 @@
|
|||||||
" ]\n",
|
" ]\n",
|
||||||
"\n",
|
"\n",
|
||||||
" return generate(messages)"
|
" return generate(messages)"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "Fla8UQf4Qm5l"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"source": [
|
|
||||||
"### SQL"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "izrClU6VPsZp"
|
"id": "izrClU6VPsZp"
|
||||||
}
|
},
|
||||||
|
"source": [
|
||||||
|
"### SQL"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "aQgY6EK0QPPd"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def sql_validator(ddl: str):\n",
|
"def sql_validator(ddl: str):\n",
|
||||||
" system_prompt = '''\n",
|
" system_prompt = '''\n",
|
||||||
@@ -267,26 +253,26 @@
|
|||||||
" ]\n",
|
" ]\n",
|
||||||
"\n",
|
"\n",
|
||||||
" return generate(messages)"
|
" return generate(messages)"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "aQgY6EK0QPPd"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"id": "4mgwDQyDQ1wv"
|
||||||
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"### Data Scientist\n",
|
"### Data Scientist\n",
|
||||||
"\n",
|
"\n",
|
||||||
"Just give it a domain and you will be amazed the features will give you."
|
"Just give it a domain and you will be amazed at the features it will give you."
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "4mgwDQyDQ1wv"
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "P36AMvBq8AST"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def create_domain_schema(domain: str):\n",
|
"def create_domain_schema(domain: str):\n",
|
||||||
" system_prompt = f'''\n",
|
" system_prompt = f'''\n",
|
||||||
@@ -326,35 +312,35 @@
|
|||||||
" created_at TIMESTAMP DEFAULT NOW()\n",
|
" created_at TIMESTAMP DEFAULT NOW()\n",
|
||||||
");\n",
|
");\n",
|
||||||
"'''"
|
"'''"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "P36AMvBq8AST"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"source": [
|
"execution_count": null,
|
||||||
"print(f'{model.get_memory_footprint() / 1e9:, .2f} GB')"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "QuVyHOhjDtSH"
|
"id": "QuVyHOhjDtSH"
|
||||||
},
|
},
|
||||||
"execution_count": null,
|
"outputs": [],
|
||||||
"outputs": []
|
"source": [
|
||||||
|
"print(f'{model.get_memory_footprint() / 1e9:,.2f} GB')"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"source": [
|
|
||||||
"## Export Functions"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "tqSpfJGnme7y"
|
"id": "tqSpfJGnme7y"
|
||||||
}
|
},
|
||||||
|
"source": [
|
||||||
|
"## Export Functions"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "pAu5OPfUmMSm"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from enum import StrEnum\n",
|
"from enum import StrEnum\n",
|
||||||
"\n",
|
"\n",
|
||||||
@@ -451,24 +437,28 @@
|
|||||||
" tmp.write(content)\n",
|
" tmp.write(content)\n",
|
||||||
" tmp.flush()\n",
|
" tmp.flush()\n",
|
||||||
" return tmp.name"
|
" return tmp.name"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "pAu5OPfUmMSm"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"source": [
|
|
||||||
"## Gradio UI"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "Q0fZsCuso_YZ"
|
"id": "Q0fZsCuso_YZ"
|
||||||
}
|
},
|
||||||
|
"source": [
|
||||||
|
"## Gradio UI"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"colab": {
|
||||||
|
"base_uri": "https://localhost:8080/"
|
||||||
|
},
|
||||||
|
"id": "TJYUWecybDpP",
|
||||||
|
"outputId": "e82d0a13-3ca3-4a01-d45c-78fc94ade9bc"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"import gradio as gr\n",
|
"import gradio as gr\n",
|
||||||
"from pydantic import BaseModel, Field\n",
|
"from pydantic import BaseModel, Field\n",
|
||||||
@@ -718,34 +708,27 @@
|
|||||||
" )\n",
|
" )\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"ui.launch(debug=True)"
|
"ui.launch(debug=True)\n"
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"colab": {
|
|
||||||
"base_uri": "https://localhost:8080/"
|
|
||||||
},
|
|
||||||
"id": "TJYUWecybDpP",
|
|
||||||
"outputId": "e82d0a13-3ca3-4a01-d45c-78fc94ade9bc"
|
|
||||||
},
|
|
||||||
"execution_count": 10,
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"output_type": "stream",
|
|
||||||
"name": "stdout",
|
|
||||||
"text": [
|
|
||||||
"Keyboard interruption in main thread... closing server.\n",
|
|
||||||
"Killing tunnel 127.0.0.1:7860 <> https://5954eb89d994d7a5ee.gradio.live\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"output_type": "execute_result",
|
|
||||||
"data": {
|
|
||||||
"text/plain": []
|
|
||||||
},
|
|
||||||
"metadata": {},
|
|
||||||
"execution_count": 10
|
|
||||||
}
|
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
]
|
],
|
||||||
}
|
"metadata": {
|
||||||
|
"accelerator": "GPU",
|
||||||
|
"colab": {
|
||||||
|
"collapsed_sections": [
|
||||||
|
"tqSpfJGnme7y"
|
||||||
|
],
|
||||||
|
"gpuType": "T4",
|
||||||
|
"provenance": []
|
||||||
|
},
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"name": "python"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 0
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user