minor update on local machine
This commit is contained in:
@@ -1,49 +1,30 @@
|
||||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": [],
|
||||
"collapsed_sections": [
|
||||
"tqSpfJGnme7y"
|
||||
],
|
||||
"gpuType": "T4"
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
},
|
||||
"accelerator": "GPU"
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "KbMea_UrO3Ke"
|
||||
},
|
||||
"source": [
|
||||
"# ✨ Coherent Data Generator\n",
|
||||
"\n",
|
||||
"## In real life, data has meaning, relationships, etc. and this is where this tool shines.\n",
|
||||
"## In real life, data has meaning, relationships, etc., and this is where this tool shines.\n",
|
||||
"\n",
|
||||
"Dependencies between fields are detected and a coherent data is generated.\n",
|
||||
"Dependencies between fields are detected, and coherent data is generated.\n",
|
||||
"Example:\n",
|
||||
"When asked to generate data with **Ghana** cited as the context, fields like `name`, `food`, etc. will be Ghanaian. Fields such as phone number will have the appropriate prefix of `+233`, etc.\n",
|
||||
"When asked to generate data with **Ghana** cited as the context, fields like `name`, `food`, etc., will be Ghanaian. Fields such as phone number will have the appropriate prefix of `+233`, etc.\n",
|
||||
"\n",
|
||||
"This is better than Faker.\n",
|
||||
"\n",
|
||||
"## Steps\n",
|
||||
"Schema -> Generate Data\n",
|
||||
"\n",
|
||||
"Schema Sources:\n",
|
||||
"Schema Sources: \n",
|
||||
"- Use the guided schema builder\n",
|
||||
"- Bring your own schema from an SQL Data Definition Language (DDL)\n",
|
||||
"- Prompting\n",
|
||||
"- Providing a domain to an old-hat to definition features for a dataset"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "KbMea_UrO3Ke"
|
||||
}
|
||||
"- Providing a domain to an old hat to define features for a dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
@@ -65,6 +46,11 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "DOBBN3P2GD2O"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model_id = \"Qwen/Qwen3-4B-Instruct-2507\"\n",
|
||||
"\n",
|
||||
@@ -78,24 +64,24 @@
|
||||
" dtype=\"auto\",\n",
|
||||
" device_map=\"auto\"\n",
|
||||
")"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "DOBBN3P2GD2O"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"## Schema Definitions"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "HSUebXa1O3MM"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"## Schema Definitions"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "5LNM76OQjAw6"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# This is for future use where errors in SQL DDL statements can be fixed if the\n",
|
||||
"# specifies that from the UI\n",
|
||||
@@ -115,33 +101,33 @@
|
||||
"class Schema(BaseModel):\n",
|
||||
" name: str = Field(..., description='Name of the schema')\n",
|
||||
" fields: list[FieldDescriptor] = Field(..., description='List of fields in the schema')"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "5LNM76OQjAw6"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"## LLM Interactions"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "6QjitfTBPa1E"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"## LLM Interactions"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"### Generate Content from LLM"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "dXiRHok7Peir"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"### Generate Content from LLM"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "daTUVG8_PmvM"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def generate(messages: list[dict[str, str]], temperature: float = 0.1) -> any:\n",
|
||||
" text = tokenizer.apply_chat_template(\n",
|
||||
@@ -161,24 +147,24 @@
|
||||
" content = tokenizer.decode(output_ids, skip_special_tokens=True)\n",
|
||||
"\n",
|
||||
" return content"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "daTUVG8_PmvM"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"### Generate Data Given A Valid Schema"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "sBHJKn8qQhM5"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"### Generate Data Given A Valid Schema"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "Fla8UQf4Qm5l"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def generate_data(schema: str, context: str = '', num_records: int = 5):\n",
|
||||
" system_prompt = f'''\n",
|
||||
@@ -211,24 +197,24 @@
|
||||
" ]\n",
|
||||
"\n",
|
||||
" return generate(messages)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "Fla8UQf4Qm5l"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"### SQL"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "izrClU6VPsZp"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"### SQL"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "aQgY6EK0QPPd"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def sql_validator(ddl: str):\n",
|
||||
" system_prompt = '''\n",
|
||||
@@ -267,26 +253,26 @@
|
||||
" ]\n",
|
||||
"\n",
|
||||
" return generate(messages)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "aQgY6EK0QPPd"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "4mgwDQyDQ1wv"
|
||||
},
|
||||
"source": [
|
||||
"### Data Scientist\n",
|
||||
"\n",
|
||||
"Just give it a domain and you will be amazed the features will give you."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "4mgwDQyDQ1wv"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "P36AMvBq8AST"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def create_domain_schema(domain: str):\n",
|
||||
" system_prompt = f'''\n",
|
||||
@@ -326,35 +312,35 @@
|
||||
" created_at TIMESTAMP DEFAULT NOW()\n",
|
||||
");\n",
|
||||
"'''"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "P36AMvBq8AST"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"print(f'{model.get_memory_footprint() / 1e9:, .2f} GB')"
|
||||
],
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "QuVyHOhjDtSH"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(f'{model.get_memory_footprint() / 1e9:, .2f} GB')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"## Export Functions"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "tqSpfJGnme7y"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"## Export Functions"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "pAu5OPfUmMSm"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from enum import StrEnum\n",
|
||||
"\n",
|
||||
@@ -451,24 +437,28 @@
|
||||
" tmp.write(content)\n",
|
||||
" tmp.flush()\n",
|
||||
" return tmp.name"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "pAu5OPfUmMSm"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"## Gradio UI"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "Q0fZsCuso_YZ"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"## Gradio UI"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "TJYUWecybDpP",
|
||||
"outputId": "e82d0a13-3ca3-4a01-d45c-78fc94ade9bc"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import gradio as gr\n",
|
||||
"from pydantic import BaseModel, Field\n",
|
||||
@@ -718,34 +708,27 @@
|
||||
" )\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"ui.launch(debug=True)"
|
||||
],
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "TJYUWecybDpP",
|
||||
"outputId": "e82d0a13-3ca3-4a01-d45c-78fc94ade9bc"
|
||||
},
|
||||
"execution_count": 10,
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": [
|
||||
"Keyboard interruption in main thread... closing server.\n",
|
||||
"Killing tunnel 127.0.0.1:7860 <> https://5954eb89d994d7a5ee.gradio.live\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"output_type": "execute_result",
|
||||
"data": {
|
||||
"text/plain": []
|
||||
},
|
||||
"metadata": {},
|
||||
"execution_count": 10
|
||||
}
|
||||
"ui.launch(debug=True)\n"
|
||||
]
|
||||
}
|
||||
]
|
||||
],
|
||||
"metadata": {
|
||||
"accelerator": "GPU",
|
||||
"colab": {
|
||||
"collapsed_sections": [
|
||||
"tqSpfJGnme7y"
|
||||
],
|
||||
"gpuType": "T4",
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0
|
||||
}
|
||||
Reference in New Issue
Block a user