From 5aebd4f0de398315459b64f4cd1b7258b8b3bb4b Mon Sep 17 00:00:00 2001
From: prateektambe <prateektambe@gmail.com>
Date: Thu, 28 Aug 2025 21:12:18 +0530
Subject: [PATCH] Week3Day5 - Dataset generator

---
 .../SyntheticDataGenerator_PT.ipynb           | 141 ++++++++++++++++++
 1 file changed, 141 insertions(+)
 create mode 100644 community-contributions/SyntheticDataGenerator_PT.ipynb

diff --git a/community-contributions/SyntheticDataGenerator_PT.ipynb b/community-contributions/SyntheticDataGenerator_PT.ipynb
new file mode 100644
index 0000000..18cf4c6
--- /dev/null
+++ b/community-contributions/SyntheticDataGenerator_PT.ipynb
@@ -0,0 +1,141 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d08b387c-53fb-46d2-b083-5eebc3c97e1b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124\n",
+    "!pip install -q requests bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0 openai"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4f1851b2-890c-427b-8e70-b998efa04c67",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# imports\n",
+    "\n",
+    "import os\n",
+    "import requests\n",
+    "from IPython.display import Markdown, display, update_display\n",
+    "from openai import OpenAI\n",
+    "from google.colab import drive\n",
+    "from huggingface_hub import login\n",
+    "from google.colab import userdata\n",
+    "from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
+    "import torch"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c2d334b5-453e-4213-8e1c-905d504d2dc1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "LLAMA = \"meta-llama/Meta-Llama-3.1-8B-Instruct\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c1b3684c-c170-45f2-a83d-7e6e2ca1e23b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hf_token = userdata.get(\"HF_TOKEN\")  # token comes from google.colab userdata\n",
+    "login(token=hf_token, add_to_git_credential=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8c1b6dae-3213-4d68-8fa1-d195704790dc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "openai_api_key = userdata.get(\"OPENAI_API_KEY\")  # key comes from google.colab userdata\n",
+    "openai = OpenAI(api_key=openai_api_key)  # client object bound to the name `openai`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "988974c7-814c-478a-be7b-0928b0efdbab",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "system_message = \"You are an assistant that produces synthetic test data. The fields, data type of the field like numeric, date, alphanumeric etc., will be provided. Generate data considering all cases, if it is a workflow audit data then consider all touchpoint movements. Do not provide a python script to generate the data. Provide the data as a json with arrays.\"\n",
+    "user_prompt = \"\"\"Create a synthetic dataset for testing. \n",
+    "Column names and type - \n",
+    "ID: 10 digit number\n",
+    "TRACKING_ID: 13 character alphanumeric\n",
+    "CASE REPORT DATE : DD-MMM-YYYY HH:MM:SS\n",
+    "NOTIFICATION DATE : DD-MMM-YYYY HH:MM:SS\n",
+    "IN SCOPE : (Yes/No)\n",
+    "\"\"\"\n",
+    "\n",
+    "messages = [  # chat history: system instructions first, then the user request\n",
+    "    dict(role=\"system\", content=system_message),\n",
+    "    dict(role=\"user\", content=user_prompt),\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "40cebc04-abf0-4c61-8b18-f98d3c1fe680",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "quant_config = BitsAndBytesConfig(  # passed to from_pretrained as quantization_config\n",
+    "    load_in_4bit=True,\n",
+    "    bnb_4bit_quant_type=\"nf4\",\n",
+    "    bnb_4bit_compute_dtype=torch.bfloat16,\n",
+    "    bnb_4bit_use_double_quant=True,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "710ba1af-8e12-4635-933b-00df8d2e3f9d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tokenizer = AutoTokenizer.from_pretrained(LLAMA)\n",
+    "tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as the padding token\n",
+    "inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\", add_generation_prompt=True).to(\"cuda\")  # add_generation_prompt appends the assistant header so the model answers instead of continuing the user turn\n",
+    "streamer = TextStreamer(tokenizer)  # prints tokens as they are generated\n",
+    "model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map=\"auto\", quantization_config=quant_config)\n",
+    "outputs = model.generate(inputs, max_new_tokens=2000, streamer=streamer)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}