From 5aebd4f0de398315459b64f4cd1b7258b8b3bb4b Mon Sep 17 00:00:00 2001
From: prateektambe
Date: Thu, 28 Aug 2025 21:12:18 +0530
Subject: [PATCH] Week3Day5 - Dataset generator

---
 .../SyntheticDataGenerator_PT.ipynb           | 149 ++++++++++++++++++
 1 file changed, 149 insertions(+)
 create mode 100644 community-contributions/SyntheticDataGenerator_PT.ipynb

diff --git a/community-contributions/SyntheticDataGenerator_PT.ipynb b/community-contributions/SyntheticDataGenerator_PT.ipynb
new file mode 100644
index 0000000..18cf4c6
--- /dev/null
+++ b/community-contributions/SyntheticDataGenerator_PT.ipynb
@@ -0,0 +1,149 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d08b387c-53fb-46d2-b083-5eebc3c97e1b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Install the pinned cu124 PyTorch wheels, then the Hugging Face / OpenAI client libraries\n",
+    "!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124\n",
+    "!pip install -q requests bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0 openai"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4f1851b2-890c-427b-8e70-b998efa04c67",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# imports\n",
+    "\n",
+    "import os\n",
+    "import requests\n",
+    "from IPython.display import Markdown, display, update_display\n",
+    "from openai import OpenAI\n",
+    "from google.colab import drive\n",
+    "from huggingface_hub import login\n",
+    "from google.colab import userdata\n",
+    "from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
+    "import torch"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c2d334b5-453e-4213-8e1c-905d504d2dc1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hugging Face model id used for both the tokenizer and the model below\n",
+    "LLAMA = \"meta-llama/Meta-Llama-3.1-8B-Instruct\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c1b3684c-c170-45f2-a83d-7e6e2ca1e23b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the Hugging Face token from Colab userdata and log in to the Hub\n",
+    "hf_token = userdata.get('HF_TOKEN')\n",
+    "login(hf_token, add_to_git_credential=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8c1b6dae-3213-4d68-8fa1-d195704790dc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Build an OpenAI client from the key stored in Colab userdata\n",
+    "openai_api_key = userdata.get('OPENAI_API_KEY')\n",
+    "openai = OpenAI(api_key=openai_api_key)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "988974c7-814c-478a-be7b-0928b0efdbab",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# The system prompt fixes the output rules; the user prompt lists the columns and their formats\n",
+    "system_message = \"You are an assistant that produces synthetic test data. The fields, data type of the field like numeric, date, alphanumeric etc., will be provided. Generate data considering all cases, if it is a workflow audit data then consider all touchpoint movements. Do not provide a python script to generate the data. Provide the data as a json with arrays.\"\n",
+    "user_prompt = \"\"\"Create a synthetic dataset for testing. \n",
+    "Column names and type - \n",
+    "ID: 10 digit number\n",
+    "TRACKING_ID: 13 character alphanumeric\n",
+    "CASE REPORT DATE : DD-MMM-YYYY HH:MM:SS\n",
+    "NOTIFICATION DATE : DD-MMM-YYYY HH:MM:SS\n",
+    "IN SCOPE : (Yes/No)\n",
+    "\"\"\"\n",
+    "\n",
+    "messages = [\n",
+    "    {\"role\": \"system\", \"content\": system_message},\n",
+    "    {\"role\": \"user\", \"content\": user_prompt}\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "40cebc04-abf0-4c61-8b18-f98d3c1fe680",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load weights in 4-bit NF4 with double quantization; run compute in bfloat16\n",
+    "quant_config = BitsAndBytesConfig(\n",
+    "    load_in_4bit=True,\n",
+    "    bnb_4bit_use_double_quant=True,\n",
+    "    bnb_4bit_compute_dtype=torch.bfloat16,\n",
+    "    bnb_4bit_quant_type=\"nf4\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "710ba1af-8e12-4635-933b-00df8d2e3f9d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# add_generation_prompt=True appends the assistant header so the model starts a reply\n",
+    "# rather than continuing the user turn; generated tokens are streamed as they are produced\n",
+    "tokenizer = AutoTokenizer.from_pretrained(LLAMA)\n",
+    "tokenizer.pad_token = tokenizer.eos_token\n",
+    "inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\", add_generation_prompt=True).to(\"cuda\")\n",
+    "streamer = TextStreamer(tokenizer)\n",
+    "model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map=\"auto\", quantization_config=quant_config)\n",
+    "outputs = model.generate(inputs, max_new_tokens=2000, streamer=streamer)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}