diff --git a/community-contributions/SyntheticDataGenerator_PT.ipynb b/community-contributions/SyntheticDataGenerator_PT.ipynb new file mode 100644 index 0000000..18cf4c6 --- /dev/null +++ b/community-contributions/SyntheticDataGenerator_PT.ipynb @@ -0,0 +1,141 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "d08b387c-53fb-46d2-b083-5eebc3c97e1b", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124\n", + "!pip install -q requests bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0 openai" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f1851b2-890c-427b-8e70-b998efa04c67", + "metadata": {}, + "outputs": [], + "source": [ + "# imports\n", + "\n", + "import os\n", + "import requests\n", + "from IPython.display import Markdown, display, update_display\n", + "from openai import OpenAI\n", + "from google.colab import drive\n", + "from huggingface_hub import login\n", + "from google.colab import userdata\n", + "from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n", + "import torch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2d334b5-453e-4213-8e1c-905d504d2dc1", + "metadata": {}, + "outputs": [], + "source": [ + "LLAMA = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1b3684c-c170-45f2-a83d-7e6e2ca1e23b", + "metadata": {}, + "outputs": [], + "source": [ + "hf_token = userdata.get('HF_TOKEN')\n", + "login(hf_token, add_to_git_credential=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8c1b6dae-3213-4d68-8fa1-d195704790dc", + "metadata": {}, + "outputs": [], + "source": [ + "openai_api_key = userdata.get('OPENAI_API_KEY')\n", + "openai = OpenAI(api_key=openai_api_key)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "988974c7-814c-478a-be7b-0928b0efdbab", + "metadata": {}, + "outputs": [], + "source": [ + "system_message = \"You are an assistant that produces synthetic test data. The fields, data type of the field like numeric, date, alphanumeric etc., will be provided. Generate data considering all cases, if it is a workflow audit data then consider all touchpoint movements. Do not provide a python script to generate the data. Provide the data as a json with arrays.\"\n", + "user_prompt = \"\"\"Create a synthetic dataset for testing. \n", + "Column names and type - \n", + "ID: 10 digit number\n", + "TRACKING_ID: 13 character alphanumeric\n", + "CASE REPORT DATE : DD-MMM-YYYY HH:MM:SS\n", + "NOTIFICATION DATE : DD-MMM-YYYY HH:MM:SS\n", + "IN SCOPE : (Yes/No)\n", + "\"\"\"\n", + "\n", + "messages = [\n", + " {\"role\": \"system\", \"content\": system_message},\n", + " {\"role\": \"user\", \"content\": user_prompt}\n", + " ]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40cebc04-abf0-4c61-8b18-f98d3c1fe680", + "metadata": {}, + "outputs": [], + "source": [ + "quant_config = BitsAndBytesConfig(\n", + " load_in_4bit=True,\n", + " bnb_4bit_use_double_quant=True,\n", + " bnb_4bit_compute_dtype=torch.bfloat16,\n", + " bnb_4bit_quant_type=\"nf4\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "710ba1af-8e12-4635-933b-00df8d2e3f9d", + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer = AutoTokenizer.from_pretrained(LLAMA)\n", + "tokenizer.pad_token = tokenizer.eos_token\n", + "inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n", + "streamer = TextStreamer(tokenizer)\n", + "model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map=\"auto\", quantization_config=quant_config)\n", + "outputs = model.generate(inputs, max_new_tokens=2000, streamer=streamer)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}