{
"cells": [
{
"cell_type": "markdown",
"id": "18b82c6b-10dc-4d94-b8dc-592ff011ce2b",
"metadata": {},
"source": [
"# Meeting minutes creator\n",
|
|
"\n",
|
|
"https://colab.research.google.com/drive/13wR4Blz3Ot_x0GOpflmvvFffm5XU3Kct?usp=sharing\n",
|
|
"\n",
|
|
"## **Week 3 task.**\n",
|
|
"Create your own tool that generates synthetic data/test data. Input the type of dataset or products or job postings, etc. and let the tool dream up various data samples.\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
"cell_type": "code",
"execution_count": null,
"id": "e9289ba7-200c-43a9-b67a-c5ce826c9537",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
|
|
"import gradio as gr, requests, json, time, os, torch\n",
|
|
"from transformers import pipeline, set_seed\n",
|
|
"from functools import partial\n",
|
|
"from openai import OpenAI, APIError, AuthenticationError\n",
|
|
"from google.colab import drive, userdata\n",
|
|
"from huggingface_hub import login\n",
|
|
"from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig\n",
|
|
"\n",
|
|
"# Sample user_prompt = \"a list of student profiles with full name, email, course studied, and GPA for each of 6 semesters, and a CGPA for the 6 semesters\"\n",
|
|
"\n",
|
|
"# Sign in to HuggingFace Hub\n",
|
|
"hf_token = userdata.get('HF_TOKEN')\n",
|
|
"login(hf_token, add_to_git_credential=True)\n",
|
|
"\n",
|
|
"# Sign in to OpenAI using Secrets in Colab\n",
|
|
"openai_api_key = userdata.get('OPENAI_API_KEY')\n",
|
|
"\n",
|
|
"# Initialize client\n",
|
|
"try:\n",
|
|
" openai = OpenAI(api_key=openai_api_key)\n",
|
|
"except Exception as e:\n",
|
|
" openai = None\n",
|
|
" print(f\"OpenAI client not initialized: {e}\")\n",
|
|
"\n",
|
|
"# Constants\n",
|
|
"GPT_MODEL = \"gpt-3.5-turbo\"\n",
|
|
"\n",
|
|
"# Local Llama Model Setup\n",
|
|
"# Loads a Llama model from Hugging Face for local inference.\n",
|
|
"# Note: This requires a powerful GPU and specific library installations (e.g., bitsandbytes, accelerate).\n",
|
|
"LLAMA_MODEL = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n",
|
|
"\n",
|
|
"try:\n",
|
|
" # Set up quantization config for efficient memory usage.\n",
|
|
" # This loads the model in 4-bit precision, significantly reducing VRAM requirements.\n",
|
|
" quant_config = BitsAndBytesConfig(\n",
|
|
" load_in_4bit=True,\n",
|
|
" bnb_4bit_use_double_quant=True,\n",
|
|
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
|
|
" bnb_4bit_quant_type=\"nf4\"\n",
|
|
" )\n",
|
|
"\n",
|
|
" # Load the tokenizer and model.\n",
|
|
" tokenizer = AutoTokenizer.from_pretrained(LLAMA_MODEL)\n",
|
|
" model = AutoModelForCausalLM.from_pretrained(\n",
|
|
" LLAMA_MODEL, \n",
|
|
" device_map=\"auto\", \n",
|
|
" quantization_config=quant_config,\n",
|
|
" trust_remote_code=True\n",
|
|
" )\n",
|
|
" \n",
|
|
" # Set the model to evaluation mode for inference.\n",
|
|
" model.eval()\n",
|
|
"\n",
|
|
"except Exception as e:\n",
|
|
" model = None\n",
|
|
" tokenizer = None\n",
|
|
" print(f\"Failed to load local Llama model: {e}\")\n",
|
|
"\n",
|
|
"\n",
|
|
"def generate_with_llama(user_prompt: str, num_samples: int = 5):\n",
|
|
" \"\"\"\n",
|
|
" Generates synthetic data using a local Llama model.\n",
|
|
" Return a JSON string.\n",
|
|
" \"\"\"\n",
|
|
" if not model or not tokenizer:\n",
|
|
" return json.dumps({\"error\": \"Llama model not loaded. Check model paths and hardware compatibility.\"}, indent=2)\n",
|
|
"\n",
|
|
" # Llama 3.1 uses a specific chat template for conversation formatting.\n",
|
|
" messages = [\n",
|
|
" {\"role\": \"system\", \"content\": f\"You are a data generation assistant. Generate a JSON array of exactly {num_samples} objects based on the user's request. The output must be valid JSON only, without any other text or formatting.\"},\n",
|
|
" {\"role\": \"user\", \"content\": user_prompt}\n",
|
|
" ]\n",
|
|
"\n",
|
|
" try:\n",
|
|
" inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
|
|
"\n",
|
|
" outputs = model.generate(inputs, max_new_tokens=2000, do_sample=True, top_p=0.9, temperature=0.7)\n",
|
|
"\n",
|
|
" # Decode the generated tokens.\n",
|
|
" response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
|
|
"\n",
|
|
" # Extract only the assistant's part from the complete chat history.\n",
|
|
" assistant_start = \"<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n\"\n",
|
|
" if assistant_start in response_text:\n",
|
|
" response_text = response_text.split(assistant_start)[-1]\n",
|
|
" \n",
|
|
" # Parse the JSON and return it.\n",
|
|
" parsed_json = json.loads(response_text)\n",
|
|
" return json.dumps(parsed_json, indent=2)\n",
|
|
"\n",
|
|
" except Exception as e:\n",
|
|
" return json.dumps({\"error\": f\"An error occurred during local model generation: {e}\"}, indent=2)\n",
|
|
"\n",
|
|
"\n",
|
|
"\n",
|
|
"def generate_with_gpt(user_prompt: str, num_samples: int = 5):\n",
|
|
" \"\"\"\n",
|
|
" Generates synthetic data using OpenAI's GPT.\n",
|
|
" Return a JSON string.\n",
|
|
" \"\"\"\n",
|
|
" if not openai:\n",
|
|
" return json.dumps({\"error\": \"OpenAI client not initialized. Please check your API key.\"}, indent=2)\n",
|
|
"\n",
|
|
" try:\n",
|
|
" response = openai.chat.completions.create(\n",
|
|
" model=GPT_MODEL,\n",
|
|
" messages=[\n",
|
|
" {\"role\": \"system\", \"content\": f\"You are a data generation assistant. Generate a JSON array of exactly {num_samples} objects based on the user's request. The output must be valid JSON only, without any other text or formatting.\"},\n",
|
|
" {\"role\": \"user\", \"content\": user_prompt}\n",
|
|
" ],\n",
|
|
" response_format={\"type\": \"json_object\"}\n",
|
|
" )\n",
|
|
" \n",
|
|
" json_text = response.choices[0].message.content\n",
|
|
" return json_text\n",
|
|
" except APIError as e:\n",
|
|
" return json.dumps({\"error\": f\"Error from OpenAI API: {e.body}\"}, indent=2)\n",
|
|
" except Exception as e:\n",
|
|
" return json.dumps({\"error\": f\"An unexpected error occurred: {e}\"}, indent=2)\n",
|
|
"\n",
|
|
"\n",
|
|
"def generate_data(user_prompt, model_choice):\n",
|
|
" \"\"\"\n",
|
|
" Wrapper function that calls the appropriate generation function based on model choice.\n",
|
|
" \"\"\"\n",
|
|
" if not user_prompt:\n",
|
|
" return json.dumps({\"error\": \"Please provide a description for the data.\"}, indent=2)\n",
|
|
"\n",
|
|
" if model_choice == f\"Hugging Face ({LLAMA_MODEL})\":\n",
|
|
" return generate_with_llama(user_prompt)\n",
|
|
" elif model_choice == f\"OpenAI ({GPT_MODEL})\":\n",
|
|
" return generate_with_gpt(user_prompt)\n",
|
|
" else:\n",
|
|
" return json.dumps({\"error\": \"Invalid model choice.\"}, indent=2)\n",
|
|
"\n",
|
|
"# Gradio UI\n",
|
|
"with gr.Blocks(theme=gr.themes.Soft(), title=\"Synthetic Data Generator\") as ui:\n",
|
|
" gr.Markdown(\"# Synthetic Data Generator\")\n",
|
|
" gr.Markdown(\"Describe the type of data you need, select a model, and click 'Generate'.\")\n",
|
|
"\n",
|
|
" with gr.Row():\n",
|
|
" with gr.Column(scale=3):\n",
|
|
" data_prompt = gr.Textbox(\n",
|
|
" lines=5,\n",
|
|
" label=\"Data Prompt\",\n",
|
|
" placeholder=\"e.g., a list of customer profiles with name, email, and a favorite product\"\n",
|
|
" )\n",
|
|
" \n",
|
|
" with gr.Column(scale=1):\n",
|
|
" model_choice = gr.Radio(\n",
|
|
" [f\"Hugging Face ({LLAMA_MODEL})\", f\"OpenAI ({GPT_MODEL})\"],\n",
|
|
" label=\"Choose a Model\",\n",
|
|
" value=f\"Hugging Face ({LLAMA_MODEL})\"\n",
|
|
" )\n",
|
|
" \n",
|
|
" generate_btn = gr.Button(\"Generate Data\")\n",
|
|
" \n",
|
|
" with gr.Row():\n",
|
|
" output_json = gr.JSON(label=\"Generated Data\")\n",
|
|
" \n",
|
|
" # Click trigger\n",
|
|
" generate_btn.click(\n",
|
|
" fn=generate_data,\n",
|
|
" inputs=[data_prompt, model_choice],\n",
|
|
" outputs=output_json\n",
|
|
" )\n",
|
|
"\n",
|
|
"ui.launch(inbrowser=True, debug=True)\n"
|
|
]
|
|
},
|
|
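{
"cell_type": "markdown",
"id": "usage-note-md",
"metadata": {},
"source": [
"### Quick headless test (optional)\n",
"A minimal sketch for exercising the generator without the Gradio UI, assuming the setup cell above has run and an OpenAI key is configured. The `sample_prompt` below is only an illustrative placeholder; swap in the Hugging Face option if the local Llama model loaded successfully.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "usage-demo-code",
"metadata": {},
"outputs": [],
"source": [
"# Minimal sanity check (assumes the setup cell above has been run).\n",
"# Calls generate_data directly instead of going through the Gradio UI.\n",
"sample_prompt = \"a list of 5 fictional products with name, price in USD, and a one-sentence description\"  # illustrative prompt\n",
"\n",
"result = generate_data(sample_prompt, f\"OpenAI ({GPT_MODEL})\")\n",
"print(result)\n",
"\n",
"# To test the local model instead (requires the Llama model to have loaded):\n",
"# print(generate_data(sample_prompt, f\"Hugging Face ({LLAMA_MODEL})\"))\n"
]
},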
{
"cell_type": "code",
"execution_count": null,
"id": "cd2020d3",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.11.7"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|