Merge pull request #47 from innovoe/patch-1
Dataset Generator with Gradio
This commit is contained in:
267
week3/community-contributions/dataset_generator.ipynb
Normal file
267
week3/community-contributions/dataset_generator.ipynb
Normal file
@@ -0,0 +1,267 @@
|
||||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": [],
|
||||
"gpuType": "T4"
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
},
|
||||
"accelerator": "GPU"
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate gradio"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "kU2JrcPlhwd9"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"**Imports**"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "lAMIVT4iwNg0"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import requests\n",
|
||||
"from google.colab import drive\n",
|
||||
"from huggingface_hub import login\n",
|
||||
"from google.colab import userdata\n",
|
||||
"from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
|
||||
"import torch\n",
|
||||
"import gradio as gr\n",
|
||||
"\n",
|
||||
"hf_token = userdata.get('HF_TOKEN')\n",
|
||||
"login(hf_token, add_to_git_credential=True)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "-Apd7-p-hyLk"
|
||||
},
|
||||
"execution_count": 2,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"**Model**"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "xa0qYqZrwQ66"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"model_name = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n",
|
||||
"quant_config = BitsAndBytesConfig(\n",
|
||||
" load_in_4bit=True,\n",
|
||||
" bnb_4bit_use_double_quant=True,\n",
|
||||
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
|
||||
" bnb_4bit_quant_type=\"nf4\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"model = AutoModelForCausalLM.from_pretrained(\n",
|
||||
" model_name,\n",
|
||||
" device_map=\"auto\",\n",
|
||||
" quantization_config=quant_config\n",
|
||||
")"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "z5enGmuKjtJu"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"**Tokenizer**"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "y1hUSmWlwSbp"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
|
||||
"tokenizer.pad_token = tokenizer.eos_token"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "WjxNWW6bvdgj"
|
||||
},
|
||||
"execution_count": 4,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"**Functions**"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "1pg2U-B3wbIK"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def generate_dataset(topic, number_of_data, inst1, resp1, inst2, resp2, inst3, resp3):\n",
|
||||
" # Convert user inputs into multi-shot examples\n",
|
||||
" multi_shot_examples = [\n",
|
||||
" {\"instruction\": inst1, \"response\": resp1},\n",
|
||||
" {\"instruction\": inst2, \"response\": resp2},\n",
|
||||
" {\"instruction\": inst3, \"response\": resp3}\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
" # System prompt\n",
|
||||
" system_prompt = f\"\"\"\n",
|
||||
" You are a helpful assistant whose main purpose is to generate datasets.\n",
|
||||
" Topic: {topic}\n",
|
||||
" Return the dataset in JSON format. Use examples with simple, fun, and easy-to-understand instructions for kids.\n",
|
||||
" Include the following examples: {multi_shot_examples}\n",
|
||||
" Return {number_of_data} examples each time.\n",
|
||||
" Do not repeat the provided examples.\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" # Example Messages\n",
|
||||
" messages = [\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": f\"Please generate my dataset for {topic}\"}\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
" # Tokenize Input\n",
|
||||
" inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
|
||||
" streamer = TextStreamer(tokenizer)\n",
|
||||
"\n",
|
||||
" # Generate Output\n",
|
||||
" outputs = model.generate(inputs, max_new_tokens=2000, streamer=streamer)\n",
|
||||
"\n",
|
||||
" # Decode and Return\n",
|
||||
" return tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def gradio_interface(topic, number_of_data, inst1, resp1, inst2, resp2, inst3, resp3):\n",
|
||||
" return generate_dataset(topic, number_of_data, inst1, resp1, inst2, resp2, inst3, resp3)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "ZvljDKdji8iV"
|
||||
},
|
||||
"execution_count": 12,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"**Default Values**"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "_WDZ5dvRwmng"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"default_topic = \"Talking to a (5-8) years old and teaching them manners.\"\n",
|
||||
"default_number_of_data = 10\n",
|
||||
"default_multi_shot_examples = [\n",
|
||||
" {\n",
|
||||
" \"instruction\": \"Why do I have to say please when I want something?\",\n",
|
||||
" \"response\": \"Because it’s like magic! It shows you’re nice, and people want to help you more.\"\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"instruction\": \"What should I say if someone gives me a toy?\",\n",
|
||||
" \"response\": \"You say, 'Thank you!' because it makes them happy you liked it.\"\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"instruction\": \"why should I listen to my parents?\",\n",
|
||||
" \"response\": \"Because parents want the best for you and they love you the most.\"\n",
|
||||
" }\n",
|
||||
"]"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "JAdfqYXnvEDE"
|
||||
},
|
||||
"execution_count": 13,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"**Init gradio**"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "JwZtD032wuK8"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"gr_interface = gr.Interface(\n",
|
||||
" fn=gradio_interface,\n",
|
||||
" inputs=[\n",
|
||||
" gr.Textbox(label=\"Topic\", value=default_topic, lines=2),\n",
|
||||
" gr.Number(label=\"Number of Examples\", value=default_number_of_data, precision=0),\n",
|
||||
" gr.Textbox(label=\"Instruction 1\", value=default_multi_shot_examples[0][\"instruction\"]),\n",
|
||||
" gr.Textbox(label=\"Response 1\", value=default_multi_shot_examples[0][\"response\"]),\n",
|
||||
" gr.Textbox(label=\"Instruction 2\", value=default_multi_shot_examples[1][\"instruction\"]),\n",
|
||||
" gr.Textbox(label=\"Response 2\", value=default_multi_shot_examples[1][\"response\"]),\n",
|
||||
" gr.Textbox(label=\"Instruction 3\", value=default_multi_shot_examples[2][\"instruction\"]),\n",
|
||||
" gr.Textbox(label=\"Response 3\", value=default_multi_shot_examples[2][\"response\"]),\n",
|
||||
" ],\n",
|
||||
" outputs=gr.Textbox(label=\"Generated Dataset\")\n",
|
||||
")"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "xy2RP5T-vxXg"
|
||||
},
|
||||
"execution_count": 14,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"**Run the app**"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "HZx-mm9Uw3Ph"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"gr_interface.launch()"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "bfGs5ip8mndg"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [],
|
||||
"metadata": {
|
||||
"id": "Cveqx392x7Mm"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
}
|
||||
]
|
||||
}
|
||||
Reference in New Issue
Block a user