Merge pull request #47 from innovoe/patch-1

Dataset Generator with Gradio
2024-12-21 22:17:26 +00:00
parent bed454f02d 09b19637a1
commit 1d953ca630
1 changed files with 267 additions and 0 deletions
--- a/week3/community-contributions/dataset_generator.ipynb
+++ b/week3/community-contributions/dataset_generator.ipynb
@@ -0,0 +1,267 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "gpuType": "T4"
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    },
+    "accelerator": "GPU"
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "source": [
+        "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
+        "# huggingface_hub is listed explicitly because the notebook imports it directly.\n",
+        "%pip install -q requests torch bitsandbytes transformers sentencepiece accelerate gradio huggingface_hub"
+      ],
+      "metadata": {
+        "id": "kU2JrcPlhwd9"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "**Imports**"
+      ],
+      "metadata": {
+        "id": "lAMIVT4iwNg0"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import os\n",
+        "import requests\n",
+        "from google.colab import drive\n",
+        "from huggingface_hub import login\n",
+        "from google.colab import userdata\n",
+        "from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
+        "import torch\n",
+        "import gradio as gr\n",
+        "\n",
+        "# Read the Hugging Face token from the Colab secrets store and log in with it\n",
+        "hf_token = userdata.get('HF_TOKEN')\n",
+        "login(token=hf_token, add_to_git_credential=True)"
+      ],
+      "metadata": {
+        "id": "-Apd7-p-hyLk"
+      },
+      "execution_count": 2,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "**Model**"
+      ],
+      "metadata": {
+        "id": "xa0qYqZrwQ66"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Checkpoint to load; the tokenizer cell reads model_name as well\n",
+        "model_name = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n",
+        "\n",
+        "# 4-bit NF4 quantization with double quantization, computing in bfloat16\n",
+        "quant_config = BitsAndBytesConfig(\n",
+        "    load_in_4bit=True,\n",
+        "    bnb_4bit_quant_type=\"nf4\",\n",
+        "    bnb_4bit_use_double_quant=True,\n",
+        "    bnb_4bit_compute_dtype=torch.bfloat16,\n",
+        ")\n",
+        "\n",
+        "# Load the causal LM with the quantization settings defined above\n",
+        "model = AutoModelForCausalLM.from_pretrained(\n",
+        "    model_name,\n",
+        "    quantization_config=quant_config,\n",
+        "    device_map=\"auto\",\n",
+        ")"
+      ],
+      "metadata": {
+        "id": "z5enGmuKjtJu"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "**Tokenizer**"
+      ],
+      "metadata": {
+        "id": "y1hUSmWlwSbp"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
+        "tokenizer.pad_token = tokenizer.eos_token"
+      ],
+      "metadata": {
+        "id": "WjxNWW6bvdgj"
+      },
+      "execution_count": 4,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "**Functions**"
+      ],
+      "metadata": {
+        "id": "1pg2U-B3wbIK"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "def generate_dataset(topic, number_of_data, inst1, resp1, inst2, resp2, inst3, resp3, max_new_tokens=2000):\n",
+        "    \"\"\"Ask the loaded chat model to generate an instruction/response dataset.\n",
+        "\n",
+        "    Args:\n",
+        "        topic: Subject the generated dataset should cover.\n",
+        "        number_of_data: Number of examples the model is asked to return.\n",
+        "        inst1, resp1, inst2, resp2, inst3, resp3: Three instruction/response\n",
+        "            pairs that are embedded in the system prompt as multi-shot examples.\n",
+        "        max_new_tokens: Upper bound on the number of tokens to generate.\n",
+        "\n",
+        "    Returns:\n",
+        "        The decoded model reply only; the prompt tokens are not echoed back.\n",
+        "    \"\"\"\n",
+        "    # Convert user inputs into multi-shot examples\n",
+        "    multi_shot_examples = [\n",
+        "        {\"instruction\": inst1, \"response\": resp1},\n",
+        "        {\"instruction\": inst2, \"response\": resp2},\n",
+        "        {\"instruction\": inst3, \"response\": resp3}\n",
+        "    ]\n",
+        "\n",
+        "    # System prompt\n",
+        "    system_prompt = f\"\"\"\n",
+        "    You are a helpful assistant whose main purpose is to generate datasets.\n",
+        "    Topic: {topic}\n",
+        "    Return the dataset in JSON format. Use examples with simple, fun, and easy-to-understand instructions for kids.\n",
+        "    Include the following examples: {multi_shot_examples}\n",
+        "    Return {number_of_data} examples each time.\n",
+        "    Do not repeat the provided examples.\n",
+        "    \"\"\"\n",
+        "\n",
+        "    # Example Messages\n",
+        "    messages = [\n",
+        "        {\"role\": \"system\", \"content\": system_prompt},\n",
+        "        {\"role\": \"user\", \"content\": f\"Please generate my dataset for {topic}\"}\n",
+        "    ]\n",
+        "\n",
+        "    # Tokenize Input\n",
+        "    # add_generation_prompt=True appends the assistant header so the model writes a reply\n",
+        "    # rather than continuing the user turn; return_dict=True also returns the attention mask.\n",
+        "    inputs = tokenizer.apply_chat_template(\n",
+        "        messages,\n",
+        "        add_generation_prompt=True,\n",
+        "        return_tensors=\"pt\",\n",
+        "        return_dict=True,\n",
+        "    ).to(model.device)  # follow the model's device instead of hard-coding \"cuda\"\n",
+        "    streamer = TextStreamer(tokenizer)\n",
+        "\n",
+        "    # Generate Output (the streamer also prints tokens to the cell output as they arrive)\n",
+        "    outputs = model.generate(\n",
+        "        **inputs,\n",
+        "        max_new_tokens=max_new_tokens,\n",
+        "        streamer=streamer,\n",
+        "        pad_token_id=tokenizer.eos_token_id,\n",
+        "    )\n",
+        "\n",
+        "    # Decode and Return only the newly generated tokens, skipping the prompt\n",
+        "    prompt_length = inputs[\"input_ids\"].shape[-1]\n",
+        "    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)\n",
+        "\n",
+        "\n",
+        "def gradio_interface(topic, number_of_data, inst1, resp1, inst2, resp2, inst3, resp3):\n",
+        "    \"\"\"Gradio callback: forward the form values unchanged to generate_dataset.\"\"\"\n",
+        "    form_values = (topic, number_of_data, inst1, resp1, inst2, resp2, inst3, resp3)\n",
+        "    return generate_dataset(*form_values)"
+      ],
+      "metadata": {
+        "id": "ZvljDKdji8iV"
+      },
+      "execution_count": 12,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "**Default Values**"
+      ],
+      "metadata": {
+        "id": "_WDZ5dvRwmng"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Values used to pre-fill the Gradio form\n",
+        "default_topic = \"Talking to a (5-8) years old and teaching them manners.\"\n",
+        "default_number_of_data = 10\n",
+        "\n",
+        "# Three instruction/response pairs shown as the initial multi-shot examples\n",
+        "default_multi_shot_examples = [\n",
+        "    dict(\n",
+        "        instruction=\"Why do I have to say please when I want something?\",\n",
+        "        response=\"Because it’s like magic! It shows you’re nice, and people want to help you more.\",\n",
+        "    ),\n",
+        "    dict(\n",
+        "        instruction=\"What should I say if someone gives me a toy?\",\n",
+        "        response=\"You say, 'Thank you!' because it makes them happy you liked it.\",\n",
+        "    ),\n",
+        "    dict(\n",
+        "        instruction=\"why should I listen to my parents?\",\n",
+        "        response=\"Because parents want the best for you and they love you the most.\",\n",
+        "    ),\n",
+        "]"
+      ],
+      "metadata": {
+        "id": "JAdfqYXnvEDE"
+      },
+      "execution_count": 13,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "**Init gradio**"
+      ],
+      "metadata": {
+        "id": "JwZtD032wuK8"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Form layout: topic, example count, then an Instruction/Response textbox pair\n",
+        "# for each of the first three default examples (in that order).\n",
+        "gr_interface = gr.Interface(\n",
+        "    fn=gradio_interface,\n",
+        "    inputs=[\n",
+        "        gr.Textbox(label=\"Topic\", value=default_topic, lines=2),\n",
+        "        gr.Number(label=\"Number of Examples\", value=default_number_of_data, precision=0),\n",
+        "        *[\n",
+        "            gr.Textbox(label=f\"{field} {position}\", value=example[field.lower()])\n",
+        "            for position, example in enumerate(default_multi_shot_examples[:3], start=1)\n",
+        "            for field in (\"Instruction\", \"Response\")\n",
+        "        ],\n",
+        "    ],\n",
+        "    outputs=gr.Textbox(label=\"Generated Dataset\"),\n",
+        ")"
+      ],
+      "metadata": {
+        "id": "xy2RP5T-vxXg"
+      },
+      "execution_count": 14,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "**Run the app**"
+      ],
+      "metadata": {
+        "id": "HZx-mm9Uw3Ph"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "gr_interface.launch()"
+      ],
+      "metadata": {
+        "id": "bfGs5ip8mndg"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "metadata": {
+        "id": "Cveqx392x7Mm"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}