333 lines
14 KiB
Plaintext
333 lines
14 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "yqlQTsxNdKrN"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"# Colab setup cell: install the notebook's dependencies.\n",
"# %pip (rather than !pip) installs into the environment of the running kernel.\n",
"%pip install -q requests torch bitsandbytes transformers sentencepiece accelerate openai httpx==0.27.2 gradio"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "eyfvQrLxdkGT"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import os\n",
|
|
"import requests\n",
|
|
"from IPython.display import Markdown, display, update_display\n",
|
|
"from openai import OpenAI\n",
|
|
"from google.colab import drive\n",
|
|
"from huggingface_hub import login\n",
|
|
"from google.colab import userdata\n",
|
|
"from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
|
|
"import torch\n",
|
|
"import gradio as gr\n",
|
|
"import re"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "WW-cSZk7dnp6"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"# Names of the models this notebook can generate with.\n",
"# Further models can be added here.\n",
"\n",
"LLAMA = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"  # checkpoint name handed to from_pretrained()\n",
"OPENAI_MODEL = \"gpt-4o-mini\"  # model name sent with the OpenAI chat completion request"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "XG7Iam6Rdw8F"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"# Credentials are read through google.colab's `userdata` helper instead of being\n",
"# written into the notebook.\n",
"hf_token = userdata.get('HF_TOKEN')\n",
"# Log in to the Hugging Face Hub with that token.\n",
"login(hf_token, add_to_git_credential=True)\n",
"openai_api_key = userdata.get('OPENAI_API_KEY')\n",
"# Client object used later for the OpenAI chat completion requests.\n",
"openai = OpenAI(api_key=openai_api_key)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "Ov7WSdx9dzSt"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"# JavaScript snippet: reloads the page with the `__theme=dark` query parameter\n",
"# unless that parameter is already set to 'dark'.\n",
"# NOTE(review): nothing in the visible notebook passes this string to Gradio;\n",
"# confirm whether it was meant to be handed to gr.Blocks (e.g. as its `js` argument).\n",
"force_dark_mode = \"\"\"\n",
"function refresh() {\n",
" const url = new URL(window.location);\n",
" if (url.searchParams.get('__theme') !== 'dark') {\n",
" url.searchParams.set('__theme', 'dark');\n",
" window.location.href = url.href;\n",
" }\n",
"}\n",
"\"\"\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "bEF8w_Mdd2Nb"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"# Tokenizer/model pairs that have already been loaded, keyed by checkpoint name.\n",
"# Loading the 4-bit Llama checkpoint is slow and memory-hungry, so it is done\n",
"# once and reused by later calls instead of being repeated on every request.\n",
"_LLAMA_CACHE = {}\n",
"\n",
"\n",
"def _build_messages(nature, shots, volume, language):\n",
"    \"\"\"Build the system and user chat messages for one generation request.\n",
"\n",
"    `shots` maps a short description of an example to the example sentence;\n",
"    every pair is appended to the few-shot block with labels in the target\n",
"    language. Any `language` other than \"English\", \"German\" or \"French\" falls\n",
"    back to a generic English prompt that ignores `nature`, `shots` and `volume`.\n",
"    \"\"\"\n",
"    examples = \"Instruction: 'Make a random sentence.'\\nAnswer: 'When I got home last night, I couldn't believe my eyes: All the pineapples had been removed from the pizza.'\"\n",
"    system_message = \"You are a random sentence generator. Generate 10 diverse English sentences.\"\n",
"    user_prompt = f\"Generate 10 random English sentences, like so:\\n{examples}\"\n",
"\n",
"    if language == \"English\":\n",
"\n",
"        for instruction, answer in shots.items():\n",
"            examples += f\"\\nExample instruction: '{instruction}'\\nExample answer: '{answer}'\\n\"\n",
"\n",
"        system_message = f\"You are a state-of-the art linguistic dataset compiler. You are given a 'Type' of sentence to create. \\\n",
"Within the bounds of that type, create {volume} diverse sentences with differing structures and lengths. Make the sentences plausible, \\\n",
"but be creative in filling them with random concrete information, names, and data. Here are some examples for how to go about that:\\n{examples}\\n\\\n",
"Just output one sentence per line. Do not comment or format yor output in any way, shape, or form.\"\n",
"\n",
"        user_prompt = f\"Generate {volume} English sentences of the following Type: {nature}. Just output one sentence per line. \\\n",
"Do not comment or format yor output in any way, shape, or form.\"\n",
"\n",
"    elif language == \"German\":\n",
"\n",
"        for instruction, answer in shots.items():\n",
"            examples += f\"\\nAnweisung: '{instruction}'\\nAntwort: '{answer}'\\n\"\n",
"\n",
"        system_message = f\"Du bist ein weltklasse Datensatz-Sammler für Sprachdaten. Du erhältst einen 'Typ' von Sätzen, die du erstellen sollst. \\\n",
"Im Rahmen dieses Typs, generiere {volume} untereinander verschiedene Sätze mit unterschiedlichen Satzlängen und -strukturen. Mache die Beispielsätze \\\n",
"plausibel, aber fülle sie kreativ mit willkürlichen Informationen, Namen, und Daten aller Art. Hier sind ein paar Beispiel, wie du vorgehen sollst:\\n{examples}\\n\\\n",
"Gib einfach einen Satz pro Zeile aus. Kommentiere oder formatiere deine Antwort in keinster Weise.\"\n",
"\n",
"        user_prompt = f\"Generiere {volume} deutsche Sätze des folgenden Typs: {nature}. Gib einfach einen Satz pro Zeile aus. \\\n",
"Kommentiere oder formatiere deine Antwort in keiner Weise.\"\n",
"\n",
"    elif language == \"French\":\n",
"\n",
"        for instruction, answer in shots.items():\n",
"            examples += f\"\\nConsigne: '{instruction}'\\nRéponse: '{answer}'\\n\"\n",
"\n",
"        system_message = f\"Tu es un outil linguistique de pointe, à savoir, un genérateur de données linguistiques. Tu seras assigné un 'Type' de phrases à créer. \\\n",
"Dans le cadre de ce type-là, crée {volume} phrases diverses, avec des structures et longueurs qui varient. Génère des phrases qui soient plausibles, \\\n",
"mais sois créatif, et sers-toi de données, noms, et informations aléatoires pour rendre les phrases plus naturelles. Voici quelques examples comment faire:\\n{examples}\\n\\\n",
"Sors une seule phrase par ligne. Ne formatte ni commente ta réponse en aucune manière que ce soit.\"\n",
"\n",
"        user_prompt = f\"S'il te plaît, crée {volume} phrases en français du Type suivant: {nature}. Sors une seule phrase par ligne. \\\n",
"Ne formatte ni commente ta réponse en aucune manière que ce soit.\"\n",
"\n",
"    return [\n",
"        {\"role\": \"system\", \"content\": system_message},\n",
"        {\"role\": \"user\", \"content\": user_prompt}\n",
"    ]\n",
"\n",
"\n",
"def _load_llama():\n",
"    \"\"\"Return the (tokenizer, model) pair for LLAMA, loading it on first use.\"\"\"\n",
"    if LLAMA not in _LLAMA_CACHE:\n",
"        quant_config = BitsAndBytesConfig(\n",
"            load_in_4bit=True,\n",
"            bnb_4bit_use_double_quant=True,\n",
"            bnb_4bit_compute_dtype=torch.bfloat16,\n",
"            bnb_4bit_quant_type=\"nf4\"\n",
"        )\n",
"        tokenizer = AutoTokenizer.from_pretrained(LLAMA)\n",
"        tokenizer.pad_token = tokenizer.eos_token\n",
"        llama = AutoModelForCausalLM.from_pretrained(LLAMA, device_map=\"auto\", quantization_config=quant_config)\n",
"        _LLAMA_CACHE[LLAMA] = (tokenizer, llama)\n",
"    return _LLAMA_CACHE[LLAMA]\n",
"\n",
"\n",
"def _generate_with_llama(messages, max_new_tokens):\n",
"    \"\"\"Run `messages` through the local Llama model and return only its reply text.\"\"\"\n",
"    tokenizer, llama = _load_llama()\n",
"\n",
"    # add_generation_prompt opens the assistant turn so the model answers the\n",
"    # request rather than continuing the user message. return_dict also yields\n",
"    # the attention mask, which generate() cannot infer when pad == eos.\n",
"    inputs = tokenizer.apply_chat_template(\n",
"        messages,\n",
"        add_generation_prompt=True,\n",
"        return_dict=True,\n",
"        return_tensors=\"pt\"\n",
"    ).to(llama.device)\n",
"    outputs = llama.generate(**inputs, max_new_tokens=max_new_tokens)\n",
"\n",
"    # Decode just the newly generated tokens. Slicing off the prompt is robust\n",
"    # where pattern-matching the special tokens is not: it still works when the\n",
"    # reply contains '<' or when generation stops without an end-of-turn token.\n",
"    prompt_length = inputs[\"input_ids\"].shape[-1]\n",
"    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()\n",
"\n",
"\n",
"def dataset_generator(model, nature, shots, volume, language, max_new_tokens=10000):\n",
"    \"\"\"Generate `volume` sentences of kind `nature` with the chosen backend.\n",
"\n",
"    Args:\n",
"        model: \"Llama\" (local 4-bit model) or \"OpenAI\" (chat completions API).\n",
"        nature: description of the type of sentence to produce.\n",
"        shots: dict mapping an example's description to the example sentence.\n",
"        volume: number of sentences to ask for.\n",
"        language: \"English\", \"German\" or \"French\"; anything else uses a generic\n",
"            English fallback prompt.\n",
"        max_new_tokens: generation cap for the Llama backend.\n",
"\n",
"    Returns:\n",
"        The model's reply as a single string (the prompt asks for one sentence\n",
"        per line).\n",
"\n",
"    Raises:\n",
"        ValueError: if `model` is neither \"Llama\" nor \"OpenAI\".\n",
"    \"\"\"\n",
"    messages = _build_messages(nature, shots, volume, language)\n",
"\n",
"    if model == \"Llama\":\n",
"        return _generate_with_llama(messages, max_new_tokens)\n",
"\n",
"    if model == \"OpenAI\":\n",
"        response = openai.chat.completions.create(model=OPENAI_MODEL, messages=messages)\n",
"        # `content` may be None (e.g. for a refused request); always return a string.\n",
"        return response.choices[0].message.content or \"\"\n",
"\n",
"    raise ValueError(f\"Unknown model {model!r}; expected 'Llama' or 'OpenAI'.\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "VRKdu0fEt8mg"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"# Text shown in the output area until sentences have been generated.\n",
"PLACEHOLDER_HTML = '<div style=\"color: #999; padding: 10px;\">Your sentences will be displayed here …</div>'\n",
"SAVE_LABEL = \"Save generated data\"\n",
"# Used when the user gives no example at all, or an example without a description.\n",
"DEFAULT_DESCRIPTION = \"A medium-long random sentence about anything\"\n",
"DEFAULT_SENTENCE = \"Paul, waking up out of his half-drunken haze, clearly couldn't tell left from right and ran right into the door.\"\n",
"# Folder of the mounted Google Drive; saved files must stay below it.\n",
"DRIVE_ROOT = \"/content/drive/MyDrive\"\n",
"\n",
"# Most recent model output, shared by the generate, save and clear callbacks.\n",
"# NOTE: as a module-level variable it is shared by every session of the app,\n",
"# including visitors of the public share link.\n",
"data = \"\"\n",
"\n",
"with gr.Blocks(\n",
"    css=\"\"\"\n",
"    .red-button {\n",
"        background-color: darkred !important;\n",
"        border-color: red !important;\n",
"    }\n",
"    .blue-button {\n",
"        background-color: darkblue !important;\n",
"        border-color: blue !important;\n",
"    }\n",
"    .green-button {\n",
"        background-color: green !important;\n",
"        border-color: green !important;\n",
"    }\n",
"    \"\"\"\n",
") as view:\n",
"    with gr.Row():\n",
"        title = gr.HTML(\"<h1><big>D</big>ataset Generator <small>PLUS</small></h1><h2>for English, German, and French</h2>\")\n",
"        subtitle = gr.HTML(\"<h3>Instructions:</h3><ol><li>Pick the language</li>\\\n",
"<li>Select a model</li><li>Indicate how many sentences you need</li>\\\n",
"<li>Describe the type of sentence you're looking for</li><li>Give up to three examples of the desired output sentence, and describe each of them briefly</li>\\\n",
"<li>Hit <q>Generate sentences</q></li>\\\n",
"<li>Save the output (.txt) to your Google Drive</li></ol>\")\n",
"    with gr.Row():\n",
"        language_choice = gr.Dropdown(choices=[\"English\", \"German\", \"French\"], label=\"Select language\", value=\"English\", interactive=True)\n",
"        model_choice = gr.Dropdown(choices=[\"Llama\", \"OpenAI\"], label=\"Select model\", value=\"Llama\", interactive=True)\n",
"        volume = gr.Textbox(label=\"Required number of sentences\", interactive=True)\n",
"    with gr.Row():\n",
"        typeInput = gr.Textbox(label=\"Short description of the kind of sentence you need\", interactive=True)\n",
"    with gr.Row():\n",
"        sentence_1 = gr.Textbox(label=\"Example sentence 1\", interactive=True)\n",
"        instruction_1 = gr.Textbox(label=\"Description\", interactive=True)\n",
"    with gr.Row():\n",
"        sentence_2 = gr.Textbox(label=\"Example sentence 2\", interactive=True)\n",
"        instruction_2 = gr.Textbox(label=\"Description\", interactive=True)\n",
"    with gr.Row():\n",
"        sentence_3 = gr.Textbox(label=\"Example sentence 3\", interactive=True)\n",
"        instruction_3 = gr.Textbox(label=\"Description\", interactive=True)\n",
"    with gr.Row():\n",
"        liveSentences = gr.Markdown(\n",
"            value=PLACEHOLDER_HTML,\n",
"            label=\"Generated sentences:\",\n",
"            min_height=60,\n",
"            max_height=200\n",
"        )\n",
"    with gr.Row():\n",
"        generate = gr.Button(value=\"Generate sentences\", elem_classes=\"blue-button\")\n",
"    with gr.Row():\n",
"        clear = gr.Button(value=\"Clear everything\", elem_classes=\"red-button\")\n",
"    with gr.Row():\n",
"        outputPath = gr.Textbox(label=\"Specify the desired name and location on your Google Drive for the sentences (plain text) to be saved\", interactive=True)\n",
"    with gr.Row():\n",
"        save = gr.Button(value=SAVE_LABEL, elem_classes=\"blue-button\")\n",
"\n",
"    def cleanText(value):\n",
"        \"\"\"Return `value` as a stripped string; None becomes the empty string.\"\"\"\n",
"        return \"\" if value is None else str(value).strip()\n",
"\n",
"    def generateSentences(typeInput, s1, i1, s2, i2, s3, i3, volume, language, model):\n",
"        \"\"\"Collect the form input, call dataset_generator and remember its output.\n",
"\n",
"        Empty fields are treated as missing, because Gradio textboxes deliver an\n",
"        empty string rather than None:\n",
"        - `volume` that is not a positive whole number falls back to 10;\n",
"        - an empty `typeInput` falls back to a generic sentence type;\n",
"        - an example without description gets DEFAULT_DESCRIPTION;\n",
"        - if no example is given at all, the built-in default example is used.\n",
"\n",
"        Returns the generated text, which is also stored in the global `data`.\n",
"        \"\"\"\n",
"        global data\n",
"\n",
"        requested = cleanText(volume)\n",
"        amount = int(requested) if re.fullmatch(\"[0-9]+\", requested) else 10\n",
"        if amount < 1:\n",
"            amount = 10\n",
"\n",
"        nature = cleanText(typeInput) or \"Random sentences of mixed nature\"\n",
"\n",
"        shots = {}\n",
"        for sentence, description in ((s1, i1), (s2, i2), (s3, i3)):\n",
"            sentence = cleanText(sentence)\n",
"            if not sentence:\n",
"                continue\n",
"            description = cleanText(description) or DEFAULT_DESCRIPTION\n",
"            # `shots` is keyed by description, so two examples with the same\n",
"            # description would overwrite each other; number the later ones.\n",
"            key, counter = description, 2\n",
"            while key in shots:\n",
"                key = f\"{description} ({counter})\"\n",
"                counter += 1\n",
"            shots[key] = sentence\n",
"\n",
"        if not shots:\n",
"            shots[DEFAULT_DESCRIPTION] = DEFAULT_SENTENCE\n",
"\n",
"        sentences = dataset_generator(model, nature, shots, amount, language)\n",
"        data = sentences\n",
"\n",
"        return sentences\n",
"\n",
"    def saveData(path):\n",
"        \"\"\"Write the last generated text to `path` inside the user's Google Drive.\n",
"\n",
"        `path` is interpreted relative to DRIVE_ROOT. Because the app is served\n",
"        through a public share link, the path is untrusted input: it is rejected\n",
"        when empty, when it names a folder, or when it would leave DRIVE_ROOT.\n",
"\n",
"        Raises gr.Error (shown in the UI) for an invalid path or when there is\n",
"        nothing to save.\n",
"        \"\"\"\n",
"        if not data:\n",
"            raise gr.Error(\"There is nothing to save yet. Generate some sentences first.\")\n",
"\n",
"        # A leading '/' used to be absorbed by the string concatenation; keep\n",
"        # accepting it by treating the path as relative to the Drive folder.\n",
"        relative = cleanText(path).lstrip(\"/\")\n",
"        target = os.path.normpath(os.path.join(DRIVE_ROOT, relative))\n",
"        inside_drive = os.path.commonpath([DRIVE_ROOT, target]) == DRIVE_ROOT\n",
"        if not relative or target == DRIVE_ROOT or not inside_drive:\n",
"            raise gr.Error(\"Please enter a file name (optionally with folders) inside your Google Drive.\")\n",
"\n",
"        drive.mount(\"/content/drive\")\n",
"\n",
"        if os.path.isdir(target):\n",
"            raise gr.Error(\"The given path is an existing folder. Please add a file name.\")\n",
"\n",
"        os.makedirs(os.path.dirname(target), exist_ok=True)\n",
"\n",
"        with open(target, \"w\", encoding=\"utf-8\") as f:\n",
"            f.write(data)\n",
"\n",
"    def resetSaveButton():\n",
"        \"\"\"Put the save button back into its initial (blue, unsaved) state.\"\"\"\n",
"        return gr.update(value=SAVE_LABEL, elem_classes=\"blue-button\")\n",
"\n",
"    def clearEverything():\n",
"        \"\"\"Reset every input, the output area, the save button and the stored text.\"\"\"\n",
"        global data\n",
"        # Forget the stored text too, otherwise it could still be saved after clearing.\n",
"        data = \"\"\n",
"        emptied_inputs = [gr.update(value=\"\") for _ in range(8)]\n",
"        return emptied_inputs + [\n",
"            gr.update(value=PLACEHOLDER_HTML),\n",
"            gr.update(value=\"\"),\n",
"            resetSaveButton()\n",
"        ]\n",
"\n",
"    # A fresh generation has not been saved yet, so the button is reset afterwards.\n",
"    generate.click(\n",
"        generateSentences,\n",
"        inputs=[typeInput, sentence_1, instruction_1, sentence_2, instruction_2, sentence_3, instruction_3, volume, language_choice, model_choice],\n",
"        outputs=liveSentences\n",
"    ).success(resetSaveButton, None, save)\n",
"    clear.click(\n",
"        clearEverything,\n",
"        None,\n",
"        [volume, typeInput, sentence_1, instruction_1, sentence_2, instruction_2,\n",
"         sentence_3, instruction_3, liveSentences, outputPath, save],\n",
"        queue=False\n",
"    )\n",
"    # .success (not .then): only report success when saveData did not raise.\n",
"    save.click(saveData, inputs=outputPath, outputs=None).success(lambda: gr.update(value=\"Your data has been saved\", elem_classes=\"green-button\"), None, save)\n",
"\n",
"view.launch(share=True)  # pass debug=True to see callback errors in the cell output"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"accelerator": "GPU",
|
|
"colab": {
|
|
"authorship_tag": "ABX9TyPxJzufoQPtui+nhl1J1xiR",
|
|
"gpuType": "T4",
|
|
"provenance": []
|
|
},
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.11.11"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|