Add week 3 exercise: Educational Q&A Dataset Generator

This commit is contained in:
Philip Omoigui
2025-10-31 15:40:47 +01:00
parent 80ae1df886
commit 6e2e5064b6


@@ -0,0 +1,529 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pip install -q transformers accelerate bitsandbytes torch gradio\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import json\n",
"import pandas as pd\n",
"import gradio as gr\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig\n",
"from huggingface_hub import login\n",
"from google.colab import userdata\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Authenticate with HuggingFace\n",
"hf_token = userdata.get('HF_TOKEN')\n",
"login(hf_token, add_to_git_credential=True)\n",
"print(\"Successfully authenticated with HuggingFace\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Model configuration\n",
"MODEL_NAME = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n",
"\n",
"# 4-bit quantization for efficiency on T4 GPU\n",
"quant_config = BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" bnb_4bit_use_double_quant=True,\n",
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
" bnb_4bit_quant_type=\"nf4\"\n",
")\n",
"\n",
"# Load tokenizer and model\n",
"tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
" MODEL_NAME,\n",
" device_map=\"auto\",\n",
" quantization_config=quant_config\n",
")\n",
"\n",
"print(\"Model loaded successfully!\")\n"
]
},
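{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check (a minimal sketch, not part of the exercise itself):\n",
"# render a tiny conversation through the tokenizer's chat template to confirm\n",
"# the template resolves before committing to a long generation run.\n",
"_check_messages = [\n",
"    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
"    {\"role\": \"user\", \"content\": \"Reply with one word.\"}\n",
"]\n",
"print(tokenizer.apply_chat_template(_check_messages, tokenize=False, add_generation_prompt=True))\n"
]
},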
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Topic definitions based on course content\n",
"TOPICS = {\n",
" \"Week 1: LLM APIs & Prompting\": {\n",
" \"concepts\": [\n",
" \"OpenAI API usage and parameters\",\n",
" \"Prompt engineering techniques\",\n",
" \"Temperature and top_p parameters\",\n",
" \"System vs user messages\",\n",
" \"JSON mode and structured outputs\",\n",
" \"Token counting and pricing\",\n",
" \"Chat completions vs completions\",\n",
" \"Few-shot learning\"\n",
" ]\n",
" },\n",
" \"Week 2: Function Calling & Agents\": {\n",
" \"concepts\": [\n",
" \"Function calling syntax and format\",\n",
" \"Tool definitions and schemas\",\n",
" \"Parallel function calling\",\n",
" \"Function calling best practices\",\n",
" \"Agent patterns and workflows\",\n",
" \"Structured outputs with Pydantic\",\n",
" \"Error handling in function calls\"\n",
" ]\n",
" },\n",
" \"Week 3: Transformers & Models\": {\n",
" \"concepts\": [\n",
" \"Tokenizers and tokenization strategies\",\n",
" \"BPE, WordPiece, and SentencePiece\",\n",
" \"HuggingFace pipelines\",\n",
" \"AutoModel and AutoTokenizer\",\n",
" \"Model quantization (4-bit, 8-bit)\",\n",
" \"Speech-to-text with Whisper\",\n",
" \"Local vs cloud model inference\",\n",
" \"Model architectures (encoder, decoder, encoder-decoder)\"\n",
" ]\n",
" }\n",
"}\n",
"\n",
"# Difficulty level descriptions\n",
"DIFFICULTY_LEVELS = {\n",
" \"Beginner\": \"Basic understanding of concepts and definitions\",\n",
" \"Intermediate\": \"Application of concepts with some technical depth\",\n",
" \"Advanced\": \"Edge cases, optimization, and deep technical understanding\"\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def generate_questions(topic, difficulty, num_questions):\n",
" \"\"\"\n",
" Generate educational Q&A questions using the LLM.\n",
" \n",
" Args:\n",
" topic: Topic category to generate questions for\n",
" difficulty: Difficulty level (Beginner/Intermediate/Advanced)\n",
" num_questions: Number of questions to generate\n",
" \n",
" Returns:\n",
" List of dictionaries containing questions and answers\n",
" \"\"\"\n",
" \n",
" # Get topic details\n",
" topic_info = TOPICS[topic]\n",
" concepts = \", \".join(topic_info[\"concepts\"])\n",
" \n",
" # Build the prompt using Llama's chat format\n",
" system_message = \"\"\"You are an expert educator creating high-quality multiple-choice questions for an LLM Engineering course.\n",
"\n",
"Format each question EXACTLY as shown below:\n",
"\n",
"QUESTION: [question text]\n",
"A) [option A]\n",
"B) [option B]\n",
"C) [option C]\n",
"D) [option D]\n",
"ANSWER: [correct letter]\n",
"EXPLANATION: [brief explanation]\n",
"---\"\"\"\n",
"\n",
" user_prompt = f\"\"\"Create {num_questions} multiple-choice questions about: {topic}\n",
"\n",
"Difficulty Level: {difficulty}\n",
"\n",
"Cover these concepts: {concepts}\n",
"\n",
"Requirements:\n",
"- Questions should be practical and relevant to real LLM engineering\n",
"- All 4 options should be plausible\n",
"- Explanations should be clear and educational\n",
"- Vary the correct answer position\n",
"\n",
"Generate {num_questions} questions now:\"\"\"\n",
"\n",
" # Prepare messages for Llama\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": system_message},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ]\n",
" \n",
" # Tokenize using Llama's chat template\n",
" input_ids = tokenizer.apply_chat_template(\n",
" messages,\n",
" return_tensors=\"pt\",\n",
" add_generation_prompt=True\n",
" ).to(model.device)\n",
" \n",
" attention_mask = torch.ones_like(input_ids).to(model.device)\n",
" \n",
" # Generate\n",
" print(f\"Generating {num_questions} questions...\")\n",
" max_tokens = min(2500, num_questions * 200)\n",
" \n",
" with torch.no_grad():\n",
" outputs = model.generate(\n",
" input_ids,\n",
" attention_mask=attention_mask,\n",
" max_new_tokens=max_tokens,\n",
" temperature=0.7,\n",
" do_sample=True,\n",
" top_p=0.9,\n",
" pad_token_id=tokenizer.eos_token_id\n",
" )\n",
" \n",
" # Decode\n",
" response = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
" \n",
" # Extract just the assistant's response\n",
" if \"assistant\" in response:\n",
" response = response.split(\"assistant\")[-1].strip()\n",
" \n",
" # Debug: print what we got\n",
" print(\"Generated text preview:\")\n",
" print(response[:500] + \"...\" if len(response) > 500 else response)\n",
" print()\n",
" \n",
" # Parse the questions\n",
" questions = parse_questions(response, topic, difficulty)\n",
" \n",
" print(f\"Successfully generated {len(questions)} questions\")\n",
" return questions\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def parse_questions(text, topic, difficulty):\n",
" \"\"\"\n",
" Parse the generated text into structured question objects.\n",
" More robust parsing that handles various formats.\n",
" \"\"\"\n",
" questions = []\n",
" \n",
" # Split by \"QUESTION:\" to get individual question blocks\n",
" blocks = text.split(\"QUESTION:\")\n",
" \n",
" for i, block in enumerate(blocks):\n",
" if not block.strip() or i == 0 and len(block) < 20:\n",
" continue\n",
" \n",
" try:\n",
" # Extract components\n",
" question_text = \"\"\n",
" options = {}\n",
" answer = \"\"\n",
" explanation = \"\"\n",
" \n",
" lines = block.strip().split(\"\\n\")\n",
" \n",
" for line in lines:\n",
" line = line.strip()\n",
" if not line or line == \"---\":\n",
" continue\n",
" \n",
" # Handle question text (first non-empty line before options)\n",
" if not question_text and not any(line.startswith(x) for x in [\"A)\", \"B)\", \"C)\", \"D)\", \"ANSWER:\", \"EXPLANATION:\", \"Answer:\", \"Explanation:\"]):\n",
" question_text = line\n",
" \n",
" # Handle options - be flexible with formatting\n",
" elif line.startswith(\"A)\") or line.startswith(\"A.\"):\n",
" options[\"A\"] = line[2:].strip()\n",
" elif line.startswith(\"B)\") or line.startswith(\"B.\"):\n",
" options[\"B\"] = line[2:].strip()\n",
" elif line.startswith(\"C)\") or line.startswith(\"C.\"):\n",
" options[\"C\"] = line[2:].strip()\n",
" elif line.startswith(\"D)\") or line.startswith(\"D.\"):\n",
" options[\"D\"] = line[2:].strip()\n",
" \n",
" # Handle answer\n",
" elif line.upper().startswith(\"ANSWER:\"):\n",
" answer = line.split(\":\", 1)[1].strip()\n",
" \n",
" # Handle explanation\n",
" elif line.upper().startswith(\"EXPLANATION:\"):\n",
" explanation = line.split(\":\", 1)[1].strip()\n",
" elif explanation and len(explanation) < 200:\n",
" # Continue multi-line explanation (up to reasonable length)\n",
" explanation += \" \" + line\n",
" \n",
" # Extract just the letter from answer\n",
" if answer:\n",
" answer_letter = \"\"\n",
" for char in answer.upper():\n",
" if char in [\"A\", \"B\", \"C\", \"D\"]:\n",
" answer_letter = char\n",
" break\n",
" answer = answer_letter\n",
" \n",
" # Only add if we have minimum required components\n",
" if question_text and len(options) >= 3 and answer:\n",
" # Fill missing option if needed\n",
" if len(options) == 3:\n",
" for letter in [\"A\", \"B\", \"C\", \"D\"]:\n",
" if letter not in options:\n",
" options[letter] = \"Not applicable\"\n",
" break\n",
" \n",
" # Use placeholder explanation if none provided\n",
" if not explanation:\n",
" explanation = f\"The correct answer is {answer}.\"\n",
" \n",
" questions.append({\n",
" \"id\": len(questions) + 1,\n",
" \"topic\": topic,\n",
" \"difficulty\": difficulty,\n",
" \"question\": question_text,\n",
" \"options\": options,\n",
" \"correct_answer\": answer,\n",
" \"explanation\": explanation.strip()\n",
" })\n",
" print(f\"Parsed question {len(questions)}\")\n",
" else:\n",
" print(f\"Skipped incomplete block: Q={bool(question_text)}, Opts={len(options)}, Ans={bool(answer)}\")\n",
" \n",
" except Exception as e:\n",
" print(f\"Error parsing block {i+1}: {str(e)}\")\n",
" continue\n",
" \n",
" return questions\n"
]
},
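{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Offline smoke test for the parser (the sample text is hand-written, not model\n",
"# output), so the parsing logic can be checked without touching the GPU.\n",
"_sample = \"\"\"QUESTION: What does the temperature parameter control during sampling?\n",
"A) The GPU temperature\n",
"B) The randomness of token selection\n",
"C) The context window size\n",
"D) The number of attention layers\n",
"ANSWER: B\n",
"EXPLANATION: Higher temperature flattens the next-token distribution, increasing randomness.\n",
"---\"\"\"\n",
"\n",
"_parsed = parse_questions(_sample, \"Week 1: LLM APIs & Prompting\", \"Beginner\")\n",
"assert len(_parsed) == 1 and _parsed[0][\"correct_answer\"] == \"B\"\n",
"print(json.dumps(_parsed, indent=2))\n"
]
},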
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def format_questions_display(questions):\n",
" \"\"\"Format questions for display in Gradio.\"\"\"\n",
" if not questions:\n",
" return \"No questions generated.\"\n",
" \n",
" output = f\"# Generated Questions\\n\\n\"\n",
" output += f\"**Total Questions:** {len(questions)}\\n\\n\"\n",
" output += \"---\\n\\n\"\n",
" \n",
" for q in questions:\n",
" output += f\"## Question {q['id']}\\n\\n\"\n",
" output += f\"**Topic:** {q['topic']} \\n\"\n",
" output += f\"**Difficulty:** {q['difficulty']} \\n\\n\"\n",
" output += f\"**Q:** {q['question']}\\n\\n\"\n",
" \n",
" for letter in ['A', 'B', 'C', 'D']:\n",
" prefix = \"✅ \" if letter == q['correct_answer'] else \"\"\n",
" output += f\"{prefix}{letter}) {q['options'][letter]}\\n\\n\"\n",
" \n",
" output += f\"**Answer:** {q['correct_answer']}\\n\\n\"\n",
" output += f\"**Explanation:** {q['explanation']}\\n\\n\"\n",
" output += \"---\\n\\n\"\n",
" \n",
" return output\n",
"\n",
"\n",
"def export_to_json(questions):\n",
" \"\"\"Export questions to JSON file.\"\"\"\n",
" if not questions:\n",
" return None\n",
" \n",
" filename = \"educational_qa_dataset.json\"\n",
" with open(filename, 'w') as f:\n",
" json.dump(questions, f, indent=2)\n",
" \n",
" return filename\n",
"\n",
"\n",
"def export_to_csv(questions):\n",
" \"\"\"Export questions to CSV file.\"\"\"\n",
" if not questions:\n",
" return None\n",
" \n",
" # Flatten the data for CSV\n",
" flattened = []\n",
" for q in questions:\n",
" flattened.append({\n",
" 'id': q['id'],\n",
" 'topic': q['topic'],\n",
" 'difficulty': q['difficulty'],\n",
" 'question': q['question'],\n",
" 'option_A': q['options']['A'],\n",
" 'option_B': q['options']['B'],\n",
" 'option_C': q['options']['C'],\n",
" 'option_D': q['options']['D'],\n",
" 'correct_answer': q['correct_answer'],\n",
" 'explanation': q['explanation']\n",
" })\n",
" \n",
" filename = \"educational_qa_dataset.csv\"\n",
" df = pd.DataFrame(flattened)\n",
" df.to_csv(filename, index=False)\n",
" \n",
" return filename\n"
]
},
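{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Direct (non-UI) usage example, assuming the model and helper cells above have run:\n",
"# generate a small batch and write both export formats to disk.\n",
"_batch = generate_questions(\"Week 3: Transformers & Models\", \"Beginner\", 5)\n",
"if _batch:\n",
"    print(\"Saved:\", export_to_json(_batch), export_to_csv(_batch))\n"
]
},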
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def gradio_generate(topic, difficulty, num_questions):\n",
" \"\"\"\n",
" Wrapper function for Gradio interface.\n",
" Generates questions and returns formatted output plus download files.\n",
" \"\"\"\n",
" try:\n",
" # Generate questions\n",
" questions = generate_questions(topic, difficulty, num_questions)\n",
" \n",
" if not questions:\n",
" return \"Failed to generate questions. Please try again.\", None, None\n",
" \n",
" # Format for display\n",
" display_text = format_questions_display(questions)\n",
" \n",
" # Export files\n",
" json_file = export_to_json(questions)\n",
" csv_file = export_to_csv(questions)\n",
" \n",
" return display_text, json_file, csv_file\n",
" \n",
" except Exception as e:\n",
" return f\"Error: {str(e)}\", None, None\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Build the Gradio UI\n",
"with gr.Blocks(title=\"Educational Q&A Generator\", theme=gr.themes.Soft()) as demo:\n",
" \n",
" gr.Markdown(\"\"\"\n",
" # 📚 Educational Q&A Dataset Generator\n",
" Generate high-quality multiple-choice questions for LLM Engineering topics\n",
" \"\"\")\n",
" \n",
" with gr.Row():\n",
" with gr.Column(scale=1):\n",
" gr.Markdown(\"### ⚙️ Configuration\")\n",
" \n",
" topic_dropdown = gr.Dropdown(\n",
" choices=list(TOPICS.keys()),\n",
" value=\"Week 3: Transformers & Models\",\n",
" label=\"Select Topic\",\n",
" info=\"Choose which week's content to generate questions for\"\n",
" )\n",
" \n",
" difficulty_dropdown = gr.Dropdown(\n",
" choices=[\"Beginner\", \"Intermediate\", \"Advanced\"],\n",
" value=\"Intermediate\",\n",
" label=\"Difficulty Level\",\n",
" info=\"Select the difficulty of the questions\"\n",
" )\n",
" \n",
" num_questions_slider = gr.Slider(\n",
" minimum=5,\n",
" maximum=20,\n",
" value=10,\n",
" step=5,\n",
" label=\"Number of Questions\",\n",
" info=\"How many questions to generate (5-20)\"\n",
" )\n",
" \n",
" generate_btn = gr.Button(\"🚀 Generate Questions\", variant=\"primary\", size=\"lg\")\n",
" \n",
" gr.Markdown(\"\"\"\n",
" ---\n",
" ### 📥 Download Files\n",
" After generation, download your dataset in JSON or CSV format\n",
" \"\"\")\n",
" \n",
" with gr.Row():\n",
" json_download = gr.File(label=\"JSON File\", interactive=False)\n",
" csv_download = gr.File(label=\"CSV File\", interactive=False)\n",
" \n",
" with gr.Column(scale=2):\n",
" gr.Markdown(\"### 📝 Generated Questions\")\n",
" \n",
" output_display = gr.Markdown(\n",
" value=\"Click 'Generate Questions' to start...\",\n",
" label=\"Questions\"\n",
" )\n",
" \n",
" # Connect the generate button\n",
" generate_btn.click(\n",
" fn=gradio_generate,\n",
" inputs=[topic_dropdown, difficulty_dropdown, num_questions_slider],\n",
" outputs=[output_display, json_download, csv_download]\n",
" )\n",
" \n",
" gr.Markdown(\"\"\"\n",
" ---\n",
" ### 💡 Tips:\n",
" - Start with 5 questions to test the system\n",
" - Beginner questions cover definitions and basic concepts\n",
" - Intermediate questions test application and understanding\n",
" - Advanced questions explore edge cases and optimization\n",
" - Generation takes ~30-60 seconds depending on number of questions\n",
" \n",
" ### 📊 Output Formats:\n",
" - **JSON**: Structured data for programmatic use\n",
" - **CSV**: Easy to view in spreadsheets or import into other tools\n",
" \"\"\")\n",
"\n",
"print(\"✅ Gradio interface configured!\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Launch the Gradio app\n",
"demo.launch(share=True, debug=True)\n"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}