Add week 3 exercise: Educational Q&A Dataset Generator

This commit is contained in:
Philip Omoigui
2025-10-31 15:40:47 +01:00
parent 80ae1df886
commit 6e2e5064b6


@@ -0,0 +1,529 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pip install -q transformers accelerate bitsandbytes torch gradio\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import json\n",
"import pandas as pd\n",
"import gradio as gr\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig\n",
"from huggingface_hub import login\n",
"from google.colab import userdata\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Authenticate with HuggingFace\n",
"hf_token = userdata.get('HF_TOKEN')\n",
"login(hf_token, add_to_git_credential=True)\n",
"print(\"Successfully authenticated with HuggingFace\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Model configuration\n",
"MODEL_NAME = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n",
"\n",
"# 4-bit quantization for efficiency on T4 GPU\n",
"quant_config = BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" bnb_4bit_use_double_quant=True,\n",
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
" bnb_4bit_quant_type=\"nf4\"\n",
")\n",
"\n",
"# Load tokenizer and model\n",
"tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
" MODEL_NAME,\n",
" device_map=\"auto\",\n",
" quantization_config=quant_config\n",
")\n",
"\n",
"print(\"Model loaded successfully!\")\n"
]
},
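{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check (a minimal sketch, not part of the exercise itself):\n",
"# render a tiny conversation through the tokenizer's chat template to confirm\n",
"# the template resolves before committing to a long generation run.\n",
"_check_messages = [\n",
"    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
"    {\"role\": \"user\", \"content\": \"Reply with one word.\"}\n",
"]\n",
"print(tokenizer.apply_chat_template(_check_messages, tokenize=False, add_generation_prompt=True))\n"
]
},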
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Topic definitions based on course content\n",
"TOPICS = {\n",
" \"Week 1: LLM APIs & Prompting\": {\n",
" \"concepts\": [\n",
" \"OpenAI API usage and parameters\",\n",
" \"Prompt engineering techniques\",\n",
" \"Temperature and top_p parameters\",\n",
" \"System vs user messages\",\n",
" \"JSON mode and structured outputs\",\n",
" \"Token counting and pricing\",\n",
" \"Chat completions vs completions\",\n",
" \"Few-shot learning\"\n",
" ]\n",
" },\n",
" \"Week 2: Function Calling & Agents\": {\n",
" \"concepts\": [\n",
" \"Function calling syntax and format\",\n",
" \"Tool definitions and schemas\",\n",
" \"Parallel function calling\",\n",
" \"Function calling best practices\",\n",
" \"Agent patterns and workflows\",\n",
" \"Structured outputs with Pydantic\",\n",
" \"Error handling in function calls\"\n",
" ]\n",
" },\n",
" \"Week 3: Transformers & Models\": {\n",
" \"concepts\": [\n",
" \"Tokenizers and tokenization strategies\",\n",
" \"BPE, WordPiece, and SentencePiece\",\n",
" \"HuggingFace pipelines\",\n",
" \"AutoModel and AutoTokenizer\",\n",
" \"Model quantization (4-bit, 8-bit)\",\n",
" \"Speech-to-text with Whisper\",\n",
" \"Local vs cloud model inference\",\n",
" \"Model architectures (encoder, decoder, encoder-decoder)\"\n",
" ]\n",
" }\n",
"}\n",
"\n",
"# Difficulty level descriptions\n",
"DIFFICULTY_LEVELS = {\n",
" \"Beginner\": \"Basic understanding of concepts and definitions\",\n",
" \"Intermediate\": \"Application of concepts with some technical depth\",\n",
" \"Advanced\": \"Edge cases, optimization, and deep technical understanding\"\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def generate_questions(topic, difficulty, num_questions):\n",
" \"\"\"\n",
" Generate educational Q&A questions using the LLM.\n",
" \n",
" Args:\n",
" topic: Topic category to generate questions for\n",
" difficulty: Difficulty level (Beginner/Intermediate/Advanced)\n",
" num_questions: Number of questions to generate\n",
" \n",
" Returns:\n",
" List of dictionaries containing questions and answers\n",
" \"\"\"\n",
" \n",
" # Get topic details\n",
" topic_info = TOPICS[topic]\n",
" concepts = \", \".join(topic_info[\"concepts\"])\n",
" \n",
" # Build the prompt using Llama's chat format\n",
" system_message = \"\"\"You are an expert educator creating high-quality multiple-choice questions for an LLM Engineering course.\n",
"\n",
"Format each question EXACTLY as shown below:\n",
"\n",
"QUESTION: [question text]\n",
"A) [option A]\n",
"B) [option B]\n",
"C) [option C]\n",
"D) [option D]\n",
"ANSWER: [correct letter]\n",
"EXPLANATION: [brief explanation]\n",
"---\"\"\"\n",
"\n",
" user_prompt = f\"\"\"Create {num_questions} multiple-choice questions about: {topic}\n",
"\n",
"Difficulty Level: {difficulty}\n",
"\n",
"Cover these concepts: {concepts}\n",
"\n",
"Requirements:\n",
"- Questions should be practical and relevant to real LLM engineering\n",
"- All 4 options should be plausible\n",
"- Explanations should be clear and educational\n",
"- Vary the correct answer position\n",
"\n",
"Generate {num_questions} questions now:\"\"\"\n",
"\n",
" # Prepare messages for Llama\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": system_message},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ]\n",
" \n",
" # Tokenize using Llama's chat template\n",
" input_ids = tokenizer.apply_chat_template(\n",
" messages,\n",
" return_tensors=\"pt\",\n",
" add_generation_prompt=True\n",
" ).to(model.device)\n",
" \n",
" attention_mask = torch.ones_like(input_ids).to(model.device)\n",
" \n",
" # Generate\n",
" print(f\"Generating {num_questions} questions...\")\n",
" max_tokens = min(2500, num_questions * 200)\n",
" \n",
" with torch.no_grad():\n",
" outputs = model.generate(\n",
" input_ids,\n",
" attention_mask=attention_mask,\n",
" max_new_tokens=max_tokens,\n",
" temperature=0.7,\n",
" do_sample=True,\n",
" top_p=0.9,\n",
" pad_token_id=tokenizer.eos_token_id\n",
" )\n",
" \n",
" # Decode\n",
" response = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
" \n",
" # Extract just the assistant's response\n",
" if \"assistant\" in response:\n",
" response = response.split(\"assistant\")[-1].strip()\n",
" \n",
" # Debug: print what we got\n",
" print(\"Generated text preview:\")\n",
" print(response[:500] + \"...\" if len(response) > 500 else response)\n",
" print()\n",
" \n",
" # Parse the questions\n",
" questions = parse_questions(response, topic, difficulty)\n",
" \n",
" print(f\"Successfully generated {len(questions)} questions\")\n",
" return questions\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def parse_questions(text, topic, difficulty):\n",
" \"\"\"\n",
" Parse the generated text into structured question objects.\n",
" More robust parsing that handles various formats.\n",
" \"\"\"\n",
" questions = []\n",
" \n",
" # Split by \"QUESTION:\" to get individual question blocks\n",
" blocks = text.split(\"QUESTION:\")\n",
" \n",
" for i, block in enumerate(blocks):\n",
" if not block.strip() or i == 0 and len(block) < 20:\n",
" continue\n",
" \n",
" try:\n",
" # Extract components\n",
" question_text = \"\"\n",
" options = {}\n",
" answer = \"\"\n",
" explanation = \"\"\n",
" \n",
" lines = block.strip().split(\"\\n\")\n",
" \n",
" for line in lines:\n",
" line = line.strip()\n",
" if not line or line == \"---\":\n",
" continue\n",
" \n",
" # Handle question text (first non-empty line before options)\n",
" if not question_text and not any(line.startswith(x) for x in [\"A)\", \"B)\", \"C)\", \"D)\", \"ANSWER:\", \"EXPLANATION:\", \"Answer:\", \"Explanation:\"]):\n",
" question_text = line\n",
" \n",
" # Handle options - be flexible with formatting\n",
" elif line.startswith(\"A)\") or line.startswith(\"A.\"):\n",
" options[\"A\"] = line[2:].strip()\n",
" elif line.startswith(\"B)\") or line.startswith(\"B.\"):\n",
" options[\"B\"] = line[2:].strip()\n",
" elif line.startswith(\"C)\") or line.startswith(\"C.\"):\n",
" options[\"C\"] = line[2:].strip()\n",
" elif line.startswith(\"D)\") or line.startswith(\"D.\"):\n",
" options[\"D\"] = line[2:].strip()\n",
" \n",
" # Handle answer\n",
" elif line.upper().startswith(\"ANSWER:\"):\n",
" answer = line.split(\":\", 1)[1].strip()\n",
" \n",
" # Handle explanation\n",
" elif line.upper().startswith(\"EXPLANATION:\"):\n",
" explanation = line.split(\":\", 1)[1].strip()\n",
" elif explanation and len(explanation) < 200:\n",
" # Continue multi-line explanation (up to reasonable length)\n",
" explanation += \" \" + line\n",
" \n",
" # Extract just the letter from answer\n",
" if answer:\n",
" answer_letter = \"\"\n",
" for char in answer.upper():\n",
" if char in [\"A\", \"B\", \"C\", \"D\"]:\n",
" answer_letter = char\n",
" break\n",
" answer = answer_letter\n",
" \n",
" # Only add if we have minimum required components\n",
" if question_text and len(options) >= 3 and answer:\n",
" # Fill missing option if needed\n",
" if len(options) == 3:\n",
" for letter in [\"A\", \"B\", \"C\", \"D\"]:\n",
" if letter not in options:\n",
" options[letter] = \"Not applicable\"\n",
" break\n",
" \n",
" # Use placeholder explanation if none provided\n",
" if not explanation:\n",
" explanation = f\"The correct answer is {answer}.\"\n",
" \n",
" questions.append({\n",
" \"id\": len(questions) + 1,\n",
" \"topic\": topic,\n",
" \"difficulty\": difficulty,\n",
" \"question\": question_text,\n",
" \"options\": options,\n",
" \"correct_answer\": answer,\n",
" \"explanation\": explanation.strip()\n",
" })\n",
" print(f\"Parsed question {len(questions)}\")\n",
" else:\n",
" print(f\"Skipped incomplete block: Q={bool(question_text)}, Opts={len(options)}, Ans={bool(answer)}\")\n",
" \n",
" except Exception as e:\n",
" print(f\"Error parsing block {i+1}: {str(e)}\")\n",
" continue\n",
" \n",
" return questions\n"
]
},
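{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Offline smoke test for the parser (the sample text is hand-written, not model\n",
"# output), so the parsing logic can be checked without touching the GPU.\n",
"_sample = \"\"\"QUESTION: What does the temperature parameter control during sampling?\n",
"A) The GPU temperature\n",
"B) The randomness of token selection\n",
"C) The context window size\n",
"D) The number of attention layers\n",
"ANSWER: B\n",
"EXPLANATION: Higher temperature flattens the next-token distribution, increasing randomness.\n",
"---\"\"\"\n",
"\n",
"_parsed = parse_questions(_sample, \"Week 1: LLM APIs & Prompting\", \"Beginner\")\n",
"assert len(_parsed) == 1 and _parsed[0][\"correct_answer\"] == \"B\"\n",
"print(json.dumps(_parsed, indent=2))\n"
]
},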
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def format_questions_display(questions):\n",
" \"\"\"Format questions for display in Gradio.\"\"\"\n",
" if not questions:\n",
" return \"No questions generated.\"\n",
" \n",
" output = f\"# Generated Questions\\n\\n\"\n",
" output += f\"**Total Questions:** {len(questions)}\\n\\n\"\n",
" output += \"---\\n\\n\"\n",
" \n",
" for q in questions:\n",
" output += f\"## Question {q['id']}\\n\\n\"\n",
" output += f\"**Topic:** {q['topic']} \\n\"\n",
" output += f\"**Difficulty:** {q['difficulty']} \\n\\n\"\n",
" output += f\"**Q:** {q['question']}\\n\\n\"\n",
" \n",
" for letter in ['A', 'B', 'C', 'D']:\n",
" prefix = \"✅ \" if letter == q['correct_answer'] else \"\"\n",
" output += f\"{prefix}{letter}) {q['options'][letter]}\\n\\n\"\n",
" \n",
" output += f\"**Answer:** {q['correct_answer']}\\n\\n\"\n",
" output += f\"**Explanation:** {q['explanation']}\\n\\n\"\n",
" output += \"---\\n\\n\"\n",
" \n",
" return output\n",
"\n",
"\n",
"def export_to_json(questions):\n",
" \"\"\"Export questions to JSON file.\"\"\"\n",
" if not questions:\n",
" return None\n",
" \n",
" filename = \"educational_qa_dataset.json\"\n",
" with open(filename, 'w') as f:\n",
" json.dump(questions, f, indent=2)\n",
" \n",
" return filename\n",
"\n",
"\n",
"def export_to_csv(questions):\n",
" \"\"\"Export questions to CSV file.\"\"\"\n",
" if not questions:\n",
" return None\n",
" \n",
" # Flatten the data for CSV\n",
" flattened = []\n",
" for q in questions:\n",
" flattened.append({\n",
" 'id': q['id'],\n",
" 'topic': q['topic'],\n",
" 'difficulty': q['difficulty'],\n",
" 'question': q['question'],\n",
" 'option_A': q['options']['A'],\n",
" 'option_B': q['options']['B'],\n",
" 'option_C': q['options']['C'],\n",
" 'option_D': q['options']['D'],\n",
" 'correct_answer': q['correct_answer'],\n",
" 'explanation': q['explanation']\n",
" })\n",
" \n",
" filename = \"educational_qa_dataset.csv\"\n",
" df = pd.DataFrame(flattened)\n",
" df.to_csv(filename, index=False)\n",
" \n",
" return filename\n"
]
},
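{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Direct (non-UI) usage example, assuming the model and helper cells above have run:\n",
"# generate a small batch and write both export formats to disk.\n",
"_batch = generate_questions(\"Week 3: Transformers & Models\", \"Beginner\", 5)\n",
"if _batch:\n",
"    print(\"Saved:\", export_to_json(_batch), export_to_csv(_batch))\n"
]
},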
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def gradio_generate(topic, difficulty, num_questions):\n",
" \"\"\"\n",
" Wrapper function for Gradio interface.\n",
" Generates questions and returns formatted output plus download files.\n",
" \"\"\"\n",
" try:\n",
" # Generate questions\n",
" questions = generate_questions(topic, difficulty, num_questions)\n",
" \n",
" if not questions:\n",
" return \"Failed to generate questions. Please try again.\", None, None\n",
" \n",
" # Format for display\n",
" display_text = format_questions_display(questions)\n",
" \n",
" # Export files\n",
" json_file = export_to_json(questions)\n",
" csv_file = export_to_csv(questions)\n",
" \n",
" return display_text, json_file, csv_file\n",
" \n",
" except Exception as e:\n",
" return f\"Error: {str(e)}\", None, None\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Build the Gradio UI\n",
"with gr.Blocks(title=\"Educational Q&A Generator\", theme=gr.themes.Soft()) as demo:\n",
" \n",
" gr.Markdown(\"\"\"\n",
" # 📚 Educational Q&A Dataset Generator\n",
" Generate high-quality multiple-choice questions for LLM Engineering topics\n",
" \"\"\")\n",
" \n",
" with gr.Row():\n",
" with gr.Column(scale=1):\n",
" gr.Markdown(\"### ⚙️ Configuration\")\n",
" \n",
" topic_dropdown = gr.Dropdown(\n",
" choices=list(TOPICS.keys()),\n",
" value=\"Week 3: Transformers & Models\",\n",
" label=\"Select Topic\",\n",
" info=\"Choose which week's content to generate questions for\"\n",
" )\n",
" \n",
" difficulty_dropdown = gr.Dropdown(\n",
" choices=[\"Beginner\", \"Intermediate\", \"Advanced\"],\n",
" value=\"Intermediate\",\n",
" label=\"Difficulty Level\",\n",
" info=\"Select the difficulty of the questions\"\n",
" )\n",
" \n",
" num_questions_slider = gr.Slider(\n",
" minimum=5,\n",
" maximum=20,\n",
" value=10,\n",
" step=5,\n",
" label=\"Number of Questions\",\n",
" info=\"How many questions to generate (5-20)\"\n",
" )\n",
" \n",
" generate_btn = gr.Button(\"🚀 Generate Questions\", variant=\"primary\", size=\"lg\")\n",
" \n",
" gr.Markdown(\"\"\"\n",
" ---\n",
" ### 📥 Download Files\n",
" After generation, download your dataset in JSON or CSV format\n",
" \"\"\")\n",
" \n",
" with gr.Row():\n",
" json_download = gr.File(label=\"JSON File\", interactive=False)\n",
" csv_download = gr.File(label=\"CSV File\", interactive=False)\n",
" \n",
" with gr.Column(scale=2):\n",
" gr.Markdown(\"### 📝 Generated Questions\")\n",
" \n",
" output_display = gr.Markdown(\n",
" value=\"Click 'Generate Questions' to start...\",\n",
" label=\"Questions\"\n",
" )\n",
" \n",
" # Connect the generate button\n",
" generate_btn.click(\n",
" fn=gradio_generate,\n",
" inputs=[topic_dropdown, difficulty_dropdown, num_questions_slider],\n",
" outputs=[output_display, json_download, csv_download]\n",
" )\n",
" \n",
" gr.Markdown(\"\"\"\n",
" ---\n",
" ### 💡 Tips:\n",
" - Start with 5 questions to test the system\n",
" - Beginner questions cover definitions and basic concepts\n",
" - Intermediate questions test application and understanding\n",
" - Advanced questions explore edge cases and optimization\n",
" - Generation takes ~30-60 seconds depending on number of questions\n",
" \n",
" ### 📊 Output Formats:\n",
" - **JSON**: Structured data for programmatic use\n",
" - **CSV**: Easy to view in spreadsheets or import into other tools\n",
" \"\"\")\n",
"\n",
"print(\"✅ Gradio interface configured!\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Launch the Gradio app\n",
"demo.launch(share=True, debug=True)\n"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}