From 6e2e5064b610fd87d9bd0b28a2ccd4169a23972f Mon Sep 17 00:00:00 2001
From: Philip Omoigui
Date: Fri, 31 Oct 2025 15:40:47 +0100
Subject: [PATCH] Add week 3 exercise: Educational Q&A Dataset Generator

---
 .../philip/week3_EXERCISE.ipynb | 529 ++++++++++++++++++
 1 file changed, 529 insertions(+)
 create mode 100644 week3/community-contributions/philip/week3_EXERCISE.ipynb

diff --git a/week3/community-contributions/philip/week3_EXERCISE.ipynb b/week3/community-contributions/philip/week3_EXERCISE.ipynb
new file mode 100644
index 0000000..19cbacd
--- /dev/null
+++ b/week3/community-contributions/philip/week3_EXERCISE.ipynb
@@ -0,0 +1,529 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install -q transformers accelerate bitsandbytes torch gradio\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import json\n",
+    "import pandas as pd\n",
+    "import gradio as gr\n",
+    "from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig\n",
+    "from huggingface_hub import login\n",
+    "from google.colab import userdata\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Authenticate with HuggingFace\n",
+    "hf_token = userdata.get('HF_TOKEN')\n",
+    "login(hf_token, add_to_git_credential=True)\n",
+    "print(\"Successfully authenticated with HuggingFace\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Model configuration\n",
+    "MODEL_NAME = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n",
+    "\n",
+    "# 4-bit quantization for efficiency on T4 GPU\n",
+    "quant_config = BitsAndBytesConfig(\n",
+    "    load_in_4bit=True,\n",
+    "    bnb_4bit_use_double_quant=True,\n",
+    "    bnb_4bit_compute_dtype=torch.bfloat16,\n",
+    "    bnb_4bit_quant_type=\"nf4\"\n",
+    ")\n",
+    "\n",
+    "# Load tokenizer and model\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
+    "tokenizer.pad_token = tokenizer.eos_token\n",
+    "\n",
+    "model = AutoModelForCausalLM.from_pretrained(\n",
+    "    MODEL_NAME,\n",
+    "    device_map=\"auto\",\n",
+    "    quantization_config=quant_config\n",
+    ")\n",
+    "\n",
+    "print(\"Model loaded successfully!\")\n"
+   ]
+  },
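+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional smoke test (a minimal sketch, not part of the generator itself):\n",
+    "# confirm the quantized model responds before building the full pipeline.\n",
+    "# The prompt text here is illustrative and can be anything.\n",
+    "test_messages = [{\"role\": \"user\", \"content\": \"In one sentence, what is a tokenizer?\"}]\n",
+    "test_ids = tokenizer.apply_chat_template(\n",
+    "    test_messages, return_tensors=\"pt\", add_generation_prompt=True\n",
+    ").to(model.device)\n",
+    "with torch.no_grad():\n",
+    "    test_out = model.generate(test_ids, max_new_tokens=40, pad_token_id=tokenizer.eos_token_id)\n",
+    "# Decode only the generated portion, skipping the prompt tokens\n",
+    "print(tokenizer.decode(test_out[0][test_ids.shape[-1]:], skip_special_tokens=True))\n"
+   ]
+  },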
\"AutoModel and AutoTokenizer\",\n", + " \"Model quantization (4-bit, 8-bit)\",\n", + " \"Speech-to-text with Whisper\",\n", + " \"Local vs cloud model inference\",\n", + " \"Model architectures (encoder, decoder, encoder-decoder)\"\n", + " ]\n", + " }\n", + "}\n", + "\n", + "# Difficulty level descriptions\n", + "DIFFICULTY_LEVELS = {\n", + " \"Beginner\": \"Basic understanding of concepts and definitions\",\n", + " \"Intermediate\": \"Application of concepts with some technical depth\",\n", + " \"Advanced\": \"Edge cases, optimization, and deep technical understanding\"\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def generate_questions(topic, difficulty, num_questions):\n", + " \"\"\"\n", + " Generate educational Q&A questions using the LLM.\n", + " \n", + " Args:\n", + " topic: Topic category to generate questions for\n", + " difficulty: Difficulty level (Beginner/Intermediate/Advanced)\n", + " num_questions: Number of questions to generate\n", + " \n", + " Returns:\n", + " List of dictionaries containing questions and answers\n", + " \"\"\"\n", + " \n", + " # Get topic details\n", + " topic_info = TOPICS[topic]\n", + " concepts = \", \".join(topic_info[\"concepts\"])\n", + " \n", + " # Build the prompt using Llama's chat format\n", + " system_message = \"\"\"You are an expert educator creating high-quality multiple-choice questions for an LLM Engineering course.\n", + "\n", + "Format each question EXACTLY as shown below:\n", + "\n", + "QUESTION: [question text]\n", + "A) [option A]\n", + "B) [option B]\n", + "C) [option C]\n", + "D) [option D]\n", + "ANSWER: [correct letter]\n", + "EXPLANATION: [brief explanation]\n", + "---\"\"\"\n", + "\n", + " user_prompt = f\"\"\"Create {num_questions} multiple-choice questions about: {topic}\n", + "\n", + "Difficulty Level: {difficulty}\n", + "\n", + "Cover these concepts: {concepts}\n", + "\n", + "Requirements:\n", + "- Questions should be practical and relevant to real LLM engineering\n", + "- All 4 options should be plausible\n", + "- Explanations should be clear and educational\n", + "- Vary the correct answer position\n", + "\n", + "Generate {num_questions} questions now:\"\"\"\n", + "\n", + " # Prepare messages for Llama\n", + " messages = [\n", + " {\"role\": \"system\", \"content\": system_message},\n", + " {\"role\": \"user\", \"content\": user_prompt}\n", + " ]\n", + " \n", + " # Tokenize using Llama's chat template\n", + " input_ids = tokenizer.apply_chat_template(\n", + " messages,\n", + " return_tensors=\"pt\",\n", + " add_generation_prompt=True\n", + " ).to(model.device)\n", + " \n", + " attention_mask = torch.ones_like(input_ids).to(model.device)\n", + " \n", + " # Generate\n", + " print(f\"Generating {num_questions} questions...\")\n", + " max_tokens = min(2500, num_questions * 200)\n", + " \n", + " with torch.no_grad():\n", + " outputs = model.generate(\n", + " input_ids,\n", + " attention_mask=attention_mask,\n", + " max_new_tokens=max_tokens,\n", + " temperature=0.7,\n", + " do_sample=True,\n", + " top_p=0.9,\n", + " pad_token_id=tokenizer.eos_token_id\n", + " )\n", + " \n", + " # Decode\n", + " response = tokenizer.decode(outputs[0], skip_special_tokens=True)\n", + " \n", + " # Extract just the assistant's response\n", + " if \"assistant\" in response:\n", + " response = response.split(\"assistant\")[-1].strip()\n", + " \n", + " # Debug: print what we got\n", + " print(\"Generated text preview:\")\n", + " print(response[:500] + \"...\" if 
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def format_questions_display(questions):\n",
+    "    \"\"\"Format questions for display in Gradio.\"\"\"\n",
+    "    if not questions:\n",
+    "        return \"No questions generated.\"\n",
+    "\n",
+    "    output = \"# Generated Questions\\n\\n\"\n",
+    "    output += f\"**Total Questions:** {len(questions)}\\n\\n\"\n",
+    "    output += \"---\\n\\n\"\n",
+    "\n",
+    "    for q in questions:\n",
+    "        output += f\"## Question {q['id']}\\n\\n\"\n",
+    "        output += f\"**Topic:** {q['topic']} \\n\"\n",
+    "        output += f\"**Difficulty:** {q['difficulty']} \\n\\n\"\n",
+    "        output += f\"**Q:** {q['question']}\\n\\n\"\n",
+    "\n",
+    "        for letter in ['A', 'B', 'C', 'D']:\n",
+    "            prefix = \"✅ \" if letter == q['correct_answer'] else \"\"\n",
+    "            output += f\"{prefix}{letter}) {q['options'][letter]}\\n\\n\"\n",
+    "\n",
+    "        output += f\"**Answer:** {q['correct_answer']}\\n\\n\"\n",
+    "        output += f\"**Explanation:** {q['explanation']}\\n\\n\"\n",
+    "        output += \"---\\n\\n\"\n",
+    "\n",
+    "    return output\n",
+    "\n",
+    "\n",
+    "def export_to_json(questions):\n",
+    "    \"\"\"Export questions to JSON file.\"\"\"\n",
+    "    if not questions:\n",
+    "        return None\n",
+    "\n",
+    "    filename = \"educational_qa_dataset.json\"\n",
+    "    with open(filename, 'w') as f:\n",
+    "        json.dump(questions, f, indent=2)\n",
+    "\n",
+    "    return filename\n",
+    "\n",
+    "\n",
+    "def export_to_csv(questions):\n",
+    "    \"\"\"Export questions to CSV file.\"\"\"\n",
+    "    if not questions:\n",
+    "        return None\n",
+    "\n",
+    "    # Flatten the data for CSV\n",
+    "    flattened = []\n",
+    "    for q in questions:\n",
+    "        flattened.append({\n",
+    "            'id': q['id'],\n",
+    "            'topic': q['topic'],\n",
+    "            'difficulty': q['difficulty'],\n",
+    "            'question': q['question'],\n",
+    "            'option_A': q['options']['A'],\n",
+    "            'option_B': q['options']['B'],\n",
+    "            'option_C': q['options']['C'],\n",
+    "            'option_D': q['options']['D'],\n",
+    "            'correct_answer': q['correct_answer'],\n",
+    "            'explanation': q['explanation']\n",
+    "        })\n",
+    "\n",
+    "    filename = \"educational_qa_dataset.csv\"\n",
+    "    df = pd.DataFrame(flattened)\n",
+    "    df.to_csv(filename, index=False)\n",
+    "\n",
+    "    return filename\n"
+   ]
+  },
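+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional headless example (a sketch): generate a small batch and export it\n",
+    "# without the Gradio UI, e.g. for scripted dataset building. Uses only the\n",
+    "# functions defined above; expect roughly a minute of generation time.\n",
+    "batch = generate_questions(\"Week 1: LLM APIs & Prompting\", \"Beginner\", 3)\n",
+    "print(\"JSON:\", export_to_json(batch))\n",
+    "print(\"CSV:\", export_to_csv(batch))\n"
+   ]
+  },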
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def gradio_generate(topic, difficulty, num_questions):\n",
+    "    \"\"\"\n",
+    "    Wrapper function for Gradio interface.\n",
+    "    Generates questions and returns formatted output plus download files.\n",
+    "    \"\"\"\n",
+    "    try:\n",
+    "        # Gradio sliders can deliver floats; coerce so prompts read \"10\" rather than \"10.0\"\n",
+    "        num_questions = int(num_questions)\n",
+    "\n",
+    "        # Generate questions\n",
+    "        questions = generate_questions(topic, difficulty, num_questions)\n",
+    "\n",
+    "        if not questions:\n",
+    "            return \"Failed to generate questions. Please try again.\", None, None\n",
+    "\n",
+    "        # Format for display\n",
+    "        display_text = format_questions_display(questions)\n",
+    "\n",
+    "        # Export files\n",
+    "        json_file = export_to_json(questions)\n",
+    "        csv_file = export_to_csv(questions)\n",
+    "\n",
+    "        return display_text, json_file, csv_file\n",
+    "\n",
+    "    except Exception as e:\n",
+    "        return f\"Error: {str(e)}\", None, None\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Build the Gradio UI\n",
+    "with gr.Blocks(title=\"Educational Q&A Generator\", theme=gr.themes.Soft()) as demo:\n",
+    "\n",
+    "    gr.Markdown(\"\"\"\n",
+    "    # 📚 Educational Q&A Dataset Generator\n",
+    "    Generate high-quality multiple-choice questions for LLM Engineering topics\n",
+    "    \"\"\")\n",
+    "\n",
+    "    with gr.Row():\n",
+    "        with gr.Column(scale=1):\n",
+    "            gr.Markdown(\"### ⚙️ Configuration\")\n",
+    "\n",
+    "            topic_dropdown = gr.Dropdown(\n",
+    "                choices=list(TOPICS.keys()),\n",
+    "                value=\"Week 3: Transformers & Models\",\n",
+    "                label=\"Select Topic\",\n",
+    "                info=\"Choose which week's content to generate questions for\"\n",
+    "            )\n",
+    "\n",
+    "            difficulty_dropdown = gr.Dropdown(\n",
+    "                choices=[\"Beginner\", \"Intermediate\", \"Advanced\"],\n",
+    "                value=\"Intermediate\",\n",
+    "                label=\"Difficulty Level\",\n",
+    "                info=\"Select the difficulty of the questions\"\n",
+    "            )\n",
+    "\n",
+    "            num_questions_slider = gr.Slider(\n",
+    "                minimum=5,\n",
+    "                maximum=20,\n",
+    "                value=10,\n",
+    "                step=5,\n",
+    "                label=\"Number of Questions\",\n",
+    "                info=\"How many questions to generate (5-20)\"\n",
+    "            )\n",
+    "\n",
+    "            generate_btn = gr.Button(\"🚀 Generate Questions\", variant=\"primary\", size=\"lg\")\n",
+    "\n",
+    "            gr.Markdown(\"\"\"\n",
+    "            ---\n",
+    "            ### 📥 Download Files\n",
+    "            After generation, download your dataset in JSON or CSV format\n",
+    "            \"\"\")\n",
+    "\n",
+    "            with gr.Row():\n",
+    "                json_download = gr.File(label=\"JSON File\", interactive=False)\n",
+    "                csv_download = gr.File(label=\"CSV File\", interactive=False)\n",
+    "\n",
+    "        with gr.Column(scale=2):\n",
+    "            gr.Markdown(\"### 📝 Generated Questions\")\n",
+    "\n",
+    "            output_display = gr.Markdown(\n",
+    "                value=\"Click 'Generate Questions' to start...\",\n",
+    "                label=\"Questions\"\n",
+    "            )\n",
+    "\n",
+    "    # Connect the generate button\n",
+    "    generate_btn.click(\n",
+    "        fn=gradio_generate,\n",
+    "        inputs=[topic_dropdown, difficulty_dropdown, num_questions_slider],\n",
+    "        outputs=[output_display, json_download, csv_download]\n",
+    "    )\n",
+    "\n",
+    "    gr.Markdown(\"\"\"\n",
+    "    ---\n",
+    "    ### 💡 Tips:\n",
+    "    - Start with 5 questions to test the system\n",
+    "    - Beginner questions cover definitions and basic concepts\n",
+    "    - Intermediate questions test application and understanding\n",
+    "    - Advanced questions explore edge cases and optimization\n",
+    "    - Generation takes ~30-60 seconds depending on number of questions\n",
+    "\n",
+    "    ### 📊 Output Formats:\n",
+    "    - **JSON**: Structured data for programmatic use\n",
+    "    - **CSV**: Easy to view in spreadsheets or import into other tools\n",
+    "    \"\"\")\n",
+    "\n",
+    "print(\"✅ Gradio interface configured!\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Launch the Gradio app\n",
+    "demo.launch(share=True, debug=True)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}