{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pip install -q transformers accelerate bitsandbytes torch gradio\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import json\n",
"import pandas as pd\n",
"import gradio as gr\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig\n",
"from huggingface_hub import login\n",
"from google.colab import userdata\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Authenticate with HuggingFace\n",
"hf_token = userdata.get('HF_TOKEN')\n",
"login(hf_token, add_to_git_credential=True)\n",
"print(\"Successfully authenticated with HuggingFace\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Model configuration\n",
"MODEL_NAME = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n",
"\n",
"# 4-bit quantization for efficiency on T4 GPU\n",
"quant_config = BitsAndBytesConfig(\n",
"    load_in_4bit=True,\n",
"    bnb_4bit_use_double_quant=True,\n",
"    bnb_4bit_compute_dtype=torch.bfloat16,\n",
"    bnb_4bit_quant_type=\"nf4\"\n",
")\n",
"\n",
"# Load tokenizer and model\n",
"tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
"    MODEL_NAME,\n",
"    device_map=\"auto\",\n",
"    quantization_config=quant_config\n",
")\n",
"\n",
"print(\"Model loaded successfully!\")\n"
]
},
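{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check (a minimal sketch): confirm the quantized model fits\n",
"# comfortably in T4 memory. get_memory_footprint() is a standard transformers\n",
"# PreTrainedModel method; with 4-bit NF4 weights the 8B model should report\n",
"# roughly 5-6 GB.\n",
"footprint_gb = model.get_memory_footprint() / 1e9\n",
"print(f\"Model memory footprint: {footprint_gb:.2f} GB\")\n"
]
},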
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Topic definitions based on course content\n",
"TOPICS = {\n",
"    \"Week 1: LLM APIs & Prompting\": {\n",
"        \"concepts\": [\n",
"            \"OpenAI API usage and parameters\",\n",
"            \"Prompt engineering techniques\",\n",
"            \"Temperature and top_p parameters\",\n",
"            \"System vs user messages\",\n",
"            \"JSON mode and structured outputs\",\n",
"            \"Token counting and pricing\",\n",
"            \"Chat completions vs completions\",\n",
"            \"Few-shot learning\"\n",
"        ]\n",
"    },\n",
"    \"Week 2: Function Calling & Agents\": {\n",
"        \"concepts\": [\n",
"            \"Function calling syntax and format\",\n",
"            \"Tool definitions and schemas\",\n",
"            \"Parallel function calling\",\n",
"            \"Function calling best practices\",\n",
"            \"Agent patterns and workflows\",\n",
"            \"Structured outputs with Pydantic\",\n",
"            \"Error handling in function calls\"\n",
"        ]\n",
"    },\n",
"    \"Week 3: Transformers & Models\": {\n",
"        \"concepts\": [\n",
"            \"Tokenizers and tokenization strategies\",\n",
"            \"BPE, WordPiece, and SentencePiece\",\n",
"            \"HuggingFace pipelines\",\n",
"            \"AutoModel and AutoTokenizer\",\n",
"            \"Model quantization (4-bit, 8-bit)\",\n",
"            \"Speech-to-text with Whisper\",\n",
"            \"Local vs cloud model inference\",\n",
"            \"Model architectures (encoder, decoder, encoder-decoder)\"\n",
"        ]\n",
"    }\n",
"}\n",
"\n",
"# Difficulty level descriptions\n",
"DIFFICULTY_LEVELS = {\n",
"    \"Beginner\": \"Basic understanding of concepts and definitions\",\n",
"    \"Intermediate\": \"Application of concepts with some technical depth\",\n",
"    \"Advanced\": \"Edge cases, optimization, and deep technical understanding\"\n",
"}\n"
]
},
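{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Quick overview (illustrative, not required by the pipeline): count how many\n",
"# concepts each topic covers, to sanity-check the definitions above.\n",
"for topic_name, info in TOPICS.items():\n",
"    print(f\"{topic_name}: {len(info['concepts'])} concepts\")\n"
]
},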
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def generate_questions(topic, difficulty, num_questions):\n",
"    \"\"\"\n",
"    Generate educational Q&A questions using the LLM.\n",
"\n",
"    Args:\n",
"        topic: Topic category to generate questions for\n",
"        difficulty: Difficulty level (Beginner/Intermediate/Advanced)\n",
"        num_questions: Number of questions to generate\n",
"\n",
"    Returns:\n",
"        List of dictionaries containing questions and answers\n",
"    \"\"\"\n",
"\n",
"    # Get topic details\n",
"    topic_info = TOPICS[topic]\n",
"    concepts = \", \".join(topic_info[\"concepts\"])\n",
"\n",
"    # Build the prompt using Llama's chat format\n",
"    system_message = \"\"\"You are an expert educator creating high-quality multiple-choice questions for an LLM Engineering course.\n",
"\n",
"Format each question EXACTLY as shown below:\n",
"\n",
"QUESTION: [question text]\n",
"A) [option A]\n",
"B) [option B]\n",
"C) [option C]\n",
"D) [option D]\n",
"ANSWER: [correct letter]\n",
"EXPLANATION: [brief explanation]\n",
"---\"\"\"\n",
"\n",
"    user_prompt = f\"\"\"Create {num_questions} multiple-choice questions about: {topic}\n",
"\n",
"Difficulty Level: {difficulty}\n",
"\n",
"Cover these concepts: {concepts}\n",
"\n",
"Requirements:\n",
"- Questions should be practical and relevant to real LLM engineering\n",
"- All 4 options should be plausible\n",
"- Explanations should be clear and educational\n",
"- Vary the correct answer position\n",
"\n",
"Generate {num_questions} questions now:\"\"\"\n",
"\n",
"    # Prepare messages for Llama\n",
"    messages = [\n",
"        {\"role\": \"system\", \"content\": system_message},\n",
"        {\"role\": \"user\", \"content\": user_prompt}\n",
"    ]\n",
"\n",
"    # Tokenize using Llama's chat template\n",
"    input_ids = tokenizer.apply_chat_template(\n",
"        messages,\n",
"        return_tensors=\"pt\",\n",
"        add_generation_prompt=True\n",
"    ).to(model.device)\n",
"\n",
"    attention_mask = torch.ones_like(input_ids)\n",
"\n",
"    # Generate\n",
"    print(f\"Generating {num_questions} questions...\")\n",
"    max_tokens = min(2500, num_questions * 200)\n",
"\n",
"    with torch.no_grad():\n",
"        outputs = model.generate(\n",
"            input_ids,\n",
"            attention_mask=attention_mask,\n",
"            max_new_tokens=max_tokens,\n",
"            temperature=0.7,\n",
"            do_sample=True,\n",
"            top_p=0.9,\n",
"            pad_token_id=tokenizer.eos_token_id\n",
"        )\n",
"\n",
"    # Decode only the newly generated tokens; this skips the prompt echo and\n",
"    # avoids fragile string-splitting on the word \"assistant\", which could\n",
"    # also appear inside a generated question\n",
"    response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)\n",
"\n",
"    # Debug: print what we got\n",
"    print(\"Generated text preview:\")\n",
"    print(response[:500] + \"...\" if len(response) > 500 else response)\n",
"    print()\n",
"\n",
"    # Parse the questions\n",
"    questions = parse_questions(response, topic, difficulty)\n",
"\n",
"    print(f\"Successfully generated {len(questions)} questions\")\n",
"    return questions\n"
]
},
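{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional debugging aid (a sketch with placeholder messages): render the chat\n",
"# template as plain text to see exactly what generate_questions() sends to the\n",
"# model. tokenize=False is a documented apply_chat_template option.\n",
"preview = tokenizer.apply_chat_template(\n",
"    [\n",
"        {\"role\": \"system\", \"content\": \"You are a helpful tutor.\"},\n",
"        {\"role\": \"user\", \"content\": \"Write one question about tokenizers.\"}\n",
"    ],\n",
"    tokenize=False,\n",
"    add_generation_prompt=True\n",
")\n",
"print(preview)\n"
]
},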
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def parse_questions(text, topic, difficulty):\n",
"    \"\"\"\n",
"    Parse the generated text into structured question objects.\n",
"    More robust parsing that handles various formats.\n",
"    \"\"\"\n",
"    questions = []\n",
"\n",
"    # Split by \"QUESTION:\" to get individual question blocks\n",
"    blocks = text.split(\"QUESTION:\")\n",
"\n",
"    for i, block in enumerate(blocks):\n",
"        # blocks[0] is always the preamble before the first QUESTION: marker\n",
"        if i == 0 or not block.strip():\n",
"            continue\n",
"\n",
"        try:\n",
"            # Extract components\n",
"            question_text = \"\"\n",
"            options = {}\n",
"            answer = \"\"\n",
"            explanation = \"\"\n",
"\n",
"            lines = block.strip().split(\"\\n\")\n",
"\n",
"            for line in lines:\n",
"                line = line.strip()\n",
"                if not line or line == \"---\":\n",
"                    continue\n",
"\n",
"                # Handle question text (first non-empty line before options)\n",
"                if not question_text and not any(line.startswith(x) for x in [\"A)\", \"B)\", \"C)\", \"D)\", \"ANSWER:\", \"EXPLANATION:\", \"Answer:\", \"Explanation:\"]):\n",
"                    question_text = line\n",
"\n",
"                # Handle options - be flexible with formatting\n",
"                elif line.startswith(\"A)\") or line.startswith(\"A.\"):\n",
"                    options[\"A\"] = line[2:].strip()\n",
"                elif line.startswith(\"B)\") or line.startswith(\"B.\"):\n",
"                    options[\"B\"] = line[2:].strip()\n",
"                elif line.startswith(\"C)\") or line.startswith(\"C.\"):\n",
"                    options[\"C\"] = line[2:].strip()\n",
"                elif line.startswith(\"D)\") or line.startswith(\"D.\"):\n",
"                    options[\"D\"] = line[2:].strip()\n",
"\n",
"                # Handle answer\n",
"                elif line.upper().startswith(\"ANSWER:\"):\n",
"                    answer = line.split(\":\", 1)[1].strip()\n",
"\n",
"                # Handle explanation\n",
"                elif line.upper().startswith(\"EXPLANATION:\"):\n",
"                    explanation = line.split(\":\", 1)[1].strip()\n",
"                elif explanation and len(explanation) < 200:\n",
"                    # Continue multi-line explanation (up to reasonable length)\n",
"                    explanation += \" \" + line\n",
"\n",
"            # Extract just the letter from answer\n",
"            if answer:\n",
"                answer_letter = \"\"\n",
"                for char in answer.upper():\n",
"                    if char in [\"A\", \"B\", \"C\", \"D\"]:\n",
"                        answer_letter = char\n",
"                        break\n",
"                answer = answer_letter\n",
"\n",
"            # Only add if we have minimum required components\n",
"            if question_text and len(options) >= 3 and answer:\n",
"                # Fill missing option if needed\n",
"                if len(options) == 3:\n",
"                    for letter in [\"A\", \"B\", \"C\", \"D\"]:\n",
"                        if letter not in options:\n",
"                            options[letter] = \"Not applicable\"\n",
"                            break\n",
"\n",
"                # Use placeholder explanation if none provided\n",
"                if not explanation:\n",
"                    explanation = f\"The correct answer is {answer}.\"\n",
"\n",
"                questions.append({\n",
"                    \"id\": len(questions) + 1,\n",
"                    \"topic\": topic,\n",
"                    \"difficulty\": difficulty,\n",
"                    \"question\": question_text,\n",
"                    \"options\": options,\n",
"                    \"correct_answer\": answer,\n",
"                    \"explanation\": explanation.strip()\n",
"                })\n",
"                print(f\"Parsed question {len(questions)}\")\n",
"            else:\n",
"                print(f\"Skipped incomplete block: Q={bool(question_text)}, Opts={len(options)}, Ans={bool(answer)}\")\n",
"\n",
"        except Exception as e:\n",
"            print(f\"Error parsing block {i+1}: {str(e)}\")\n",
"            continue\n",
"\n",
"    return questions\n"
]
},
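{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Parser smoke test (hand-written fixture, no GPU needed): feed parse_questions()\n",
"# one block in the expected format and check the structured result before\n",
"# spending time on real generations.\n",
"sample = \"\"\"QUESTION: What does temperature control in LLM sampling?\n",
"A) The GPU clock speed\n",
"B) The randomness of token selection\n",
"C) The context window size\n",
"D) The tokenizer vocabulary size\n",
"ANSWER: B\n",
"EXPLANATION: Higher temperature flattens the token distribution, increasing randomness.\n",
"---\"\"\"\n",
"\n",
"parsed = parse_questions(sample, \"Week 1: LLM APIs & Prompting\", \"Beginner\")\n",
"assert parsed and parsed[0][\"correct_answer\"] == \"B\"\n",
"print(json.dumps(parsed, indent=2))\n"
]
},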
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def format_questions_display(questions):\n",
"    \"\"\"Format questions for display in Gradio.\"\"\"\n",
"    if not questions:\n",
"        return \"No questions generated.\"\n",
"\n",
"    output = \"# Generated Questions\\n\\n\"\n",
"    output += f\"**Total Questions:** {len(questions)}\\n\\n\"\n",
"    output += \"---\\n\\n\"\n",
"\n",
"    for q in questions:\n",
"        output += f\"## Question {q['id']}\\n\\n\"\n",
"        output += f\"**Topic:** {q['topic']} \\n\"\n",
"        output += f\"**Difficulty:** {q['difficulty']} \\n\\n\"\n",
"        output += f\"**Q:** {q['question']}\\n\\n\"\n",
"\n",
"        for letter in ['A', 'B', 'C', 'D']:\n",
"            prefix = \"✅ \" if letter == q['correct_answer'] else \"\"\n",
"            output += f\"{prefix}{letter}) {q['options'][letter]}\\n\\n\"\n",
"\n",
"        output += f\"**Answer:** {q['correct_answer']}\\n\\n\"\n",
"        output += f\"**Explanation:** {q['explanation']}\\n\\n\"\n",
"        output += \"---\\n\\n\"\n",
"\n",
"    return output\n",
"\n",
"\n",
"def export_to_json(questions):\n",
"    \"\"\"Export questions to JSON file.\"\"\"\n",
"    if not questions:\n",
"        return None\n",
"\n",
"    filename = \"educational_qa_dataset.json\"\n",
"    with open(filename, 'w') as f:\n",
"        json.dump(questions, f, indent=2)\n",
"\n",
"    return filename\n",
"\n",
"\n",
"def export_to_csv(questions):\n",
"    \"\"\"Export questions to CSV file.\"\"\"\n",
"    if not questions:\n",
"        return None\n",
"\n",
"    # Flatten the data for CSV\n",
"    flattened = []\n",
"    for q in questions:\n",
"        flattened.append({\n",
"            'id': q['id'],\n",
"            'topic': q['topic'],\n",
"            'difficulty': q['difficulty'],\n",
"            'question': q['question'],\n",
"            'option_A': q['options']['A'],\n",
"            'option_B': q['options']['B'],\n",
"            'option_C': q['options']['C'],\n",
"            'option_D': q['options']['D'],\n",
"            'correct_answer': q['correct_answer'],\n",
"            'explanation': q['explanation']\n",
"        })\n",
"\n",
"    filename = \"educational_qa_dataset.csv\"\n",
"    df = pd.DataFrame(flattened)\n",
"    df.to_csv(filename, index=False)\n",
"\n",
"    return filename\n"
]
},
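{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional round-trip check (a sketch that reuses the parser fixture above, if\n",
"# that cell was run): export the sample questions and read the CSV back to\n",
"# confirm both files are well-formed.\n",
"if \"parsed\" in globals() and parsed:\n",
"    print(\"Wrote:\", export_to_json(parsed))\n",
"    print(\"Wrote:\", export_to_csv(parsed))\n",
"    print(pd.read_csv(\"educational_qa_dataset.csv\").head())\n"
]
},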
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def gradio_generate(topic, difficulty, num_questions):\n",
"    \"\"\"\n",
"    Wrapper function for Gradio interface.\n",
"    Generates questions and returns formatted output plus download files.\n",
"    \"\"\"\n",
"    try:\n",
"        # Gradio sliders return floats; make the count an integer before use\n",
"        num_questions = int(num_questions)\n",
"\n",
"        # Generate questions\n",
"        questions = generate_questions(topic, difficulty, num_questions)\n",
"\n",
"        if not questions:\n",
"            return \"Failed to generate questions. Please try again.\", None, None\n",
"\n",
"        # Format for display\n",
"        display_text = format_questions_display(questions)\n",
"\n",
"        # Export files\n",
"        json_file = export_to_json(questions)\n",
"        csv_file = export_to_csv(questions)\n",
"\n",
"        return display_text, json_file, csv_file\n",
"\n",
"    except Exception as e:\n",
"        return f\"Error: {str(e)}\", None, None\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Build the Gradio UI\n",
"with gr.Blocks(title=\"Educational Q&A Generator\", theme=gr.themes.Soft()) as demo:\n",
"\n",
"    gr.Markdown(\"\"\"\n",
"    # 📚 Educational Q&A Dataset Generator\n",
"    Generate high-quality multiple-choice questions for LLM Engineering topics\n",
"    \"\"\")\n",
"\n",
"    with gr.Row():\n",
"        with gr.Column(scale=1):\n",
"            gr.Markdown(\"### ⚙️ Configuration\")\n",
"\n",
"            topic_dropdown = gr.Dropdown(\n",
"                choices=list(TOPICS.keys()),\n",
"                value=\"Week 3: Transformers & Models\",\n",
"                label=\"Select Topic\",\n",
"                info=\"Choose which week's content to generate questions for\"\n",
"            )\n",
"\n",
"            difficulty_dropdown = gr.Dropdown(\n",
"                choices=[\"Beginner\", \"Intermediate\", \"Advanced\"],\n",
"                value=\"Intermediate\",\n",
"                label=\"Difficulty Level\",\n",
"                info=\"Select the difficulty of the questions\"\n",
"            )\n",
"\n",
"            num_questions_slider = gr.Slider(\n",
"                minimum=5,\n",
"                maximum=20,\n",
"                value=10,\n",
"                step=5,\n",
"                label=\"Number of Questions\",\n",
"                info=\"How many questions to generate (5-20)\"\n",
"            )\n",
"\n",
"            generate_btn = gr.Button(\"🚀 Generate Questions\", variant=\"primary\", size=\"lg\")\n",
"\n",
"            gr.Markdown(\"\"\"\n",
"            ---\n",
"            ### 📥 Download Files\n",
"            After generation, download your dataset in JSON or CSV format\n",
"            \"\"\")\n",
"\n",
"            with gr.Row():\n",
"                json_download = gr.File(label=\"JSON File\", interactive=False)\n",
"                csv_download = gr.File(label=\"CSV File\", interactive=False)\n",
"\n",
"        with gr.Column(scale=2):\n",
"            gr.Markdown(\"### 📝 Generated Questions\")\n",
"\n",
"            output_display = gr.Markdown(\n",
"                value=\"Click 'Generate Questions' to start...\",\n",
"                label=\"Questions\"\n",
"            )\n",
"\n",
"    # Connect the generate button\n",
"    generate_btn.click(\n",
"        fn=gradio_generate,\n",
"        inputs=[topic_dropdown, difficulty_dropdown, num_questions_slider],\n",
"        outputs=[output_display, json_download, csv_download]\n",
"    )\n",
"\n",
"    gr.Markdown(\"\"\"\n",
"    ---\n",
"    ### 💡 Tips:\n",
"    - Start with 5 questions to test the system\n",
"    - Beginner questions cover definitions and basic concepts\n",
"    - Intermediate questions test application and understanding\n",
"    - Advanced questions explore edge cases and optimization\n",
"    - Generation takes ~30-60 seconds depending on number of questions\n",
"\n",
"    ### 📊 Output Formats:\n",
"    - **JSON**: Structured data for programmatic use\n",
"    - **CSV**: Easy to view in spreadsheets or import into other tools\n",
"    \"\"\")\n",
"\n",
"print(\"✅ Gradio interface configured!\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Launch the Gradio app\n",
"demo.launch(share=True, debug=True)\n"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}