From 998d04f8a374d3ce741fafc38ddf45d45bebb6d4 Mon Sep 17 00:00:00 2001 From: aashahid Date: Tue, 28 Oct 2025 23:19:41 +0500 Subject: [PATCH] Add Week 3 submission for muhammad_qasim_sheikh --- .../Day 5/synthetic_data_generator.ipynb | 172 ++++++++++++++++++ 1 file changed, 172 insertions(+) create mode 100644 community-contributions/muhammad_qasim_sheikh/Week 3/Day 5/synthetic_data_generator.ipynb diff --git a/community-contributions/muhammad_qasim_sheikh/Week 3/Day 5/synthetic_data_generator.ipynb b/community-contributions/muhammad_qasim_sheikh/Week 3/Day 5/synthetic_data_generator.ipynb new file mode 100644 index 0000000..ac3a30a --- /dev/null +++ b/community-contributions/muhammad_qasim_sheikh/Week 3/Day 5/synthetic_data_generator.ipynb @@ -0,0 +1,172 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 6, + "id": "236461b6", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "from dotenv import load_dotenv\n", + "import gradio as gr\n", + "import json\n", + "from openai import OpenAI\n", + "import re" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "4c493ebf", + "metadata": {}, + "outputs": [], + "source": [ + "load_dotenv(override=True)\n", + "api_key = os.getenv('OPENAI_API_KEY')\n", + " \n", + "client = OpenAI()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "349fa758", + "metadata": {}, + "outputs": [], + "source": [ + "system_prompt = \"\"\"\n", + " You are an expert technical writer and knowledge engineer.\n", + " Your task is to generate well-structured Markdown (.md) documentation files that can be used as a knowledge base for a RAG.\n", + "\n", + " Follow these rules carefully:\n", + " 1. Write the content in clear, concise Markdown format.\n", + " 2. Use appropriate Markdown headers (#, ##, ###) to structure the document.\n", + " 3. Include lists, tables, or code blocks only when necessary.\n", + " 4. 
Keep each document self-contained and focused on a single topic.\n", + " 5. Do not include any text outside the Markdown content (no explanations, no code fences).\n", + " 6. The style should be factual, structured, and helpful for machine retrieval.\n", + " 7. Use consistent tone and terminology across sections.\n", + " \"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "e65071d6", + "metadata": {}, + "outputs": [], + "source": [ + "def create_kb_prompt(topic, kb_type=\"tutorial\"):\n", + " return f\"\"\"\n", + " Generate a comprehensive Markdown document for the following technical topic.\n", + " Topic: {topic}\n", + " Document Type: {kb_type}\n", + " The document should include structured sections, concise explanations, and clear formatting suitable for a technical knowledge base.\n", + " \"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "1045db44", + "metadata": {}, + "outputs": [], + "source": [ + "def generate_markdown_doc(topic, kb_type=\"tutorial\"):\n", + " \n", + " user_prompt = create_kb_prompt(topic, kb_type)\n", + " messages = [\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": user_prompt},\n", + " ]\n", + " \n", + " response = client.chat.completions.create(\n", + " model=\"gpt-4o-mini\",\n", + " messages=messages,\n", + " temperature=0.7\n", + " )\n", + " markdown_output = response.choices[0].message.content.strip()\n", + " markdown_output = re.sub(r'^```[a-z]*\\s*', '', markdown_output)\n", + " markdown_output = re.sub(r'\\s*```$', '', markdown_output)\n", + " return markdown_output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24ba021b", + "metadata": {}, + "outputs": [], + "source": [ + "def create_kb_gradio_interface():\n", + " with gr.Blocks(theme=gr.themes.Soft()) as app:\n", + " gr.Markdown(\"## Technical Knowledge Base Generator\")\n", + "\n", + " with gr.Row():\n", + " 
with gr.Column():\n", + " topic_input = gr.Textbox(\n", + " label=\"Technical Topic\",\n", + " placeholder=\"e.g., Building a RAG pipeline with LangChain...\",\n", + " lines=2\n", + " )\n", + " kb_type_input = gr.Radio(\n", + " label=\"Document Type\",\n", + " choices=[\"Overview\", \"FAQ\", \"Use Case\"],\n", + " value=\"FAQ\"\n", + " )\n", + " generate_button = gr.Button(\"Generate Markdown Document\", variant=\"primary\")\n", + "\n", + " with gr.Column():\n", + " output_md = gr.Textbox(\n", + " label=\"Generated Markdown Content\",\n", + " lines=25,\n", + " interactive=False,\n", + " placeholder=\"Generated Markdown will appear here...\"\n", + " )\n", + "\n", + " generate_button.click(\n", + " fn=generate_markdown_doc,\n", + " inputs=[topic_input, kb_type_input],\n", + " outputs=[output_md],\n", + " api_name=\"generate_kb_doc\"\n", + " )\n", + "\n", + " return app" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db17cde4", + "metadata": {}, + "outputs": [], + "source": [ + "app = create_kb_gradio_interface()\n", + "app.launch(debug=True, share=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "llm-engineering", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}