Introduce audio input and transcription

Author: Elijah Rwothoromo
Date: 2025-08-11 10:26:27 +03:00
Parent: f4e9798f3d
Commit: 06b191a2d0


@@ -602,6 +602,40 @@
"# talker(\"Well, hi there\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e927f333-7ed5-4625-9e5a-5e0b62f8a684",
"metadata": {},
"outputs": [],
"source": [
"# To transcribe an audio prompt/input\n",
"\n",
"import tempfile\n",
"from pydub import AudioSegment\n",
"from pydub.playback import play\n",
"\n",
"def transcribe_audio(audio_file):\n",
" \"\"\"\n",
" Transcribes an audio file using OpenAI's Whisper model.\n",
" \"\"\"\n",
" if audio_file is None:\n",
" return \"\"\n",
" \n",
" # The Gradio Audio component returns a tuple (sample_rate, numpy_array)\n",
" # We need to save this to a file to pass to the OpenAI API\n",
" with tempfile.NamedTemporaryFile(suffix=\".wav\", delete=True) as tmpfile:\n",
" audio = AudioSegment.from_file(audio_file, format=\"wav\")\n",
" audio.export(tmpfile.name, format=\"wav\")\n",
" \n",
" with open(tmpfile.name, \"rb\") as audio_file_obj:\n",
" transcript = openai.audio.transcriptions.create(\n",
" model=\"whisper-1\", \n",
" file=audio_file_obj\n",
" )\n",
" return transcript.text"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -621,6 +655,12 @@
" entry = gr.Textbox(label=\"Chat with our AI Assistant:\", scale=4)\n",
" submit_btn = gr.Button(\"Submit\", scale=1)\n",
" with gr.Row():\n",
" # Provide a microphone input\n",
" audio_input = gr.Audio(sources=[\"microphone\"], type=\"filepath\", label=\"Speak to our AI Assistant\", scale=4)\n",
" submit_audio_btn = gr.Button(\"Submit Audio\", scale=1)\n",
"\n",
"\n",
" with gr.Row():\n",
" languages = [\"English\", \"Swahili\", \"French\", \"Chinese\", \"German\"]\n",
" language_dropdown = gr.Dropdown(\n",
" label=\"Select a language for translation\",\n",
@@ -641,7 +681,7 @@
" def user_message_updater(user_message, history):\n",
" return \"\", history + [[user_message, None]]\n",
"\n",
" def chat_with_assistant(history, target_language, use_audio):\n",
" def chat_with_assistant(history, target_language, use_audio_output):\n",
" message = history[-1][0] # Get the user's message from the last list in history\n",
" \n",
" messages = [{\"role\": \"system\", \"content\": system_message}]\n",
@@ -677,11 +717,25 @@
"\n",
" history[-1][1] = final_response_content\n",
"\n",
" if use_audio != \"No\":\n",
" if use_audio_output != \"No\":\n",
" talker(final_response_content)\n",
"\n",
" return history, image # Return a tuple of (the updated history, an image)\n",
"\n",
" # This function ties together the transcription and the chat logic\n",
" def transcribe_and_chat(audio_file, history, target_language, use_audio_output):\n",
" if audio_file:\n",
" # Transcribe the audio file to text\n",
" transcribed_text = transcribe_audio(audio_file)\n",
" \n",
" # Update history with the transcribed text\n",
" new_history = history + [[transcribed_text, None]]\n",
" \n",
" # Call the main chat function with the new history\n",
" return chat_with_assistant(new_history, target_language, use_audio_output)\n",
" else:\n",
" return history, None\n",
"\n",
" # The event listeners are updated to be triggered by both the textbox and the new button\n",
" entry.submit(\n",
" user_message_updater,\n",
@@ -704,6 +758,21 @@
" inputs=[chatbot, language_dropdown, audio_dropdown],\n",
" outputs=[chatbot, image]\n",
" )\n",
"\n",
" # Event listener to trigger on audio stop\n",
" audio_input.stop(\n",
" transcribe_and_chat,\n",
" inputs=[audio_input, chatbot, language_dropdown, audio_dropdown],\n",
" outputs=[chatbot, image],\n",
" queue=False\n",
" )\n",
"\n",
" submit_audio_btn.click(\n",
" transcribe_and_chat,\n",
" inputs=[audio_input, chatbot, language_dropdown, audio_dropdown],\n",
" outputs=[chatbot, image],\n",
" queue=False\n",
" )\n",
" \n",
" clear.click(lambda: None, inputs=None, outputs=[chatbot, image], queue=False)\n",
"\n",
@@ -717,6 +786,14 @@
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "3469b07d-2b9a-4409-bb1c-fbdab3248974",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {