From 06b191a2d0ef72bf826ceb0dda78d694ad57d952 Mon Sep 17 00:00:00 2001
From: Elijah Rwothoromo
Date: Mon, 11 Aug 2025 10:26:27 +0300
Subject: [PATCH] Introduce audio input and transcription

---
 .../rwothoromo/day5.ipynb | 81 ++++++++++++++++++-
 1 file changed, 79 insertions(+), 2 deletions(-)

diff --git a/week2/community-contributions/rwothoromo/day5.ipynb b/week2/community-contributions/rwothoromo/day5.ipynb
index edd2a73..b51d15b 100644
--- a/week2/community-contributions/rwothoromo/day5.ipynb
+++ b/week2/community-contributions/rwothoromo/day5.ipynb
@@ -602,6 +602,40 @@
     "# talker(\"Well, hi there\")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e927f333-7ed5-4625-9e5a-5e0b62f8a684",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# To transcribe an audio prompt/input\n",
+    "\n",
+    "import tempfile\n",
+    "from pydub import AudioSegment\n",
+    "from pydub.playback import play\n",
+    "\n",
+    "def transcribe_audio(audio_file):\n",
+    "    \"\"\"\n",
+    "    Transcribes an audio file using OpenAI's Whisper model.\n",
+    "    \"\"\"\n",
+    "    if audio_file is None:\n",
+    "        return \"\"\n",
+    "    \n",
+    "    # With type=\"filepath\", the Gradio Audio component provides a path to the recording\n",
+    "    # Re-export it as a WAV file before passing it to the OpenAI API\n",
+    "    with tempfile.NamedTemporaryFile(suffix=\".wav\", delete=True) as tmpfile:\n",
+    "        audio = AudioSegment.from_file(audio_file, format=\"wav\")\n",
+    "        audio.export(tmpfile.name, format=\"wav\")\n",
+    "    \n",
+    "        with open(tmpfile.name, \"rb\") as audio_file_obj:\n",
+    "            transcript = openai.audio.transcriptions.create(\n",
+    "                model=\"whisper-1\", \n",
+    "                file=audio_file_obj\n",
+    "            )\n",
+    "    return transcript.text"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -621,6 +655,12 @@
     "        entry = gr.Textbox(label=\"Chat with our AI Assistant:\", scale=4)\n",
     "        submit_btn = gr.Button(\"Submit\", scale=1)\n",
     "    with gr.Row():\n",
+    "        # Provide a microphone input\n",
+    "        audio_input = gr.Audio(sources=[\"microphone\"], type=\"filepath\", label=\"Speak to our AI Assistant\", scale=4)\n",
+    "        submit_audio_btn = gr.Button(\"Submit Audio\", scale=1)\n",
+    "\n",
+    "\n",
+    "    with gr.Row():\n",
     "        languages = [\"English\", \"Swahili\", \"French\", \"Chinese\", \"German\"]\n",
     "        language_dropdown = gr.Dropdown(\n",
     "            label=\"Select a language for translation\",\n",
@@ -641,7 +681,7 @@
     "    def user_message_updater(user_message, history):\n",
     "        return \"\", history + [[user_message, None]]\n",
     "\n",
-    "    def chat_with_assistant(history, target_language, use_audio):\n",
+    "    def chat_with_assistant(history, target_language, use_audio_output):\n",
     "        message = history[-1][0] # Get the user's message from the last list in history\n",
     "        \n",
     "        messages = [{\"role\": \"system\", \"content\": system_message}]\n",
@@ -677,11 +717,25 @@
     "\n",
     "        history[-1][1] = final_response_content\n",
     "\n",
-    "        if use_audio != \"No\":\n",
+    "        if use_audio_output != \"No\":\n",
     "            talker(final_response_content)\n",
     "\n",
     "        return history, image # Return a tuple of (the updated history, an image)\n",
     "\n",
+    "    # This function ties together the transcription and the chat logic\n",
+    "    def transcribe_and_chat(audio_file, history, target_language, use_audio_output):\n",
+    "        if audio_file:\n",
+    "            # Transcribe the audio file to text\n",
+    "            transcribed_text = transcribe_audio(audio_file)\n",
+    "            \n",
+    "            # Update history with the transcribed text\n",
+    "            new_history = history + [[transcribed_text, None]]\n",
+    "            \n",
+    "            # Call the main chat function with the new history\n",
+    "            return chat_with_assistant(new_history, target_language, use_audio_output)\n",
+    "        else:\n",
+    "            return history, None\n",
+    "\n",
     "    # The event listeners are updated to be triggered by both the textbox and the new button\n",
     "    entry.submit(\n",
     "        user_message_updater,\n",
@@ -704,6 +758,21 @@
     "        inputs=[chatbot, language_dropdown, audio_dropdown],\n",
     "        outputs=[chatbot, image]\n",
     "    )\n",
+    "\n",
+    "    # Event listener to trigger when the user stops recording\n",
+    "    audio_input.stop_recording(\n",
+    "        transcribe_and_chat,\n",
+    "        inputs=[audio_input, chatbot, language_dropdown, audio_dropdown],\n",
+    "        outputs=[chatbot, image],\n",
+    "        queue=False\n",
+    "    )\n",
+    "\n",
+    "    submit_audio_btn.click(\n",
+    "        transcribe_and_chat,\n",
+    "        inputs=[audio_input, chatbot, language_dropdown, audio_dropdown],\n",
+    "        outputs=[chatbot, image],\n",
+    "        queue=False\n",
+    "    )\n",
     "    \n",
     "    clear.click(lambda: None, inputs=None, outputs=[chatbot, image], queue=False)\n",
     "\n",
@@ -717,6 +786,14 @@
    "metadata": {},
    "outputs": [],
    "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3469b07d-2b9a-4409-bb1c-fbdab3248974",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {