Introduce audio input and transcription

Author: Elijah Rwothoromo
Date: 2025-08-11 10:26:27 +03:00
Parent: f4e9798f3d
Commit: 06b191a2d0


@@ -602,6 +602,40 @@
"# talker(\"Well, hi there\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e927f333-7ed5-4625-9e5a-5e0b62f8a684",
"metadata": {},
"outputs": [],
"source": [
"# To transcribe an audio prompt/input\n",
"\n",
"import tempfile\n",
"from pydub import AudioSegment\n",
"from pydub.playback import play\n",
"\n",
"def transcribe_audio(audio_file):\n",
" \"\"\"\n",
" Transcribes an audio file using OpenAI's Whisper model.\n",
" \"\"\"\n",
" if audio_file is None:\n",
" return \"\"\n",
" \n",
" # The Gradio Audio component returns a tuple (sample_rate, numpy_array)\n",
" # We need to save this to a file to pass to the OpenAI API\n",
" with tempfile.NamedTemporaryFile(suffix=\".wav\", delete=True) as tmpfile:\n",
" audio = AudioSegment.from_file(audio_file, format=\"wav\")\n",
" audio.export(tmpfile.name, format=\"wav\")\n",
" \n",
" with open(tmpfile.name, \"rb\") as audio_file_obj:\n",
" transcript = openai.audio.transcriptions.create(\n",
" model=\"whisper-1\", \n",
" file=audio_file_obj\n",
" )\n",
" return transcript.text"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -621,6 +655,12 @@
" entry = gr.Textbox(label=\"Chat with our AI Assistant:\", scale=4)\n",
" submit_btn = gr.Button(\"Submit\", scale=1)\n",
" with gr.Row():\n",
" # Provide a microphone input\n",
" audio_input = gr.Audio(sources=[\"microphone\"], type=\"filepath\", label=\"Speak to our AI Assistant\", scale=4)\n",
" submit_audio_btn = gr.Button(\"Submit Audio\", scale=1)\n",
"\n",
"\n",
" with gr.Row():\n",
" languages = [\"English\", \"Swahili\", \"French\", \"Chinese\", \"German\"]\n",
" language_dropdown = gr.Dropdown(\n",
" label=\"Select a language for translation\",\n",
@@ -641,7 +681,7 @@
" def user_message_updater(user_message, history):\n",
" return \"\", history + [[user_message, None]]\n",
"\n",
" def chat_with_assistant(history, target_language, use_audio):\n",
" def chat_with_assistant(history, target_language, use_audio_output):\n",
" message = history[-1][0] # Get the user's message from the last list in history\n",
" \n",
" messages = [{\"role\": \"system\", \"content\": system_message}]\n",
@@ -677,11 +717,25 @@
"\n",
" history[-1][1] = final_response_content\n",
"\n",
" if use_audio != \"No\":\n",
" if use_audio_output != \"No\":\n",
" talker(final_response_content)\n",
"\n",
" return history, image # Return a tuple of (the updated history, an image)\n",
"\n",
" # This function ties together the transcription and the chat logic\n",
" def transcribe_and_chat(audio_file, history, target_language, use_audio_output):\n",
" if audio_file:\n",
" # Transcribe the audio file to text\n",
" transcribed_text = transcribe_audio(audio_file)\n",
" \n",
" # Update history with the transcribed text\n",
" new_history = history + [[transcribed_text, None]]\n",
" \n",
" # Call the main chat function with the new history\n",
" return chat_with_assistant(new_history, target_language, use_audio_output)\n",
" else:\n",
" return history, None\n",
"\n",
" # The event listeners are updated to be triggered by both the textbox and the new button\n",
" entry.submit(\n",
" user_message_updater,\n",
@@ -704,6 +758,21 @@
" inputs=[chatbot, language_dropdown, audio_dropdown],\n",
" outputs=[chatbot, image]\n",
" )\n",
"\n",
" # Event listener to trigger on audio stop\n",
" audio_input.stop(\n",
" transcribe_and_chat,\n",
" inputs=[audio_input, chatbot, language_dropdown, audio_dropdown],\n",
" outputs=[chatbot, image],\n",
" queue=False\n",
" )\n",
"\n",
" submit_audio_btn.click(\n",
" transcribe_and_chat,\n",
" inputs=[audio_input, chatbot, language_dropdown, audio_dropdown],\n",
" outputs=[chatbot, image],\n",
" queue=False\n",
" )\n",
" \n",
" clear.click(lambda: None, inputs=None, outputs=[chatbot, image], queue=False)\n",
"\n",
@@ -717,6 +786,14 @@
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "3469b07d-2b9a-4409-bb1c-fbdab3248974",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {