Introduce audio input and transcription
@@ -602,6 +602,40 @@
"# talker(\"Well, hi there\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e927f333-7ed5-4625-9e5a-5e0b62f8a684",
"metadata": {},
"outputs": [],
"source": [
"# To transcribe an audio prompt/input\n",
"\n",
"import tempfile\n",
"from pydub import AudioSegment\n",
"from pydub.playback import play\n",
"\n",
"def transcribe_audio(audio_file):\n",
"    \"\"\"\n",
"    Transcribes an audio file using OpenAI's Whisper model.\n",
"    \"\"\"\n",
"    if audio_file is None:\n",
"        return \"\"\n",
"    \n",
" # The Gradio Audio component returns a tuple (sample_rate, numpy_array)\n",
|
||||
" # We need to save this to a file to pass to the OpenAI API\n",
|
||||
" with tempfile.NamedTemporaryFile(suffix=\".wav\", delete=True) as tmpfile:\n",
|
||||
" audio = AudioSegment.from_file(audio_file, format=\"wav\")\n",
|
||||
" audio.export(tmpfile.name, format=\"wav\")\n",
|
||||
" \n",
|
||||
" with open(tmpfile.name, \"rb\") as audio_file_obj:\n",
|
||||
" transcript = openai.audio.transcriptions.create(\n",
|
||||
" model=\"whisper-1\", \n",
|
||||
" file=audio_file_obj\n",
|
||||
" )\n",
|
||||
" return transcript.text"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
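For reference, the Whisper call at the core of transcribe_audio can be exercised on its own. A minimal sketch, assuming the openai client is already configured elsewhere in the notebook and that a recorded WAV file exists at the illustrative path below:

    import openai

    audio_path = "recording.wav"  # illustrative path; gr.Audio(type="filepath") hands the callback a path like this
    with open(audio_path, "rb") as f:
        # whisper-1 accepts common audio formats directly, so the pydub re-export
        # in the cell above is mainly a normalization step
        transcript = openai.audio.transcriptions.create(model="whisper-1", file=f)
    print(transcript.text)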
@@ -621,6 +655,12 @@
"        entry = gr.Textbox(label=\"Chat with our AI Assistant:\", scale=4)\n",
"        submit_btn = gr.Button(\"Submit\", scale=1)\n",
"    with gr.Row():\n",
"        # Provide a microphone input\n",
"        audio_input = gr.Audio(sources=[\"microphone\"], type=\"filepath\", label=\"Speak to our AI Assistant\", scale=4)\n",
"        submit_audio_btn = gr.Button(\"Submit Audio\", scale=1)\n",
"\n",
"\n",
"    with gr.Row():\n",
"        languages = [\"English\", \"Swahili\", \"French\", \"Chinese\", \"German\"]\n",
"        language_dropdown = gr.Dropdown(\n",
"            label=\"Select a language for translation\",\n",
@@ -641,7 +681,7 @@
"    def user_message_updater(user_message, history):\n",
"        return \"\", history + [[user_message, None]]\n",
"\n",
-"    def chat_with_assistant(history, target_language, use_audio):\n",
+"    def chat_with_assistant(history, target_language, use_audio_output):\n",
"        message = history[-1][0] # Get the user's message from the last list in history\n",
"        \n",
"        messages = [{\"role\": \"system\", \"content\": system_message}]\n",
@@ -677,11 +717,25 @@
"\n",
"        history[-1][1] = final_response_content\n",
"\n",
-"        if use_audio != \"No\":\n",
+"        if use_audio_output != \"No\":\n",
"            talker(final_response_content)\n",
"\n",
"        return history, image # Return a tuple of (the updated history, an image)\n",
"\n",
"    # This function ties together the transcription and the chat logic\n",
"    def transcribe_and_chat(audio_file, history, target_language, use_audio_output):\n",
"        if audio_file:\n",
"            # Transcribe the audio file to text\n",
"            transcribed_text = transcribe_audio(audio_file)\n",
"            \n",
"            # Update history with the transcribed text\n",
"            new_history = history + [[transcribed_text, None]]\n",
"            \n",
"            # Call the main chat function with the new history\n",
"            return chat_with_assistant(new_history, target_language, use_audio_output)\n",
"        else:\n",
"            return history, None\n",
"\n",
"    # The event listeners are updated to be triggered by both the textbox and the new button\n",
"    entry.submit(\n",
"        user_message_updater,\n",
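A quick sketch of the history handoff that transcribe_and_chat relies on (values are illustrative): history is a list of [user, assistant] pairs, and the transcription becomes a new user turn whose assistant slot is later filled in by chat_with_assistant:

    history = [["Hi", "Hello! How can I help?"]]         # existing [user, assistant] pairs
    transcribed_text = "What's the weather like?"        # output of transcribe_audio
    new_history = history + [[transcribed_text, None]]   # new user turn, reply still pending
    # chat_with_assistant later sets new_history[-1][1] to the model's response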
@@ -704,6 +758,21 @@
"        inputs=[chatbot, language_dropdown, audio_dropdown],\n",
"        outputs=[chatbot, image]\n",
"    )\n",
"\n",
"    # Event listener to trigger on audio stop\n",
"    audio_input.stop(\n",
"        transcribe_and_chat,\n",
"        inputs=[audio_input, chatbot, language_dropdown, audio_dropdown],\n",
"        outputs=[chatbot, image],\n",
"        queue=False\n",
"    )\n",
"\n",
"    submit_audio_btn.click(\n",
"        transcribe_and_chat,\n",
"        inputs=[audio_input, chatbot, language_dropdown, audio_dropdown],\n",
"        outputs=[chatbot, image],\n",
"        queue=False\n",
"    )\n",
"    \n",
"    clear.click(lambda: None, inputs=None, outputs=[chatbot, image], queue=False)\n",
"\n",
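The wiring pattern above (microphone input, transcription, then the chat handler, triggered both on recording stop and on a button click) also works in isolation. A stripped-down sketch, assuming a transcribe_audio helper like the one defined earlier; widget labels and the echo handler are illustrative and there is no LLM call:

    import gradio as gr

    def echo_transcript(audio_file, history):
        # Mirrors transcribe_and_chat minus the chat logic: transcribe and append a turn
        if not audio_file:
            return history
        text = transcribe_audio(audio_file)
        return history + [[text, "(assistant reply would go here)"]]

    with gr.Blocks() as demo:
        chatbot = gr.Chatbot()
        mic = gr.Audio(sources=["microphone"], type="filepath", label="Speak")
        send = gr.Button("Submit Audio")
        # Same two triggers as in the notebook: the audio component stopping, or an explicit click
        mic.stop(echo_transcript, inputs=[mic, chatbot], outputs=[chatbot], queue=False)
        send.click(echo_transcript, inputs=[mic, chatbot], outputs=[chatbot], queue=False)

    # demo.launch()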
@@ -717,6 +786,14 @@
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "3469b07d-2b9a-4409-bb1c-fbdab3248974",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {