gpt4free/g4f/integration/markitdown/_transcribe_audio.py
hlohaus c0d31c2abb refactor: improve media rendering and response formatting with precise changes
- Modified g4f/providers/response.py to ensure format_images_markdown returns the result directly without additional flags in the 'format_images_markdown' function.
- Updated g4f/gui/server/api.py to add 'tempfiles' parameter with default empty list to '_create_response_stream' method.
- Changed or added code in API response handling to iterate over 'tempfiles' and attempt to remove each file after response completion, with exception handling (try-except block with logger.exception).
- Adjusted g4f/Tools/files.py to fix tempfile creation: corrected the 'suffix' parameter in 'get_tempfile' to use 'suffix' directly instead of splitting.
- In g4f/tools/media.py, changed 'render_part' function to handle 'text' key properly, checking 'part.get("text")' and returning a dictionary with 'type': 'text' and 'text': value, if present.
2025-05-19 08:15:21 +02:00

49 lines
1.9 KiB
Python

import io
import sys
from typing import BinaryIO
from markitdown._exceptions import MissingDependencyException
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
# Suppress some warnings on library import
import warnings
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=SyntaxWarning)
import speech_recognition as sr
import pydub
except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()
def transcribe_audio(file_stream: BinaryIO, *, audio_format: str = "wav", language: str = "en-US") -> str:
# Check for installed dependencies
if _dependency_exc_info is not None:
raise MissingDependencyException(
"Speech transcription requires installing MarkItdown with the [audio-transcription] optional dependencies. E.g., `pip install markitdown[audio-transcription]` or `pip install markitdown[all]`"
) from _dependency_exc_info[
1
].with_traceback( # type: ignore[union-attr]
_dependency_exc_info[2]
)
if audio_format in ["wav", "aiff", "flac"]:
audio_source = file_stream
elif audio_format in ["mp3", "mp4", "webm"]:
audio_segment = pydub.AudioSegment.from_file(file_stream, format=audio_format)
audio_source = io.BytesIO()
audio_segment.export(audio_source, format="wav")
audio_source.seek(0)
else:
raise ValueError(f"Unsupported audio format: {audio_format}")
recognizer = sr.Recognizer()
with sr.AudioFile(audio_source) as source:
audio = recognizer.record(source)
transcript = recognizer.recognize_google(audio, language=language).strip()
return "[No speech detected]" if transcript == "" else transcript