gpt4free/g4f/integration/markitdown/_transcribe_audio.py
hlohaus 9461949542 feat: improve media handling, file conversion, and error management
- Added UUID-based "x-xai-request-id" header and 403 error handling in Grok.py
- Updated backend_api.py to handle empty media results and unsupported file types with error raising and file cleanup
- Simplified render logic in website.py by removing is_live flag and related code
- Changed "audio/wav" MIME type to "audio/x-wav" in image/__init__.py
- Added is_valid_media and is_valid_audio functions to image/__init__.py for stricter media validation
- Enhanced MarkItDown integration in markitdown/__init__.py with convert_stream method supporting non-seekable streams
- Modified _transcribe_audio.py to use recognize_faster_whisper if available, fallback to recognize_google
- Updated providers/helper.py to prioritize "text" key in to_string function
- Improved stream_read_files in files.py to skip DOWNLOADS_FILE and adjust code block formatting
- Added get_filename_from_url utility in files.py for consistent filename generation from URLs
- Enhanced download_urls in files.py to use MarkItDown for URL conversion and improved error logging
- Improved render_part and related functions in media.py to use new media validation logic and handle more cases
- Adjusted merge_media and render_messages in media.py for stricter part filtering and validation
2025-05-20 22:40:12 +02:00

54 lines
2.1 KiB
Python

import io
import sys
from typing import BinaryIO
from markitdown._exceptions import MissingDependencyException
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
# Suppress some warnings on library import
import warnings
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=SyntaxWarning)
import speech_recognition as sr
import pydub
except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()
def transcribe_audio(file_stream: BinaryIO, *, audio_format: str = "wav", language: str = None) -> str:
# Check for installed dependencies
if _dependency_exc_info is not None:
raise MissingDependencyException(
"Speech transcription requires installing MarkItdown with the [audio-transcription] optional dependencies. E.g., `pip install markitdown[audio-transcription]` or `pip install markitdown[all]`"
) from _dependency_exc_info[
1
].with_traceback( # type: ignore[union-attr]
_dependency_exc_info[2]
)
if audio_format in ["wav", "aiff", "flac"]:
audio_source = file_stream
elif audio_format in ["mp3", "mp4", "webm"]:
audio_segment = pydub.AudioSegment.from_file(file_stream, format=audio_format)
audio_source = io.BytesIO()
audio_segment.export(audio_source, format="wav")
audio_source.seek(0)
else:
raise ValueError(f"Unsupported audio format: {audio_format}")
recognizer = sr.Recognizer()
with sr.AudioFile(audio_source) as source:
audio = recognizer.record(source)
if language is None:
language = "en-US"
try:
transcript = recognizer.recognize_faster_whisper(audio, language=language.split("-")[0]).strip()
except ImportError:
transcript = recognizer.recognize_google(audio, language=language).strip()
return "[No speech detected]" if transcript == "" else transcript.strip()