mirror of
https://github.com/xtekky/gpt4free.git
synced 2025-12-06 02:30:41 -08:00
feat: add audio speech generation endpoint and media handling refactor
- Added new `/v1/audio/speech` and `/api/{path_provider}/audio/speech` endpoints in `g4f/api/__init__.py` for generating speech from text
- Introduced `AudioSpeechConfig` model in `g4f/api/stubs.py` with fields for input, model, provider, voice, instructions, and response format
- Updated `PollinationsAI.py` to support `modalities` in `kwargs` when checking for audio
- Set default voice for audio models in `PollinationsAI.py` if not provided in `kwargs`
- Added debug print in `PollinationsAI.py` to log request data to text API endpoint
- Extended supported FastAPI response types in `g4f/api/__init__.py` to include `FileResponse` from `starlette.responses`
- Added `BackgroundTask` to clean up generated audio files after serving in `g4f/api/__init__.py`
- Modified `AnyProvider.py` to include `EdgeTTS`, `gTTS`, and `MarkItDown` as audio providers when `audio` is in `kwargs` or `modalities`
- Created `resolve_media` helper in `g4f/client/__init__.py` to standardize media handling for audio/image input
- Replaced manual media preprocessing in `Completions`, `AsyncCompletions`, and `Images` classes with `resolve_media`
- Added `/docs/README.md` with a link to the documentation site
This commit is contained in:
parent
b15a83ae13
commit
c3632984f7
6 changed files with 90 additions and 29 deletions
1
docs/README.md
Normal file
1
docs/README.md
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
[Documentation](https://gpt4free.github.io/docs/main.html)
|
||||||
|
|
@ -177,7 +177,7 @@ class PollinationsAI(AsyncGeneratorProvider, ProviderModelMixin):
|
||||||
# Load model list
|
# Load model list
|
||||||
cls.get_models()
|
cls.get_models()
|
||||||
if not model:
|
if not model:
|
||||||
has_audio = "audio" in kwargs
|
has_audio = "audio" in kwargs or "audio" in kwargs.get("modalities", [])
|
||||||
if not has_audio and media is not None:
|
if not has_audio and media is not None:
|
||||||
for media_data, filename in media:
|
for media_data, filename in media:
|
||||||
if is_data_an_audio(media_data, filename):
|
if is_data_an_audio(media_data, filename):
|
||||||
|
|
@ -311,6 +311,8 @@ class PollinationsAI(AsyncGeneratorProvider, ProviderModelMixin):
|
||||||
|
|
||||||
async with ClientSession(headers=DEFAULT_HEADERS, connector=get_connector(proxy=proxy)) as session:
|
async with ClientSession(headers=DEFAULT_HEADERS, connector=get_connector(proxy=proxy)) as session:
|
||||||
if model in cls.audio_models:
|
if model in cls.audio_models:
|
||||||
|
if "audio" in kwargs and kwargs.get("audio", {}).get("voice") is None:
|
||||||
|
kwargs["audio"]["voice"] = cls.audio_models[model][0]
|
||||||
url = cls.text_api_endpoint
|
url = cls.text_api_endpoint
|
||||||
stream = False
|
stream = False
|
||||||
else:
|
else:
|
||||||
|
|
@ -329,6 +331,7 @@ class PollinationsAI(AsyncGeneratorProvider, ProviderModelMixin):
|
||||||
"cache": cache,
|
"cache": cache,
|
||||||
**extra_parameters
|
**extra_parameters
|
||||||
})
|
})
|
||||||
|
print(f"Requesting {url} with data: {data}")
|
||||||
async with session.post(url, json=data) as response:
|
async with session.post(url, json=data) as response:
|
||||||
await raise_for_status(response)
|
await raise_for_status(response)
|
||||||
if response.headers["content-type"].startswith("text/plain"):
|
if response.headers["content-type"].startswith("text/plain"):
|
||||||
|
|
|
||||||
|
|
@ -13,7 +13,7 @@ import asyncio
|
||||||
from urllib.parse import quote_plus
|
from urllib.parse import quote_plus
|
||||||
from fastapi import FastAPI, Response, Request, UploadFile, Form, Depends
|
from fastapi import FastAPI, Response, Request, UploadFile, Form, Depends
|
||||||
from fastapi.middleware.wsgi import WSGIMiddleware
|
from fastapi.middleware.wsgi import WSGIMiddleware
|
||||||
from fastapi.responses import StreamingResponse, RedirectResponse, HTMLResponse, JSONResponse
|
from fastapi.responses import StreamingResponse, RedirectResponse, HTMLResponse, JSONResponse, FileResponse
|
||||||
from fastapi.exceptions import RequestValidationError
|
from fastapi.exceptions import RequestValidationError
|
||||||
from fastapi.security import APIKeyHeader
|
from fastapi.security import APIKeyHeader
|
||||||
from starlette.exceptions import HTTPException
|
from starlette.exceptions import HTTPException
|
||||||
|
|
@ -30,6 +30,7 @@ from fastapi.encoders import jsonable_encoder
|
||||||
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials, HTTPBasic
|
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials, HTTPBasic
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
from starlette.responses import FileResponse
|
from starlette.responses import FileResponse
|
||||||
|
from starlette.background import BackgroundTask
|
||||||
from types import SimpleNamespace
|
from types import SimpleNamespace
|
||||||
from typing import Union, Optional, List
|
from typing import Union, Optional, List
|
||||||
|
|
||||||
|
|
@ -49,6 +50,7 @@ from g4f.image.copy_images import get_media_dir, copy_media, get_source_url
|
||||||
from g4f.errors import ProviderNotFoundError, ModelNotFoundError, MissingAuthError, NoValidHarFileError
|
from g4f.errors import ProviderNotFoundError, ModelNotFoundError, MissingAuthError, NoValidHarFileError
|
||||||
from g4f.cookies import read_cookie_files, get_cookies_dir
|
from g4f.cookies import read_cookie_files, get_cookies_dir
|
||||||
from g4f.providers.types import ProviderType
|
from g4f.providers.types import ProviderType
|
||||||
|
from g4f.providers.response import AudioResponse
|
||||||
from g4f.providers.any_provider import AnyProvider
|
from g4f.providers.any_provider import AnyProvider
|
||||||
from g4f import Provider
|
from g4f import Provider
|
||||||
from g4f.gui import get_gui_app
|
from g4f.gui import get_gui_app
|
||||||
|
|
@ -58,7 +60,7 @@ from .stubs import (
|
||||||
ProviderResponseModel, ModelResponseModel,
|
ProviderResponseModel, ModelResponseModel,
|
||||||
ErrorResponseModel, ProviderResponseDetailModel,
|
ErrorResponseModel, ProviderResponseDetailModel,
|
||||||
FileResponseModel, UploadResponseModel,
|
FileResponseModel, UploadResponseModel,
|
||||||
TranscriptionResponseModel
|
TranscriptionResponseModel, AudioSpeechConfig
|
||||||
)
|
)
|
||||||
from g4f import debug
|
from g4f import debug
|
||||||
|
|
||||||
|
|
@ -492,10 +494,11 @@ class Api:
|
||||||
}
|
}
|
||||||
@self.app.post("/v1/audio/transcriptions", responses=responses)
|
@self.app.post("/v1/audio/transcriptions", responses=responses)
|
||||||
@self.app.post("/api/{path_provider}/audio/transcriptions", responses=responses)
|
@self.app.post("/api/{path_provider}/audio/transcriptions", responses=responses)
|
||||||
async def generate_image(
|
@self.app.post("/api/markitdown", responses=responses)
|
||||||
|
async def convert(
|
||||||
file: UploadFile,
|
file: UploadFile,
|
||||||
model: Annotated[Optional[str], Form()] = None,
|
model: Annotated[Optional[str], Form()] = None,
|
||||||
provider: Annotated[Optional[str], Form()] = AppConfig.media_provider,
|
provider: Annotated[Optional[str], Form()] = "MarkItDown",
|
||||||
path_provider: str = None,
|
path_provider: str = None,
|
||||||
prompt: Annotated[Optional[str], Form()] = "Transcribe this audio",
|
prompt: Annotated[Optional[str], Form()] = "Transcribe this audio",
|
||||||
api_key: Annotated[Optional[str], Form()] = None,
|
api_key: Annotated[Optional[str], Form()] = None,
|
||||||
|
|
@ -525,6 +528,54 @@ class Api:
|
||||||
logger.exception(e)
|
logger.exception(e)
|
||||||
return ErrorResponse.from_exception(e, None, HTTP_500_INTERNAL_SERVER_ERROR)
|
return ErrorResponse.from_exception(e, None, HTTP_500_INTERNAL_SERVER_ERROR)
|
||||||
|
|
||||||
|
responses = {
|
||||||
|
HTTP_200_OK: {"class": FileResponse},
|
||||||
|
HTTP_401_UNAUTHORIZED: {"model": ErrorResponseModel},
|
||||||
|
HTTP_404_NOT_FOUND: {"model": ErrorResponseModel},
|
||||||
|
HTTP_500_INTERNAL_SERVER_ERROR: {"model": ErrorResponseModel},
|
||||||
|
}
|
||||||
|
@self.app.post("/v1/audio/speech", responses=responses)
|
||||||
|
@self.app.post("/api/{path_provider}/audio/speech", responses=responses)
|
||||||
|
async def generate_speech(
    config: AudioSpeechConfig,
    provider: str = AppConfig.media_provider,
    credentials: Annotated[HTTPAuthorizationCredentials, Depends(Api.security)] = None
):
    """Generate speech for ``config.input`` and return the audio as a file.

    The text is sent through ``self.client.chat.completions.create`` with an
    ``audio`` option built from ``config.voice`` and ``config.response_format``.
    When the completion's content is an ``AudioResponse``, its ``data`` path is
    mapped from the ``/media`` prefix to the media directory, served with a
    ``FileResponse`` and deleted by a background task once it has been sent.

    Args:
        config: Request body describing the text, model, provider, voice,
            instructions and response format.
        provider: Provider name; defaults to ``AppConfig.media_provider``.
            When it is not ``None`` it takes precedence over
            ``config.provider``.
        credentials: Optional bearer credentials; the token is forwarded as
            ``api_key`` unless it equals ``"secret"``.

    Returns:
        A ``FileResponse`` with the generated audio, or an ``ErrorResponse``
        with status 404 (unknown model/provider), 401 (missing auth) or 500
        (any other failure, including a completion without audio).
    """
    # NOTE(review): the "/api/{path_provider}/audio/speech" route declares a
    # path parameter that is not part of this signature, so it looks unused
    # here - confirm whether it should select the provider.
    api_key = None
    if credentials is not None and credentials.credentials != "secret":
        api_key = credentials.credentials
    try:
        response = await self.client.chat.completions.create(
            messages=[
                {"role": "user", "content": f"{config.instrcutions} Text: {config.input}"}
            ],
            model=config.model,
            provider=config.provider if provider is None else provider,
            prompt=config.input,
            audio=filter_none(voice=config.voice, format=config.response_format),
            **filter_none(
                api_key=api_key,
            )
        )
        content = response.choices[0].message.content
        if not isinstance(content, AudioResponse):
            # Without this guard the handler fell off the end and answered
            # 200 with an empty body; raise so the generic handler below
            # logs the problem and returns a 500 error instead.
            raise RuntimeError("The provider did not return an audio response.")
        audio_path = content.data.replace("/media", get_media_dir())

        def delete_file():
            # Best-effort cleanup after the file was served; a failure is
            # logged but must not affect the already-sent response.
            try:
                os.remove(audio_path)
            except Exception as e:
                logger.exception(e)

        return FileResponse(audio_path, background=BackgroundTask(delete_file))
    except (ModelNotFoundError, ProviderNotFoundError) as e:
        logger.exception(e)
        return ErrorResponse.from_exception(e, None, HTTP_404_NOT_FOUND)
    except MissingAuthError as e:
        logger.exception(e)
        return ErrorResponse.from_exception(e, None, HTTP_401_UNAUTHORIZED)
    except Exception as e:
        logger.exception(e)
        return ErrorResponse.from_exception(e, None, HTTP_500_INTERNAL_SERVER_ERROR)
|
||||||
|
|
||||||
@self.app.post("/v1/upload_cookies", responses={
|
@self.app.post("/v1/upload_cookies", responses={
|
||||||
HTTP_200_OK: {"model": List[FileResponseModel]},
|
HTTP_200_OK: {"model": List[FileResponseModel]},
|
||||||
})
|
})
|
||||||
|
|
|
||||||
|
|
@ -118,4 +118,12 @@ class FileResponseModel(BaseModel):
|
||||||
class TranscriptionResponseModel(BaseModel):
|
class TranscriptionResponseModel(BaseModel):
|
||||||
text: str
|
text: str
|
||||||
model: str
|
model: str
|
||||||
provider: str
|
provider: str
|
||||||
|
|
||||||
|
class AudioSpeechConfig(BaseModel):
    # Request body of the audio speech endpoints.

    # Text that should be spoken (required).
    input: str
    # Optional model name passed on to the completion call.
    model: Optional[str] = None
    # Optional provider name; only used when the endpoint's own provider
    # argument is None.
    provider: Optional[str] = None
    # Optional voice, forwarded as audio["voice"]; omitted when None.
    voice: Optional[str] = None
    # Instruction text that is prepended to the input in the user message.
    # NOTE: the field name is misspelled, but it is the name used on the wire
    # and read by the speech endpoint, so it must not be renamed in isolation.
    instrcutions: str = "Speech this text in a natural way."
    # Optional output format, forwarded as audio["format"]; omitted when None.
    response_format: Optional[str] = None
|
||||||
|
|
@ -45,6 +45,17 @@ def add_chunk(content, chunk):
|
||||||
content = str(content) + str(chunk)
|
content = str(content) + str(chunk)
|
||||||
return content
|
return content
|
||||||
|
|
||||||
|
def resolve_media(kwargs: dict, image = None, image_name: str = None) -> None:
    """Normalize the media arguments in ``kwargs`` to ``(data, name)`` pairs.

    The result is stored under ``kwargs["media"]``:

    * If ``image`` is given, it replaces any existing media with the single
      pair ``(image, image_name)``.
    * Otherwise a legacy ``"images"`` entry is moved to ``"media"``.
    * A ``"media"`` value that is not a list is wrapped in a list.
    * Every entry that is not already a list/tuple becomes
      ``(entry, getattr(entry, "name", None))``.

    A missing or ``None`` media value is left untouched, so that callers
    checking ``media is not None`` are not handed a bogus ``(None, None)``
    entry. The normalized list is a new object; a list passed in by the
    caller is not modified.

    Args:
        kwargs: Keyword-argument dict to update in place.
        image: Optional single media object.
        image_name: Optional file name belonging to ``image``.
    """
    if image is not None:
        kwargs["media"] = [(image, image_name)]
    elif "images" in kwargs:
        kwargs["media"] = kwargs.pop("images")

    media = kwargs.get("media")
    if media is None:
        # Nothing to normalize; an explicit None keeps meaning "no media".
        return
    if not isinstance(media, list):
        media = [media]
    kwargs["media"] = [
        item if isinstance(item, (list, tuple)) else (item, getattr(item, "name", None))
        for item in media
    ]
|
||||||
|
|
||||||
# Synchronous iter_response function
|
# Synchronous iter_response function
|
||||||
def iter_response(
|
def iter_response(
|
||||||
response: Union[Iterator[Union[str, ResponseType]]],
|
response: Union[Iterator[Union[str, ResponseType]]],
|
||||||
|
|
@ -296,13 +307,7 @@ class Completions:
|
||||||
) -> ChatCompletion:
|
) -> ChatCompletion:
|
||||||
if isinstance(messages, str):
|
if isinstance(messages, str):
|
||||||
messages = [{"role": "user", "content": messages}]
|
messages = [{"role": "user", "content": messages}]
|
||||||
if image is not None:
|
resolve_media(kwargs, image, image_name)
|
||||||
kwargs["media"] = [(image, image_name)]
|
|
||||||
elif "images" in kwargs:
|
|
||||||
kwargs["media"] = kwargs.pop("images")
|
|
||||||
for idx, media in enumerate(kwargs.get("media", [])):
|
|
||||||
if not isinstance(media, (list, tuple)):
|
|
||||||
kwargs["media"][idx] = (media, getattr(media, "name", None))
|
|
||||||
if provider is None:
|
if provider is None:
|
||||||
provider = self.provider
|
provider = self.provider
|
||||||
if provider is None:
|
if provider is None:
|
||||||
|
|
@ -483,6 +488,7 @@ class Images:
|
||||||
async def async_create_variation(
|
async def async_create_variation(
|
||||||
self,
|
self,
|
||||||
image: ImageType,
|
image: ImageType,
|
||||||
|
image_name: str = None,
|
||||||
model: Optional[str] = None,
|
model: Optional[str] = None,
|
||||||
provider: Optional[ProviderType] = None,
|
provider: Optional[ProviderType] = None,
|
||||||
response_format: Optional[str] = None,
|
response_format: Optional[str] = None,
|
||||||
|
|
@ -494,11 +500,7 @@ class Images:
|
||||||
if proxy is None:
|
if proxy is None:
|
||||||
proxy = self.client.proxy
|
proxy = self.client.proxy
|
||||||
prompt = "create a variation of this image"
|
prompt = "create a variation of this image"
|
||||||
if image is not None:
|
resolve_media(kwargs, image, image_name)
|
||||||
kwargs["media"] = image
|
|
||||||
for idx, media in enumerate(kwargs.get("media", [])):
|
|
||||||
if not isinstance(media, (list, tuple)):
|
|
||||||
kwargs["media"][idx] = (media, getattr(media, "name", None))
|
|
||||||
error = None
|
error = None
|
||||||
response = None
|
response = None
|
||||||
if isinstance(provider_handler, IterListProvider):
|
if isinstance(provider_handler, IterListProvider):
|
||||||
|
|
@ -600,13 +602,7 @@ class AsyncCompletions:
|
||||||
) -> Awaitable[ChatCompletion]:
|
) -> Awaitable[ChatCompletion]:
|
||||||
if isinstance(messages, str):
|
if isinstance(messages, str):
|
||||||
messages = [{"role": "user", "content": messages}]
|
messages = [{"role": "user", "content": messages}]
|
||||||
if image is not None:
|
resolve_media(kwargs, image, image_name)
|
||||||
kwargs["media"] = [(image, image_name)]
|
|
||||||
elif "images" in kwargs:
|
|
||||||
kwargs["media"] = kwargs.pop("images")
|
|
||||||
for idx, media in enumerate(kwargs.get("media", [])):
|
|
||||||
if not isinstance(media, (list, tuple)):
|
|
||||||
kwargs["media"][idx] = (media, getattr(media, "name", None))
|
|
||||||
if provider is None:
|
if provider is None:
|
||||||
provider = self.provider
|
provider = self.provider
|
||||||
if provider is None:
|
if provider is None:
|
||||||
|
|
|
||||||
|
|
@ -11,7 +11,7 @@ from ..Provider.hf_space import HuggingSpace
|
||||||
from .. import Provider
|
from .. import Provider
|
||||||
from .. import models
|
from .. import models
|
||||||
from ..Provider import Cloudflare, LMArenaProvider, Gemini, Grok, DeepSeekAPI, PerplexityLabs, LambdaChat, PollinationsAI, FreeRouter
|
from ..Provider import Cloudflare, LMArenaProvider, Gemini, Grok, DeepSeekAPI, PerplexityLabs, LambdaChat, PollinationsAI, FreeRouter
|
||||||
from ..Provider import Microsoft_Phi_4, DeepInfraChat, Blackbox
|
from ..Provider import Microsoft_Phi_4, DeepInfraChat, Blackbox, EdgeTTS, gTTS, MarkItDown
|
||||||
from .base_provider import AsyncGeneratorProvider, ProviderModelMixin
|
from .base_provider import AsyncGeneratorProvider, ProviderModelMixin
|
||||||
|
|
||||||
class AnyProvider(AsyncGeneratorProvider, ProviderModelMixin):
|
class AnyProvider(AsyncGeneratorProvider, ProviderModelMixin):
|
||||||
|
|
@ -124,15 +124,17 @@ class AnyProvider(AsyncGeneratorProvider, ProviderModelMixin):
|
||||||
elif not model or model == cls.default_model:
|
elif not model or model == cls.default_model:
|
||||||
model = ""
|
model = ""
|
||||||
has_image = False
|
has_image = False
|
||||||
has_audio = "audio" in kwargs
|
has_audio = False
|
||||||
if not has_audio and media is not None:
|
if not has_audio and media is not None:
|
||||||
for media_data, filename in media:
|
for media_data, filename in media:
|
||||||
if is_data_an_audio(media_data, filename):
|
if is_data_an_audio(media_data, filename):
|
||||||
has_audio = True
|
has_audio = True
|
||||||
break
|
break
|
||||||
has_image = True
|
has_image = True
|
||||||
if has_audio:
|
if "audio" in kwargs or "audio" in kwargs.get("modalities", []):
|
||||||
providers = [PollinationsAI, Microsoft_Phi_4]
|
providers = [PollinationsAI, EdgeTTS, gTTS]
|
||||||
|
elif has_audio:
|
||||||
|
providers = [PollinationsAI, Microsoft_Phi_4, MarkItDown]
|
||||||
elif has_image:
|
elif has_image:
|
||||||
providers = models.default_vision.best_provider.providers
|
providers = models.default_vision.best_provider.providers
|
||||||
else:
|
else:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue