Mirror of https://github.com/xtekky/gpt4free.git
Add MarkItDownTool and TextToAudioTool; enhance scrape_text and fetch_and_scrape functions with metadata support
Parent: 8df4bc7118
Commit: 5d53e58d2c
4 changed files with 201 additions and 11 deletions
@@ -8,6 +8,6 @@ through the Model Context Protocol standard, allowing AI assistants to access:
 """
 
 from .server import MCPServer
-from .tools import WebSearchTool, WebScrapeTool, ImageGenerationTool
+from .tools import MarkItDownTool, TextToAudioTool, WebSearchTool, WebScrapeTool, ImageGenerationTool
 
-__all__ = ['MCPServer', 'WebSearchTool', 'WebScrapeTool', 'ImageGenerationTool']
+__all__ = ['MCPServer', 'MarkItDownTool', 'TextToAudioTool', 'WebSearchTool', 'WebScrapeTool', 'ImageGenerationTool']
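
The widened import and __all__ make the two new tools part of the package's public surface. A minimal sketch of what that enables, assuming g4f with this commit is installed (the g4f.mcp package path is inferred from the g4f/mcp/tools.py file further down):

import g4f.mcp

# The expanded __all__ now advertises the new tools alongside the existing ones.
print(g4f.mcp.__all__)
# ['MCPServer', 'MarkItDownTool', 'TextToAudioTool', 'WebSearchTool', 'WebScrapeTool', 'ImageGenerationTool']

# Tools are plain classes; the description comes from the property shown in tools.py below.
print(g4f.mcp.MarkItDownTool().description)
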
@@ -20,6 +20,7 @@ from ..debug import enable_logging
 enable_logging()
 
+from .tools import MarkItDownTool, TextToAudioTool, WebSearchTool, WebScrapeTool, ImageGenerationTool
 from .tools import WebSearchTool, WebScrapeTool, ImageGenerationTool
 
 
@@ -54,6 +55,8 @@ class MCPServer:
             'web_search': WebSearchTool(),
             'web_scrape': WebScrapeTool(),
             'image_generation': ImageGenerationTool(),
+            'text_to_audio': TextToAudioTool(),
+            'mark_it_down': MarkItDownTool()
         }
         self.server_info = {
             "name": "gpt4free-mcp-server",
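
The registry maps tool names to ready-made instances, and every tool exposes the same async execute(arguments) -> dict contract. A hedged sketch of how a dispatcher could resolve the new entries; the call_tool helper below is illustrative, not the server's actual request handler, and only the registry names and the execute contract are taken from this diff:

import asyncio
from g4f.mcp.tools import (
    ImageGenerationTool, MarkItDownTool, TextToAudioTool, WebScrapeTool, WebSearchTool
)

# Mirror of the registry added above (names and instances taken from the diff).
tools = {
    'web_search': WebSearchTool(),
    'web_scrape': WebScrapeTool(),
    'image_generation': ImageGenerationTool(),
    'text_to_audio': TextToAudioTool(),
    'mark_it_down': MarkItDownTool()
}

async def call_tool(name: str, arguments: dict) -> dict:
    # Illustrative dispatcher: look up the registered tool and run it.
    tool = tools.get(name)
    if tool is None:
        return {"error": f"Unknown tool: {name}"}
    return await tool.execute(arguments)

# Example: request an audio URL from the new text_to_audio tool.
result = asyncio.run(call_tool("text_to_audio", {"prompt": "Hello, world!"}))
print(result.get("audio_url", result))
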

g4f/mcp/tools.py (165 lines changed)

@@ -8,7 +8,6 @@ This module provides MCP tool implementations that wrap gpt4free capabilities:
 
 from __future__ import annotations
 
-import asyncio
 from typing import Any, Dict
 from abc import ABC, abstractmethod
 
@@ -158,7 +157,7 @@ class WebScrapeTool(MCPTool):
                 session=session,
                 url=url,
                 max_words=max_words,
-                add_source=True
+                add_metadata=True
             )
 
             if not content:
@@ -183,7 +182,7 @@ class ImageGenerationTool(MCPTool):
 
     @property
     def description(self) -> str:
-        return "Generate images from text prompts using AI image generation providers. Returns base64-encoded image data."
+        return "Generate images from text prompts using AI image generation providers. Returns a URL to the generated image."
 
     @property
     def input_schema(self) -> Dict[str, Any]:
@@ -291,3 +290,163 @@ class ImageGenerationTool(MCPTool):
             return {
                 "error": f"Image generation failed: {str(e)}"
             }
+
+
+class MarkItDownTool(MCPTool):
+    """MarkItDown tool for converting URLs to markdown format"""
+
+    @property
+    def description(self) -> str:
+        return "Convert a URL to markdown format using MarkItDown. Supports HTTP/HTTPS URLs and returns formatted markdown content."
+
+    @property
+    def input_schema(self) -> Dict[str, Any]:
+        return {
+            "type": "object",
+            "properties": {
+                "url": {
+                    "type": "string",
+                    "description": "The URL to convert to markdown format (must be HTTP/HTTPS)"
+                },
+                "max_content_length": {
+                    "type": "integer",
+                    "description": "Maximum content length for processing (default: 10000)",
+                    "default": 10000
+                }
+            },
+            "required": ["url"]
+        }
+
+    async def execute(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
+        """Execute MarkItDown conversion
+
+        Returns:
+            Dict[str, Any]: Markdown content or error message
+        """
+        try:
+            from ..integration.markitdown import MarkItDown
+        except ImportError as e:
+            return {
+                "error": f"MarkItDown is not installed: {str(e)}"
+            }
+
+        url = arguments.get("url", "")
+        max_content_length = arguments.get("max_content_length", 10000)
+
+        if not url:
+            return {
+                "error": "URL parameter is required"
+            }
+
+        # Validate URL format
+        if not url.startswith(("http://", "https://")):
+            return {
+                "error": "URL must start with http:// or https://"
+            }
+
+        try:
+            # Initialize MarkItDown
+            md = MarkItDown()
+
+            # Convert URL to markdown
+            result = md.convert_url(url)
+
+            if not result:
+                return {
+                    "error": "Failed to convert URL to markdown"
+                }
+
+            # Truncate if content exceeds max length
+            if len(result) > max_content_length:
+                result = result[:max_content_length] + "\n\n[Content truncated...]"
+
+            return {
+                "url": url,
+                "markdown_content": result,
+                "content_length": len(result),
+                "truncated": len(result) > max_content_length
+            }
+
+        except Exception as e:
+            return {
+                "error": f"MarkItDown conversion failed: {str(e)}"
+            }
+
+
+class TextToAudioTool(MCPTool):
+    """TextToAudio tool for generating audio from text prompts using Pollinations AI"""
+
+    @property
+    def description(self) -> str:
+        return "Generate an audio URL from a text prompt using Pollinations AI text-to-speech service. Returns a direct URL to the generated audio file."
+
+    @property
+    def input_schema(self) -> Dict[str, Any]:
+        return {
+            "type": "object",
+            "properties": {
+                "prompt": {
+                    "type": "string",
+                    "description": "The text prompt to the audio model (example: 'Read this: Hello, world!')"
+                },
+                "voice": {
+                    "type": "string",
+                    "description": "Voice option for text-to-speech (default: 'alloy')",
+                    "default": "alloy"
+                },
+                "url_encode": {
+                    "type": "boolean",
+                    "description": "Whether to URL-encode the prompt text (default: True)",
+                    "default": True
+                }
+            },
+            "required": ["prompt"]
+        }
+
+    async def execute(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
+        """Execute text-to-speech conversion
+
+        Returns:
+            Dict[str, Any]: Audio URL or error message
+        """
+        try:
+            import urllib.parse
+        except ImportError as e:
+            return {
+                "error": f"urllib is not available: {str(e)}"
+            }
+
+        prompt = arguments.get("prompt", "")
+        voice = arguments.get("voice", "alloy")
+        url_encode = arguments.get("url_encode", True)
+
+        if not prompt:
+            return {
+                "error": "Prompt parameter is required"
+            }
+
+        # Validate prompt length (reasonable limit for text-to-speech)
+        if len(prompt) > 5000:
+            return {
+                "error": "Prompt is too long (max 5000 characters)"
+            }
+
+        try:
+            # Prepare the prompt for URL
+            if url_encode:
+                encoded_prompt = urllib.parse.quote(prompt)
+            else:
+                encoded_prompt = prompt.replace(" ", "%20")  # Basic space encoding
+
+            # Construct the Pollinations AI text-to-speech URL
+            base_url = "https://text.pollinations.ai"
+            audio_url = f"{base_url}/{encoded_prompt}?voice={voice}"
+
+            return {
+                "prompt": prompt,
+                "voice": voice,
+                "audio_url": audio_url
+            }
+
+        except Exception as e:
+            return {
+                "error": f"Text-to-speech URL generation failed: {str(e)}"
+            }
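
The two added classes can also be exercised directly, outside the MCP server. First, a minimal MarkItDownTool sketch; it assumes g4f is installed together with its MarkItDown integration, the URL is illustrative, and note that execute reports failures through an "error" key rather than raising:

import asyncio
from g4f.mcp.tools import MarkItDownTool

async def main() -> None:
    tool = MarkItDownTool()
    # Arguments follow the input_schema above; max_content_length is optional.
    result = await tool.execute({"url": "https://example.com", "max_content_length": 2000})
    if "error" in result:
        print("Conversion failed:", result["error"])
    else:
        print(result["content_length"], "characters of markdown, truncated:", result["truncated"])
        print(result["markdown_content"][:200])

asyncio.run(main())

TextToAudioTool only builds a URL and never fetches the audio itself, so its output is easy to predict. The same construction in isolation, using urllib.parse.quote exactly as the tool does (prompt and voice are illustrative, and whether the Pollinations endpoint actually serves audio for them is not verified here):

import urllib.parse

prompt = "Read this: Hello, world!"
voice = "nova"  # illustrative; the tool defaults to 'alloy'

encoded_prompt = urllib.parse.quote(prompt)
audio_url = f"https://text.pollinations.ai/{encoded_prompt}?voice={voice}"
print(audio_url)
# https://text.pollinations.ai/Read%20this%3A%20Hello%2C%20world%21?voice=nova
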
@@ -3,14 +3,14 @@ from __future__ import annotations
 import hashlib
 import asyncio
 from pathlib import Path
-from typing import Iterator, Optional
+from typing import Dict, Iterator, Optional
 from urllib.parse import urlparse, quote_plus
 from aiohttp import ClientSession, ClientError
 from datetime import date
 import asyncio
 
 try:
-    from bs4 import BeautifulSoup
+    from bs4 import BeautifulSoup, Tag
     has_requirements = True
 except ImportError:
     has_requirements = False
@@ -18,11 +18,39 @@ except ImportError:
 from ..cookies import get_cookies_dir
 from ..providers.response import format_link
 
-def scrape_text(html: str, max_words: Optional[int] = None, add_source: bool = True, count_images: int = 2) -> Iterator[str]:
+def scrape_text(html: str, max_words: Optional[int] = None, add_source: bool = True, count_images: int = 2, add_metadata: bool = False) -> Iterator[str]:
     """
     Parses the provided HTML and yields text fragments.
     """
     soup = BeautifulSoup(html, "html.parser")
 
+    # Read the meta tags
+    if add_metadata:
+        metadata: Dict[str, str] = {}
+
+        if soup.title and soup.title.string:
+            yield f"## {soup.title.string}\n"
+            seen_texts.append(soup.title.string)
+            max_words = None if max_words is None else max_words - len(soup.title.string.split())
+
+        for meta in soup(["meta"]):
+            if not isinstance(meta, Tag):
+                continue
+
+            for a in meta.attrs:
+                if a in ["itemprop", "property", "name"]:
+                    key = str(meta.get(a, ""))
+                    content = str(meta.get("content", ""))
+                    if key and content:  # Only add non-empty content
+                        metadata[key] = content
+                    break
+
+        description = metadata.get('description', metadata.get('og:description', '')).strip()
+        if description:
+            yield f"### Description\n{description}\n"
+            seen_texts.append(description)
+            max_words = None if max_words is None else max_words - len(description.split())
+
     for selector in [
         "main", ".main-content-wrapper", ".main-content", ".emt-container-inner",
         ".content-wrapper", "#content", "#mainContent",
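
The new add_metadata branch yields the page title and the description (or og:description) meta tag ahead of the body text, keying each meta tag on its itemprop, property, or name attribute. A standalone sketch of that extraction, runnable with only BeautifulSoup installed; the HTML snippet is made up, and the seen_texts/max_words bookkeeping of the real function is omitted:

from typing import Dict
from bs4 import BeautifulSoup, Tag

html = """
<html><head>
  <title>Example page</title>
  <meta name="description" content="A short description.">
  <meta property="og:description" content="An OpenGraph description.">
</head><body><p>Body text.</p></body></html>
"""

soup = BeautifulSoup(html, "html.parser")
metadata: Dict[str, str] = {}
for meta in soup(["meta"]):
    if not isinstance(meta, Tag):
        continue
    for attr in ("itemprop", "property", "name"):
        if attr in meta.attrs:
            key = str(meta.get(attr, ""))
            content = str(meta.get("content", ""))
            if key and content:
                metadata[key] = content
            break

print(soup.title.string)             # Example page
print(metadata.get("description"))   # A short description.
print(metadata.get("og:description"))  # An OpenGraph description.
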
@@ -74,14 +102,14 @@ def scrape_text(html: str, max_words: Optional[int] = None, add_source: bool = T
             domain = urlparse(link).netloc
             yield f"\nSource: [{domain}]({link})"
 
-async def fetch_and_scrape(session: ClientSession, url: str, max_words: Optional[int] = None, add_source: bool = False, proxy: str = None) -> str:
+async def fetch_and_scrape(session: ClientSession, url: str, max_words: Optional[int] = None, add_source: bool = False, add_metadata: bool = False, proxy: str = None) -> str:
     """
     Fetches a URL and returns the scraped text, using caching to avoid redundant downloads.
     """
     try:
         cache_dir: Path = Path(get_cookies_dir()) / ".scrape_cache" / "fetch_and_scrape"
         cache_dir.mkdir(parents=True, exist_ok=True)
-        md5_hash = hashlib.md5(url.encode(errors="ignore")).hexdigest()
+        md5_hash = hashlib.md5(url.encode(errors="ignore")+str([max_words, add_source, add_metadata]).encode(errors="ignore")).hexdigest()
         cache_file = cache_dir / f"{quote_plus(url.split('?')[0].split('//')[1].replace('/', ' ')[:48])}.{date.today()}.{md5_hash[:16]}.cache"
         if cache_file.exists():
             return cache_file.read_text()
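
Folding the parameters into the hash means the same URL fetched with different max_words, add_source, or add_metadata values no longer shares one cache file. The key derivation can be checked in isolation; the URL is illustrative, and only the first 16 hex digits end up in the cache file name, as in the diff:

import hashlib

def cache_key(url: str, max_words, add_source: bool, add_metadata: bool) -> str:
    # Same recipe as above: hash the URL bytes plus the stringified parameter list.
    data = url.encode(errors="ignore") + str([max_words, add_source, add_metadata]).encode(errors="ignore")
    return hashlib.md5(data).hexdigest()[:16]

url = "https://example.com/article"
print(cache_key(url, None, False, False))
print(cache_key(url, None, False, True))   # differs once add_metadata changes
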
@@ -89,7 +117,7 @@ async def fetch_and_scrape(session: ClientSession, url: str, max_words: Optional
         async with session.get(url, proxy=proxy) as response:
             if response.status == 200:
                 html = await response.text(errors="replace")
-                scraped_text = "".join(scrape_text(html, max_words, add_source))
+                scraped_text = "".join(scrape_text(html, max_words, add_source, add_metadata=add_metadata))
                 with open(cache_file, "wb") as f:
                     f.write(scraped_text.encode(errors="replace"))
                 return scraped_text