Mirror of https://github.com/xtekky/gpt4free.git (synced 2025-12-05 18:20:35 -08:00)
Add MarkItDownTool and TextToAudioTool; enhance scrape_text and fetch_and_scrape functions with metadata support
This commit is contained in:
parent 8df4bc7118
commit 5d53e58d2c

4 changed files with 201 additions and 11 deletions
@@ -8,6 +8,6 @@ through the Model Context Protocol standard, allowing AI assistants to access:
 """
 
 from .server import MCPServer
-from .tools import WebSearchTool, WebScrapeTool, ImageGenerationTool
+from .tools import MarkItDownTool, TextToAudioTool, WebSearchTool, WebScrapeTool, ImageGenerationTool
 
-__all__ = ['MCPServer', 'WebSearchTool', 'WebScrapeTool', 'ImageGenerationTool']
+__all__ = ['MCPServer', 'MarkItDownTool', 'TextToAudioTool', 'WebSearchTool', 'WebScrapeTool', 'ImageGenerationTool']
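A minimal sketch of using the newly exported names, assuming they are importable from g4f.mcp as the __all__ list above suggests (the file name is not shown on this page); everything else here is illustrative:

    # Hypothetical usage; assumes g4f is installed and g4f.mcp re-exports these classes.
    from g4f.mcp import MarkItDownTool, TextToAudioTool

    for tool in (MarkItDownTool(), TextToAudioTool()):
        # Each tool exposes a description and a JSON-Schema style input_schema
        # (see the g4f/mcp/tools.py hunks later in this commit).
        print(tool.description)
        print(tool.input_schema["required"])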
@@ -20,6 +20,7 @@ from ..debug import enable_logging
 enable_logging()
 
-from .tools import WebSearchTool, WebScrapeTool, ImageGenerationTool
+from .tools import MarkItDownTool, TextToAudioTool, WebSearchTool, WebScrapeTool, ImageGenerationTool
 
@@ -54,6 +55,8 @@ class MCPServer:
             'web_search': WebSearchTool(),
             'web_scrape': WebScrapeTool(),
             'image_generation': ImageGenerationTool(),
+            'text_to_audio': TextToAudioTool(),
+            'mark_it_down': MarkItDownTool()
         }
         self.server_info = {
             "name": "gpt4free-mcp-server",
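The registry above implies how requests reach the new tools. A minimal dispatch sketch, assuming a handler that receives an MCP tools/call request; the handler name and request shape are illustrative, not taken from the repository:

    # Illustrative dispatch; 'handle_tool_call' is an assumed name, not the actual server method.
    async def handle_tool_call(self, name: str, arguments: dict) -> dict:
        tool = self.tools.get(name)  # e.g. 'mark_it_down' or 'text_to_audio'
        if tool is None:
            return {"error": f"Unknown tool: {name}"}
        return await tool.execute(arguments)  # every registered tool implements async execute()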
g4f/mcp/tools.py (165 lines changed)
@@ -8,7 +8,6 @@ This module provides MCP tool implementations that wrap gpt4free capabilities:
 from __future__ import annotations
 
-import asyncio
 from typing import Any, Dict
 from abc import ABC, abstractmethod
 
@@ -158,7 +157,7 @@ class WebScrapeTool(MCPTool):
                 session=session,
                 url=url,
                 max_words=max_words,
-                add_source=True
+                add_metadata=True
             )
 
             if not content:
@@ -183,7 +182,7 @@ class ImageGenerationTool(MCPTool):
 
     @property
     def description(self) -> str:
-        return "Generate images from text prompts using AI image generation providers. Returns base64-encoded image data."
+        return "Generate images from text prompts using AI image generation providers. Returns a URL to the generated image."
 
     @property
     def input_schema(self) -> Dict[str, Any]:
@@ -291,3 +290,163 @@
             return {
                 "error": f"Image generation failed: {str(e)}"
             }
+
+
+class MarkItDownTool(MCPTool):
+    """MarkItDown tool for converting URLs to markdown format"""
+
+    @property
+    def description(self) -> str:
+        return "Convert a URL to markdown format using MarkItDown. Supports HTTP/HTTPS URLs and returns formatted markdown content."
+
+    @property
+    def input_schema(self) -> Dict[str, Any]:
+        return {
+            "type": "object",
+            "properties": {
+                "url": {
+                    "type": "string",
+                    "description": "The URL to convert to markdown format (must be HTTP/HTTPS)"
+                },
+                "max_content_length": {
+                    "type": "integer",
+                    "description": "Maximum content length for processing (default: 10000)",
+                    "default": 10000
+                }
+            },
+            "required": ["url"]
+        }
+
+    async def execute(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
+        """Execute MarkItDown conversion
+
+        Returns:
+            Dict[str, Any]: Markdown content or error message
+        """
+        try:
+            from ..integration.markitdown import MarkItDown
+        except ImportError as e:
+            return {
+                "error": f"MarkItDown is not installed: {str(e)}"
+            }
+
+        url = arguments.get("url", "")
+        max_content_length = arguments.get("max_content_length", 10000)
+
+        if not url:
+            return {
+                "error": "URL parameter is required"
+            }
+
+        # Validate URL format
+        if not url.startswith(("http://", "https://")):
+            return {
+                "error": "URL must start with http:// or https://"
+            }
+
+        try:
+            # Initialize MarkItDown
+            md = MarkItDown()
+
+            # Convert URL to markdown
+            result = md.convert_url(url)
+
+            if not result:
+                return {
+                    "error": "Failed to convert URL to markdown"
+                }
+
+            # Truncate if content exceeds max length
+            if len(result) > max_content_length:
+                result = result[:max_content_length] + "\n\n[Content truncated...]"
+
+            return {
+                "url": url,
+                "markdown_content": result,
+                "content_length": len(result),
+                "truncated": len(result) > max_content_length
+            }
+
+        except Exception as e:
+            return {
+                "error": f"MarkItDown conversion failed: {str(e)}"
+            }
+
+
+class TextToAudioTool(MCPTool):
+    """TextToAudio tool for generating audio from text prompts using Pollinations AI"""
+
+    @property
+    def description(self) -> str:
+        return "Generate an audio URL from a text prompt using Pollinations AI text-to-speech service. Returns a direct URL to the generated audio file."
+
+    @property
+    def input_schema(self) -> Dict[str, Any]:
+        return {
+            "type": "object",
+            "properties": {
+                "prompt": {
+                    "type": "string",
+                    "description": "The text prompt to the audio model (example: 'Read this: Hello, world!')"
+                },
+                "voice": {
+                    "type": "string",
+                    "description": "Voice option for text-to-speech (default: 'alloy')",
+                    "default": "alloy"
+                },
+                "url_encode": {
+                    "type": "boolean",
+                    "description": "Whether to URL-encode the prompt text (default: True)",
+                    "default": True
+                }
+            },
+            "required": ["prompt"]
+        }
+
+    async def execute(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
+        """Execute text-to-speech conversion
+
+        Returns:
+            Dict[str, Any]: Audio URL or error message
+        """
+        try:
+            import urllib.parse
+        except ImportError as e:
+            return {
+                "error": f"urllib is not available: {str(e)}"
+            }
+
+        prompt = arguments.get("prompt", "")
+        voice = arguments.get("voice", "alloy")
+        url_encode = arguments.get("url_encode", True)
+
+        if not prompt:
+            return {
+                "error": "Prompt parameter is required"
+            }
+
+        # Validate prompt length (reasonable limit for text-to-speech)
+        if len(prompt) > 5000:
+            return {
+                "error": "Prompt is too long (max 5000 characters)"
+            }
+
+        try:
+            # Prepare the prompt for URL
+            if url_encode:
+                encoded_prompt = urllib.parse.quote(prompt)
+            else:
+                encoded_prompt = prompt.replace(" ", "%20")  # Basic space encoding
+
+            # Construct the Pollinations AI text-to-speech URL
+            base_url = "https://text.pollinations.ai"
+            audio_url = f"{base_url}/{encoded_prompt}?voice={voice}"
+
+            return {
+                "prompt": prompt,
+                "voice": voice,
+                "audio_url": audio_url
+            }
+
+        except Exception as e:
+            return {
+                "error": f"Text-to-speech URL generation failed: {str(e)}"
+            }
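Two minimal usage sketches for the classes added above, assuming they are importable from g4f.mcp.tools and that the MarkItDown integration is installed; the URL, prompt, and limits are illustrative:

    # Hypothetical driver for the new tools; argument values are examples only.
    import asyncio
    from g4f.mcp.tools import MarkItDownTool, TextToAudioTool

    async def main():
        # Convert a page to markdown, capping the returned content at 2000 characters.
        md_result = await MarkItDownTool().execute({
            "url": "https://example.com",
            "max_content_length": 2000,
        })
        print(md_result.get("markdown_content", md_result.get("error")))

        # Build a Pollinations text-to-speech URL for a short prompt.
        tts_result = await TextToAudioTool().execute({
            "prompt": "Read this: Hello, world!",
            "voice": "alloy",
        })
        # Expected shape: {"prompt": ..., "voice": "alloy",
        #                  "audio_url": "https://text.pollinations.ai/<encoded prompt>?voice=alloy"}
        print(tts_result.get("audio_url", tts_result.get("error")))

    asyncio.run(main())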
@@ -3,14 +3,14 @@ from __future__ import annotations
 import hashlib
+import asyncio
 from pathlib import Path
-from typing import Iterator, Optional
+from typing import Dict, Iterator, Optional
 from urllib.parse import urlparse, quote_plus
 from aiohttp import ClientSession, ClientError
 from datetime import date
-import asyncio
 
 try:
-    from bs4 import BeautifulSoup
+    from bs4 import BeautifulSoup, Tag
     has_requirements = True
 except ImportError:
     has_requirements = False
@@ -18,11 +18,39 @@ except ImportError:
 from ..cookies import get_cookies_dir
 from ..providers.response import format_link
 
-def scrape_text(html: str, max_words: Optional[int] = None, add_source: bool = True, count_images: int = 2) -> Iterator[str]:
+def scrape_text(html: str, max_words: Optional[int] = None, add_source: bool = True, count_images: int = 2, add_metadata: bool = False) -> Iterator[str]:
     """
     Parses the provided HTML and yields text fragments.
     """
     soup = BeautifulSoup(html, "html.parser")
 
+    # Read the meta tags
+    if add_metadata:
+        metadata: Dict[str, str] = {}
+
+        if soup.title and soup.title.string:
+            yield f"## {soup.title.string}\n"
+            seen_texts.append(soup.title.string)
+            max_words = None if max_words is None else max_words - len(soup.title.string.split())
+
+        for meta in soup(["meta"]):
+            if not isinstance(meta, Tag):
+                continue
+
+            for a in meta.attrs:
+                if a in ["itemprop", "property", "name"]:
+                    key = str(meta.get(a, ""))
+                    content = str(meta.get("content", ""))
+                    if key and content:  # Only add non-empty content
+                        metadata[key] = content
+                    break
+
+        description = metadata.get('description', metadata.get('og:description', '')).strip()
+        if description:
+            yield f"### Description\n{description}\n"
+            seen_texts.append(description)
+            max_words = None if max_words is None else max_words - len(description.split())
+
     for selector in [
         "main", ".main-content-wrapper", ".main-content", ".emt-container-inner",
         ".content-wrapper", "#content", "#mainContent",
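A standalone illustration of the metadata pass added above: the same BeautifulSoup logic applied to a tiny HTML snippet. This is a sketch mirroring the new branch, not the repository function itself; bs4 must be installed and the HTML is made up:

    # Mirrors the add_metadata branch of scrape_text on example HTML.
    from bs4 import BeautifulSoup, Tag

    html = ('<html><head><title>Example Domain</title>'
            '<meta name="description" content="An example page."></head>'
            '<body><p>Hello</p></body></html>')

    soup = BeautifulSoup(html, "html.parser")
    metadata = {}
    for meta in soup(["meta"]):
        if not isinstance(meta, Tag):
            continue
        for a in meta.attrs:
            if a in ["itemprop", "property", "name"]:
                key, content = str(meta.get(a, "")), str(meta.get("content", ""))
                if key and content:
                    metadata[key] = content
                break

    print(f"## {soup.title.string}")                               # -> ## Example Domain
    print(f"### Description\n{metadata.get('description', '')}")   # -> An example page.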
@@ -74,14 +102,14 @@ def scrape_text(html: str, max_words: Optional[int] = None, add_source: bool = True, count_images: int = 2) -> Iterator[str]:
         domain = urlparse(link).netloc
         yield f"\nSource: [{domain}]({link})"
 
-async def fetch_and_scrape(session: ClientSession, url: str, max_words: Optional[int] = None, add_source: bool = False, proxy: str = None) -> str:
+async def fetch_and_scrape(session: ClientSession, url: str, max_words: Optional[int] = None, add_source: bool = False, add_metadata: bool = False, proxy: str = None) -> str:
     """
     Fetches a URL and returns the scraped text, using caching to avoid redundant downloads.
     """
     try:
         cache_dir: Path = Path(get_cookies_dir()) / ".scrape_cache" / "fetch_and_scrape"
         cache_dir.mkdir(parents=True, exist_ok=True)
-        md5_hash = hashlib.md5(url.encode(errors="ignore")).hexdigest()
+        md5_hash = hashlib.md5(url.encode(errors="ignore")+str([max_words, add_source, add_metadata]).encode(errors="ignore")).hexdigest()
         cache_file = cache_dir / f"{quote_plus(url.split('?')[0].split('//')[1].replace('/', ' ')[:48])}.{date.today()}.{md5_hash[:16]}.cache"
         if cache_file.exists():
            return cache_file.read_text()
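The cache key built above now folds the scraping options into the hash, so the same URL fetched with different max_words / add_source / add_metadata values lands in separate cache files. An illustrative recomputation with example values:

    # Recomputes the cache file name from the hunk above; URL and options are examples.
    import hashlib
    from datetime import date
    from urllib.parse import quote_plus

    url = "https://example.com/docs/page?ref=1"
    max_words, add_source, add_metadata = 1000, False, True

    md5_hash = hashlib.md5(
        url.encode(errors="ignore")
        + str([max_words, add_source, add_metadata]).encode(errors="ignore")
    ).hexdigest()
    name = quote_plus(url.split('?')[0].split('//')[1].replace('/', ' ')[:48])
    print(f"{name}.{date.today()}.{md5_hash[:16]}.cache")
    # e.g. example.com+docs+page.2025-12-05.<16 hex chars>.cache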
@@ -89,7 +117,7 @@ async def fetch_and_scrape(session: ClientSession, url: str, max_words: Optional[int] = None, add_source: bool = False, proxy: str = None) -> str:
         async with session.get(url, proxy=proxy) as response:
             if response.status == 200:
                 html = await response.text(errors="replace")
-                scraped_text = "".join(scrape_text(html, max_words, add_source))
+                scraped_text = "".join(scrape_text(html, max_words, add_source, add_metadata=add_metadata))
                 with open(cache_file, "wb") as f:
                     f.write(scraped_text.encode(errors="replace"))
                 return scraped_text
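A minimal end-to-end sketch of the updated fetch_and_scrape call; the module path is an assumption based on this diff (the page does not show the file name), and aiohttp plus bs4 are required:

    # Usage sketch; the import path below is assumed, not confirmed by this page.
    import asyncio
    from aiohttp import ClientSession
    from g4f.tools.web_search import fetch_and_scrape

    async def main():
        async with ClientSession() as session:
            text = await fetch_and_scrape(
                session,
                "https://example.com",
                max_words=500,
                add_metadata=True,  # new flag from this commit: prepend title/description
            )
            print(text[:200])

    asyncio.run(main())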