Add MarkItDownTool and TextToAudioTool; enhance scrape_text and fetch_and_scrape functions with metadata support

hlohaus 2025-11-02 04:32:32 +01:00
parent 8df4bc7118
commit 5d53e58d2c
4 changed files with 201 additions and 11 deletions

View file

@@ -8,6 +8,6 @@ through the Model Context Protocol standard, allowing AI assistants to access:
"""
from .server import MCPServer
from .tools import WebSearchTool, WebScrapeTool, ImageGenerationTool
from .tools import MarkItDownTool, TextToAudioTool, WebSearchTool, WebScrapeTool, ImageGenerationTool
__all__ = ['MCPServer', 'WebSearchTool', 'WebScrapeTool', 'ImageGenerationTool']
__all__ = ['MCPServer', 'MarkItDownTool', 'TextToAudioTool', 'WebSearchTool', 'WebScrapeTool', 'ImageGenerationTool']
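With the expanded `__all__`, both new tools become part of the package's public interface. A minimal import sketch (the absolute package path `g4f.mcp` is an assumption; the diff only shows relative imports):

```python
# Assumed package path; only relative imports are visible in this diff.
from g4f.mcp import MarkItDownTool, TextToAudioTool

markdown_tool = MarkItDownTool()
audio_tool = TextToAudioTool()
```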

View file

@@ -20,6 +20,7 @@ from ..debug import enable_logging
enable_logging()
from .tools import WebSearchTool, WebScrapeTool, ImageGenerationTool
from .tools import MarkItDownTool, TextToAudioTool, WebSearchTool, WebScrapeTool, ImageGenerationTool
@@ -54,6 +55,8 @@ class MCPServer:
'web_search': WebSearchTool(),
'web_scrape': WebScrapeTool(),
'image_generation': ImageGenerationTool(),
'text_to_audio': TextToAudioTool(),
'mark_it_down': MarkItDownTool()
}
self.server_info = {
"name": "gpt4free-mcp-server",

View file

@@ -8,7 +8,6 @@ This module provides MCP tool implementations that wrap gpt4free capabilities:
from __future__ import annotations
import asyncio
from typing import Any, Dict
from abc import ABC, abstractmethod
@@ -158,7 +157,7 @@ class WebScrapeTool(MCPTool):
session=session,
url=url,
max_words=max_words,
add_source=True
add_metadata=True
)
if not content:
@@ -183,7 +182,7 @@ class ImageGenerationTool(MCPTool):
@property
def description(self) -> str:
return "Generate images from text prompts using AI image generation providers. Returns base64-encoded image data."
return "Generate images from text prompts using AI image generation providers. Returns a URL to the generated image."
@property
def input_schema(self) -> Dict[str, Any]:
@@ -291,3 +290,163 @@ class ImageGenerationTool(MCPTool):
return {
"error": f"Image generation failed: {str(e)}"
}
class MarkItDownTool(MCPTool):
"""MarkItDown tool for converting URLs to markdown format"""
@property
def description(self) -> str:
return "Convert a URL to markdown format using MarkItDown. Supports HTTP/HTTPS URLs and returns formatted markdown content."
@property
def input_schema(self) -> Dict[str, Any]:
return {
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "The URL to convert to markdown format (must be HTTP/HTTPS)"
},
"max_content_length": {
"type": "integer",
"description": "Maximum content length for processing (default: 10000)",
"default": 10000
}
},
"required": ["url"]
}
async def execute(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Execute MarkItDown conversion
Returns:
Dict[str, Any]: Markdown content or error message
"""
try:
from ..integration.markitdown import MarkItDown
except ImportError as e:
return {
"error": f"MarkItDown is not installed: {str(e)}"
}
url = arguments.get("url", "")
max_content_length = arguments.get("max_content_length", 10000)
if not url:
return {
"error": "URL parameter is required"
}
# Validate URL format
if not url.startswith(("http://", "https://")):
return {
"error": "URL must start with http:// or https://"
}
try:
# Initialize MarkItDown
md = MarkItDown()
# Convert URL to markdown
result = md.convert_url(url)
if not result:
return {
"error": "Failed to convert URL to markdown"
}
# Truncate if content exceeds max length
if len(result) > max_content_length:
result = result[:max_content_length] + "\n\n[Content truncated...]"
return {
"url": url,
"markdown_content": result,
"content_length": len(result),
"truncated": len(result) > max_content_length
}
except Exception as e:
return {
"error": f"MarkItDown conversion failed: {str(e)}"
}
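A hedged usage sketch for the class above; it requires the optional MarkItDown integration to be installed, and the URL is a placeholder:

```python
import asyncio

async def demo_markitdown():
    tool = MarkItDownTool()
    result = await tool.execute({
        "url": "https://example.com",   # placeholder URL
        "max_content_length": 2000,
    })
    if "error" in result:
        print(result["error"])
    else:
        print(result["markdown_content"][:200])

asyncio.run(demo_markitdown())
```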
class TextToAudioTool(MCPTool):
"""TextToAudio tool for generating audio from text prompts using Pollinations AI"""
@property
def description(self) -> str:
return "Generate an audio URL from a text prompt using Pollinations AI text-to-speech service. Returns a direct URL to the generated audio file."
@property
def input_schema(self) -> Dict[str, Any]:
return {
"type": "object",
"properties": {
"prompt": {
"type": "string",
"description": "The text prompt to the audio model (example: 'Read this: Hello, world!')"
},
"voice": {
"type": "string",
"description": "Voice option for text-to-speech (default: 'alloy')",
"default": "alloy"
},
"url_encode": {
"type": "boolean",
"description": "Whether to URL-encode the prompt text (default: True)",
"default": True
}
},
"required": ["prompt"]
}
async def execute(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Execute text-to-speech conversion
Returns:
Dict[str, Any]: Audio URL or error message
"""
try:
import urllib.parse
except ImportError as e:
return {
"error": f"urllib is not available: {str(e)}"
}
prompt = arguments.get("prompt", "")
voice = arguments.get("voice", "alloy")
url_encode = arguments.get("url_encode", True)
if not prompt:
return {
"error": "Prompt parameter is required"
}
# Validate prompt length (reasonable limit for text-to-speech)
if len(prompt) > 5000:
return {
"error": "Prompt is too long (max 5000 characters)"
}
try:
# Prepare the prompt for URL
if url_encode:
encoded_prompt = urllib.parse.quote(prompt)
else:
encoded_prompt = prompt.replace(" ", "%20") # Basic space encoding
# Construct the Pollinations AI text-to-speech URL
base_url = "https://text.pollinations.ai"
audio_url = f"{base_url}/{encoded_prompt}?voice={voice}"
return {
"prompt": prompt,
"voice": voice,
"audio_url": audio_url
}
except Exception as e:
return {
"error": f"Text-to-speech URL generation failed: {str(e)}"
}
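A similar sketch for the text-to-speech tool; the returned URL simply embeds the percent-encoded prompt in the Pollinations path and passes the voice as a query parameter:

```python
import asyncio

async def demo_tts():
    tool = TextToAudioTool()
    result = await tool.execute({
        "prompt": "Read this: Hello, world!",
        "voice": "alloy",
    })
    # Expected shape, per the code above:
    # {"prompt": ..., "voice": "alloy",
    #  "audio_url": "https://text.pollinations.ai/<encoded prompt>?voice=alloy"}
    print(result.get("audio_url") or result.get("error"))

asyncio.run(demo_tts())
```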

View file

@@ -3,14 +3,14 @@ from __future__ import annotations
import hashlib
import asyncio
from pathlib import Path
from typing import Iterator, Optional
from typing import Dict, Iterator, Optional
from urllib.parse import urlparse, quote_plus
from aiohttp import ClientSession, ClientError
from datetime import date
import asyncio
try:
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag
has_requirements = True
except ImportError:
has_requirements = False
@@ -18,11 +18,39 @@ except ImportError:
from ..cookies import get_cookies_dir
from ..providers.response import format_link
def scrape_text(html: str, max_words: Optional[int] = None, add_source: bool = True, count_images: int = 2) -> Iterator[str]:
def scrape_text(html: str, max_words: Optional[int] = None, add_source: bool = True, count_images: int = 2, add_metadata: bool = False) -> Iterator[str]:
"""
Parses the provided HTML and yields text fragments.
"""
soup = BeautifulSoup(html, "html.parser")
# Read the meta tags
if add_metadata:
metadata: Dict[str, str] = {}
if soup.title and soup.title.string:
yield f"## {soup.title.string}\n"
seen_texts.append(soup.title.string)
max_words = None if max_words is None else max_words - len(soup.title.string.split())
for meta in soup(["meta"]):
if not isinstance(meta, Tag):
continue
for a in meta.attrs:
if a in ["itemprop", "property", "name"]:
key = str(meta.get(a, ""))
content = str(meta.get("content", ""))
if key and content: # Only add non-empty content
metadata[key] = content
break
description = metadata.get('description', metadata.get('og:description', '')).strip()
if description:
yield f"### Description\n{description}\n"
seen_texts.append(description)
max_words = None if max_words is None else max_words - len(description.split())
for selector in [
"main", ".main-content-wrapper", ".main-content", ".emt-container-inner",
".content-wrapper", "#content", "#mainContent",
@@ -74,14 +102,14 @@ def scrape_text(html: str, max_words: Optional[int] = None, add_source: bool = T
domain = urlparse(link).netloc
yield f"\nSource: [{domain}]({link})"
async def fetch_and_scrape(session: ClientSession, url: str, max_words: Optional[int] = None, add_source: bool = False, proxy: str = None) -> str:
async def fetch_and_scrape(session: ClientSession, url: str, max_words: Optional[int] = None, add_source: bool = False, add_metadata: bool = False, proxy: str = None) -> str:
"""
Fetches a URL and returns the scraped text, using caching to avoid redundant downloads.
"""
try:
cache_dir: Path = Path(get_cookies_dir()) / ".scrape_cache" / "fetch_and_scrape"
cache_dir.mkdir(parents=True, exist_ok=True)
md5_hash = hashlib.md5(url.encode(errors="ignore")).hexdigest()
md5_hash = hashlib.md5(url.encode(errors="ignore")+str([max_words, add_source, add_metadata]).encode(errors="ignore")).hexdigest()
cache_file = cache_dir / f"{quote_plus(url.split('?')[0].split('//')[1].replace('/', ' ')[:48])}.{date.today()}.{md5_hash[:16]}.cache"
if cache_file.exists():
return cache_file.read_text()
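The new cache key folds the scrape parameters into the MD5 digest, so the same URL fetched with different settings (for example with and without metadata) is cached separately. A small illustration of the scheme:

```python
import hashlib

url = "https://example.com/page"
# [max_words, add_source, add_metadata] combinations produce distinct keys.
for params in ([None, False, False], [500, True, True]):
    digest = hashlib.md5(url.encode(errors="ignore")
                         + str(params).encode(errors="ignore")).hexdigest()
    print(params, digest[:16])
```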
@@ -89,7 +117,7 @@ async def fetch_and_scrape(session: ClientSession, url: str, max_words: Optional
async with session.get(url, proxy=proxy) as response:
if response.status == 200:
html = await response.text(errors="replace")
scraped_text = "".join(scrape_text(html, max_words, add_source))
scraped_text = "".join(scrape_text(html, max_words, add_source, add_metadata=add_metadata))
with open(cache_file, "wb") as f:
f.write(scraped_text.encode(errors="replace"))
return scraped_text
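Taken together, the new flag can be exercised roughly like this (a sketch only; `fetch_and_scrape` is assumed to be importable from the module shown above, and the URL is a placeholder):

```python
import asyncio
from aiohttp import ClientSession

async def main():
    async with ClientSession() as session:
        text = await fetch_and_scrape(
            session,
            "https://example.com",   # placeholder URL
            max_words=500,
            add_source=True,
            add_metadata=True,       # new: prepends "## <title>" and "### Description"
        )
        print(text)

asyncio.run(main())
```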