from __future__ import annotations

import asyncio
import hashlib
from datetime import date
from pathlib import Path
from typing import Iterator, List, Optional
from urllib.parse import urlparse, quote_plus

from aiohttp import ClientSession, ClientTimeout, ClientError

# Optional dependencies using the new 'ddgs' package name
try:
    from ddgs import DDGS as DDGSClient
    from bs4 import BeautifulSoup
    has_requirements = True
except ImportError:
    has_requirements = False

from ...typing import Messages, AsyncResult
from ...cookies import get_cookies_dir
from ...providers.response import format_link, JsonMixin, Sources
from ...errors import MissingRequirementsError
from ...providers.base_provider import AsyncGeneratorProvider
from ..helper import format_media_prompt

def scrape_text(html: str, max_words: Optional[int] = None, add_source: bool = True, count_images: int = 2) -> Iterator[str]:
    """
    Parse the provided HTML and yield text fragments.

    ``max_words`` caps the total number of words yielded, ``add_source``
    appends a link to the page's canonical URL, and ``count_images``
    limits how many images are emitted as markdown image links.
    """
    soup = BeautifulSoup(html, "html.parser")
    # Narrow the soup to the main content container if one can be found.
    for selector in [
        "main", ".main-content-wrapper", ".main-content", ".emt-container-inner",
        ".content-wrapper", "#content", "#mainContent",
    ]:
        selected = soup.select_one(selector)
        if selected:
            soup = selected
            break

    # Drop known boilerplate elements such as disclosure banners.
    for remove_selector in [".c-globalDisclosure"]:
        unwanted = soup.select_one(remove_selector)
        if unwanted:
            unwanted.extract()

    # Match images with a non-empty alt text and an absolute URL, excluding
    # avatars and explicitly sized (likely decorative) images.
    image_selector = "img[alt][src^=http]:not([alt='']):not(.avatar):not([width])"
    image_link_selector = f"a:has({image_selector})"
    seen_texts = []

    for element in soup.select(f"h1, h2, h3, h4, h5, h6, p, pre, table:not(:has(p)), ul:not(:has(p)), {image_link_selector}"):
        if count_images > 0:
            image = element.select_one(image_selector)
            if image:
                title = str(element.get("title", element.text))
                if title:
                    yield f"!{format_link(image['src'], title)}\n"
                    if max_words is not None:
                        # Charge a flat ten words per image against the budget.
                        max_words -= 10
                    count_images -= 1
                continue

        for line in element.get_text(" ").splitlines():
            words = [word for word in line.split() if word]
            if not words:
                continue
            joined_line = " ".join(words)
            # Skip fragments that have already been emitted.
            if joined_line in seen_texts:
                continue
            if max_words is not None:
                max_words -= len(words)
                if max_words <= 0:
                    break
            yield joined_line + "\n"
            seen_texts.append(joined_line)

    if add_source:
        canonical_link = soup.find("link", rel="canonical")
        if canonical_link and "href" in canonical_link.attrs:
            link = canonical_link["href"]
            domain = urlparse(link).netloc
            yield f"\nSource: [{domain}]({link})"

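# Illustrative usage (a minimal sketch; the HTML below is made up):
#
#     html = "<main><h1>Heading</h1><p>Some body text.</p></main>"
#     print("".join(scrape_text(html, add_source=False)))
#     # Heading
#     # Some body text.
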
async def fetch_and_scrape(session: ClientSession, url: str, max_words: Optional[int] = None, add_source: bool = False, proxy: Optional[str] = None) -> str:
    """
    Fetch a URL and return the scraped text, using a daily on-disk cache
    to avoid redundant downloads. Returns an empty string on failure.
    """
    try:
        cache_dir: Path = Path(get_cookies_dir()) / ".scrape_cache" / "fetch_and_scrape"
        cache_dir.mkdir(parents=True, exist_ok=True)
        md5_hash = hashlib.md5(url.encode(errors="ignore")).hexdigest()
        # Cache key: sanitized URL prefix + today's date + a hash prefix,
        # so entries expire naturally once the date changes.
        cache_file = cache_dir / f"{quote_plus(url.split('?')[0].split('//')[1].replace('/', ' ')[:48])}.{date.today()}.{md5_hash[:16]}.cache"
        if cache_file.exists():
            return cache_file.read_text()

        async with session.get(url, proxy=proxy) as response:
            if response.status == 200:
                html = await response.text(errors="replace")
                scraped_text = "".join(scrape_text(html, max_words, add_source))
                with open(cache_file, "wb") as f:
                    f.write(scraped_text.encode(errors="replace"))
                return scraped_text
    except (ClientError, asyncio.TimeoutError):
        return ""
    return ""

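# Illustrative usage (a minimal sketch; the URL below is hypothetical):
#
#     async with ClientSession(timeout=ClientTimeout(30)) as session:
#         text = await fetch_and_scrape(session, "https://example.com/article", max_words=500)
#
# For that URL the cache file name would look roughly like
# "example.com+article.<YYYY-MM-DD>.<md5-prefix>.cache".
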
class SearchResults(JsonMixin):
    """
    Represents a collection of search result entries along with the count of used words.
    """
    def __init__(self, results: List[SearchResultEntry], used_words: int):
        self.results = results
        self.used_words = used_words

    @classmethod
    def from_dict(cls, data: dict) -> SearchResults:
        return cls(
            [SearchResultEntry(**item) for item in data["results"]],
            data["used_words"]
        )

    def __iter__(self) -> Iterator[SearchResultEntry]:
        yield from self.results

    def __str__(self) -> str:
        # Build a string representation of the search results with markdown formatting.
        output = []
        for idx, result in enumerate(self.results):
            parts = [
                f"### Title: {result.title}",
                "",
                result.text if result.text else result.snippet,
                "",
                f"> **Source:** [[{idx}]]({result.url})"
            ]
            output.append("\n".join(parts))
        return "\n\n\n\n".join(output)

    def __len__(self) -> int:
        return len(self.results)

    def get_sources(self) -> Sources:
        return Sources([{"url": result.url, "title": result.title} for result in self.results])

    def get_dict(self) -> dict:
        return {
            "results": [result.get_dict() for result in self.results],
            "used_words": self.used_words
        }

class SearchResultEntry(JsonMixin):
    """
    Represents a single search result entry.
    """
    def __init__(self, title: str, url: str, snippet: str, text: Optional[str] = None):
        self.title = title
        self.url = url
        self.snippet = snippet
        self.text = text

    def set_text(self, text: str) -> None:
        self.text = text

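# Illustrative usage (a minimal sketch with made-up values; with no fetched
# text set, __str__ falls back to the snippet):
#
#     entry = SearchResultEntry("Example", "https://example.com", "A snippet")
#     results = SearchResults([entry], used_words=7)
#     print(str(results))
#     # ### Title: Example
#     #
#     # A snippet
#     #
#     # > **Source:** [[0]](https://example.com)
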
class DDGS(AsyncGeneratorProvider):
    working = has_requirements

    @classmethod
    async def create_async_generator(
        cls,
        model: str,
        messages: Messages,
        prompt: Optional[str] = None,
        proxy: Optional[str] = None,
        timeout: int = 30,
        region: Optional[str] = None,
        backend: Optional[str] = None,
        max_results: int = 5,
        max_words: int = 2500,
        add_text: bool = True,
        **kwargs
    ) -> AsyncResult:
        if not has_requirements:
            raise MissingRequirementsError('Install "ddgs" and "beautifulsoup4" | pip install -U g4f[search]')

        prompt = format_media_prompt(messages, prompt)
        results: List[SearchResultEntry] = []

        # Use the new DDGS() context manager style
        with DDGSClient() as ddgs:
            for result in ddgs.text(
                prompt,
                region=region,
                safesearch="moderate",
                timelimit="y",
                max_results=max_results,
                backend=backend,
            ):
                # Skip Google redirect/cache links.
                if ".google." in result["href"]:
                    continue
                results.append(SearchResultEntry(
                    title=result["title"],
                    url=result["href"],
                    snippet=result["body"]
                ))

        if add_text:
            tasks = []
            async with ClientSession(timeout=ClientTimeout(timeout)) as session:
                for entry in results:
                    # Split the word budget across the fetched pages; the guard
                    # against max_results == 1 avoids a ZeroDivisionError.
                    tasks.append(fetch_and_scrape(session, entry.url, int(max_words / max(max_results - 1, 1)), False, proxy=proxy))
                texts = await asyncio.gather(*tasks)

        formatted_results: List[SearchResultEntry] = []
        used_words = 0
        left_words = max_words
        for i, entry in enumerate(results):
            if add_text:
                entry.text = texts[i]
            # Reserve a few words for the title, then charge the fetched text
            # (or the snippet when none was fetched) against the budget.
            left_words -= entry.title.count(" ") + 5
            if entry.text:
                left_words -= entry.text.count(" ")
            else:
                left_words -= entry.snippet.count(" ")
            if left_words < 0:
                break
            used_words = max_words - left_words
            formatted_results.append(entry)

        yield SearchResults(formatted_results, used_words)
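
# Illustrative usage (a minimal sketch; assumes this module is imported from
# within the g4f package so the relative imports resolve):
#
#     async def demo():
#         messages = [{"role": "user", "content": "python asyncio tutorial"}]
#         async for chunk in DDGS.create_async_generator("", messages):
#             print(str(chunk))
#
#     asyncio.run(demo())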