from __future__ import annotations

import asyncio
import hashlib
from datetime import date
from pathlib import Path
from typing import Iterator, List, Optional
from urllib.parse import urlparse, quote_plus

from aiohttp import ClientSession, ClientTimeout, ClientError

# Optional dependencies using the new 'ddgs' package name
try:
    from ddgs import DDGS as DDGSClient
    from bs4 import BeautifulSoup
    has_requirements = True
except ImportError:
    has_requirements = False

from ...typing import Messages, AsyncResult
from ...cookies import get_cookies_dir
from ...providers.response import format_link, JsonMixin, Sources
from ...errors import MissingRequirementsError
from ...providers.base_provider import AsyncGeneratorProvider
from ..helper import format_media_prompt

def scrape_text(html: str, max_words: Optional[int] = None, add_source: bool = True, count_images: int = 2) -> Iterator[str]:
    """
    Parse the provided HTML and yield text fragments.

    ``max_words`` caps the total number of words yielded, ``add_source``
    appends a link to the page's canonical URL, and ``count_images``
    limits how many images are emitted as markdown image links.
    """
    soup = BeautifulSoup(html, "html.parser")
    # Narrow the soup to the main content container if one can be found.
    for selector in [
        "main", ".main-content-wrapper", ".main-content", ".emt-container-inner",
        ".content-wrapper", "#content", "#mainContent",
    ]:
        selected = soup.select_one(selector)
        if selected:
            soup = selected
            break

    # Drop known boilerplate elements such as disclosure banners.
    for remove_selector in [".c-globalDisclosure"]:
        unwanted = soup.select_one(remove_selector)
        if unwanted:
            unwanted.extract()

    # Match images with a non-empty alt text and an absolute URL, excluding
    # avatars and explicitly sized (likely decorative) images.
    image_selector = "img[alt][src^=http]:not([alt='']):not(.avatar):not([width])"
    image_link_selector = f"a:has({image_selector})"
    seen_texts = []

    for element in soup.select(f"h1, h2, h3, h4, h5, h6, p, pre, table:not(:has(p)), ul:not(:has(p)), {image_link_selector}"):
        if count_images > 0:
            image = element.select_one(image_selector)
            if image:
                title = str(element.get("title", element.text))
                if title:
                    yield f"!{format_link(image['src'], title)}\n"
                    if max_words is not None:
                        # Charge a flat ten words per image against the budget.
                        max_words -= 10
                    count_images -= 1
                continue

        for line in element.get_text(" ").splitlines():
            words = [word for word in line.split() if word]
            if not words:
                continue
            joined_line = " ".join(words)
            # Skip fragments that have already been emitted.
            if joined_line in seen_texts:
                continue
            if max_words is not None:
                max_words -= len(words)
                if max_words <= 0:
                    break
            yield joined_line + "\n"
            seen_texts.append(joined_line)

    if add_source:
        canonical_link = soup.find("link", rel="canonical")
        if canonical_link and "href" in canonical_link.attrs:
            link = canonical_link["href"]
            domain = urlparse(link).netloc
            yield f"\nSource: [{domain}]({link})"

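# Illustrative usage (a minimal sketch; the HTML below is made up):
#
#     html = "<main><h1>Heading</h1><p>Some body text.</p></main>"
#     print("".join(scrape_text(html, add_source=False)))
#     # Heading
#     # Some body text.
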
async def fetch_and_scrape(session: ClientSession, url: str, max_words: Optional[int] = None, add_source: bool = False, proxy: Optional[str] = None) -> str:
    """
    Fetch a URL and return the scraped text, using a daily on-disk cache
    to avoid redundant downloads. Returns an empty string on failure.
    """
    try:
        cache_dir: Path = Path(get_cookies_dir()) / ".scrape_cache" / "fetch_and_scrape"
        cache_dir.mkdir(parents=True, exist_ok=True)
        md5_hash = hashlib.md5(url.encode(errors="ignore")).hexdigest()
        # Cache key: sanitized URL prefix + today's date + a hash prefix,
        # so entries expire naturally once the date changes.
        cache_file = cache_dir / f"{quote_plus(url.split('?')[0].split('//')[1].replace('/', ' ')[:48])}.{date.today()}.{md5_hash[:16]}.cache"
        if cache_file.exists():
            return cache_file.read_text()

        async with session.get(url, proxy=proxy) as response:
            if response.status == 200:
                html = await response.text(errors="replace")
                scraped_text = "".join(scrape_text(html, max_words, add_source))
                with open(cache_file, "wb") as f:
                    f.write(scraped_text.encode(errors="replace"))
                return scraped_text
    except (ClientError, asyncio.TimeoutError):
        return ""
    return ""

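# Illustrative usage (a minimal sketch; the URL below is hypothetical):
#
#     async with ClientSession(timeout=ClientTimeout(30)) as session:
#         text = await fetch_and_scrape(session, "https://example.com/article", max_words=500)
#
# For that URL the cache file name would look roughly like
# "example.com+article.<YYYY-MM-DD>.<md5-prefix>.cache".
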
class SearchResults(JsonMixin):
    """
    Represents a collection of search result entries along with the count of used words.
    """
    def __init__(self, results: List[SearchResultEntry], used_words: int):
        self.results = results
        self.used_words = used_words

    @classmethod
    def from_dict(cls, data: dict) -> SearchResults:
        return cls(
            [SearchResultEntry(**item) for item in data["results"]],
            data["used_words"]
        )

    def __iter__(self) -> Iterator[SearchResultEntry]:
        yield from self.results

    def __str__(self) -> str:
        # Build a string representation of the search results with markdown formatting.
        output = []
        for idx, result in enumerate(self.results):
            parts = [
                f"### Title: {result.title}",
                "",
                result.text if result.text else result.snippet,
                "",
                f"> **Source:** [[{idx}]]({result.url})"
            ]
            output.append("\n".join(parts))
        return "\n\n\n\n".join(output)

    def __len__(self) -> int:
        return len(self.results)

    def get_sources(self) -> Sources:
        return Sources([{"url": result.url, "title": result.title} for result in self.results])

    def get_dict(self) -> dict:
        return {
            "results": [result.get_dict() for result in self.results],
            "used_words": self.used_words
        }

class SearchResultEntry(JsonMixin):
    """
    Represents a single search result entry.
    """
    def __init__(self, title: str, url: str, snippet: str, text: Optional[str] = None):
        self.title = title
        self.url = url
        self.snippet = snippet
        self.text = text

    def set_text(self, text: str) -> None:
        self.text = text

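# Illustrative usage (a minimal sketch with made-up values; with no fetched
# text set, __str__ falls back to the snippet):
#
#     entry = SearchResultEntry("Example", "https://example.com", "A snippet")
#     results = SearchResults([entry], used_words=7)
#     print(str(results))
#     # ### Title: Example
#     #
#     # A snippet
#     #
#     # > **Source:** [[0]](https://example.com)
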
class DDGS(AsyncGeneratorProvider):
    working = has_requirements

    @classmethod
    async def create_async_generator(
        cls,
        model: str,
        messages: Messages,
        prompt: Optional[str] = None,
        proxy: Optional[str] = None,
        timeout: int = 30,
        region: Optional[str] = None,
        backend: Optional[str] = None,
        max_results: int = 5,
        max_words: int = 2500,
        add_text: bool = True,
        **kwargs
    ) -> AsyncResult:
        if not has_requirements:
            raise MissingRequirementsError('Install "ddgs" and "beautifulsoup4" | pip install -U g4f[search]')

        prompt = format_media_prompt(messages, prompt)
        results: List[SearchResultEntry] = []

        # Use the new DDGS() context manager style
        with DDGSClient() as ddgs:
            for result in ddgs.text(
                prompt,
                region=region,
                safesearch="moderate",
                timelimit="y",
                max_results=max_results,
                backend=backend,
            ):
                # Skip Google redirect/cache links.
                if ".google." in result["href"]:
                    continue
                results.append(SearchResultEntry(
                    title=result["title"],
                    url=result["href"],
                    snippet=result["body"]
                ))

        if add_text:
            tasks = []
            async with ClientSession(timeout=ClientTimeout(timeout)) as session:
                for entry in results:
                    # Split the word budget across the fetched pages; the guard
                    # against max_results == 1 avoids a ZeroDivisionError.
                    tasks.append(fetch_and_scrape(session, entry.url, int(max_words / max(max_results - 1, 1)), False, proxy=proxy))
                texts = await asyncio.gather(*tasks)

        formatted_results: List[SearchResultEntry] = []
        used_words = 0
        left_words = max_words
        for i, entry in enumerate(results):
            if add_text:
                entry.text = texts[i]
            # Reserve a few words for the title, then charge the fetched text
            # (or the snippet when none was fetched) against the budget.
            left_words -= entry.title.count(" ") + 5
            if entry.text:
                left_words -= entry.text.count(" ")
            else:
                left_words -= entry.snippet.count(" ")
            if left_words < 0:
                break
            used_words = max_words - left_words
            formatted_results.append(entry)

        yield SearchResults(formatted_results, used_words)
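
# Illustrative usage (a minimal sketch; assumes this module is imported from
# within the g4f package so the relative imports resolve):
#
#     async def demo():
#         messages = [{"role": "user", "content": "python asyncio tutorial"}]
#         async for chunk in DDGS.create_async_generator("", messages):
#             print(str(chunk))
#
#     asyncio.run(demo())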