gpt4free/g4f/Provider/DDG.py
kqlio67 54ef1a511c docs: update providers documentation and enhance support for Blackbox HAR auth
- Added "No auth / HAR file" authentication type in providers-and-models.md
- Added "Video generation" column to provider tables for future capability
- Updated model counts and provider capabilities throughout documentation
- Fixed ARTA provider with improved error handling and response validation
- Enhanced AllenAI provider with vision model support and proper image handling
- Significantly improved Blackbox provider:
  - Added HAR file authentication support
  - Added subscription status checking
  - Added premium/demo model differentiation
  - Improved session handling and error recovery
- Enhanced DDG provider with better error handling for challenges
- Improved PollinationsAI and PollinationsImage providers' model handling
- Added VideoModel class in g4f/models.py
- Added audio/video generation indicators in GUI components
- Added new Ai2 models: olmo-1-7b, olmo-2-32b, olmo-4-synthetic
- Added new commit message generation tool in etc/tool/commit.py
2025-04-04 13:36:28 +03:00


from __future__ import annotations

import asyncio
import base64
import hashlib
import json
import random
import time
from datetime import datetime

from aiohttp import ClientSession, ClientTimeout
from yarl import URL

from ..typing import AsyncResult, Messages, Cookies
from ..requests.raise_for_status import raise_for_status
from .base_provider import AsyncGeneratorProvider, ProviderModelMixin
from .helper import format_prompt, get_last_user_message
from ..providers.response import FinishReason, JsonConversation
from ..errors import ModelNotSupportedError, ResponseStatusError, RateLimitError, TimeoutError, ConversationLimitError

try:
    from bs4 import BeautifulSoup
    has_bs4 = True
except ImportError:
    has_bs4 = False


class DuckDuckGoSearchException(Exception):
    """Base exception class for duckduckgo_search."""


class DuckDuckGoChallengeError(ResponseStatusError):
    """Raised when DuckDuckGo presents a challenge that needs to be solved."""


class Conversation(JsonConversation):
    vqd: str = None
    vqd_hash_1: str = None
    message_history: Messages = []
    cookies: dict = {}
    fe_version: str = None

    def __init__(self, model: str):
        self.model = model


class DDG(AsyncGeneratorProvider, ProviderModelMixin):
    label = "DuckDuckGo AI Chat"
    url = "https://duckduckgo.com/aichat"
    api_endpoint = "https://duckduckgo.com/duckchat/v1/chat"
    status_url = "https://duckduckgo.com/duckchat/v1/status"

    working = True
    supports_stream = True
    supports_system_message = True
    supports_message_history = True

    default_model = "gpt-4o-mini"

    # Model mapping from user-friendly names to API model names
    _chat_models = {
        "gpt-4": default_model,
        "gpt-4o-mini": default_model,
        "llama-3.3-70b": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
        "claude-3-haiku": "claude-3-haiku-20240307",
        "o3-mini": "o3-mini",
        "mixtral-small-24b": "mistralai/Mistral-Small-24B-Instruct-2501",
    }

    # Available models (user-friendly names)
    models = list(_chat_models.keys())

    last_request_time = 0
    max_retries = 3
    base_delay = 2

    # Class variable to store the x-fe-version across instances
    _chat_xfe = ""

    @staticmethod
    def sha256_base64(text: str) -> str:
        """Return the base64 encoding of the SHA256 digest of the text."""
        sha256_hash = hashlib.sha256(text.encode("utf-8")).digest()
        return base64.b64encode(sha256_hash).decode()
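
    # Example (illustrative): sha256_base64("abc") returns
    # "ungWv48Bz+pBQUDeXa4iI7ADYaOWF3qctBD/YfIAFa0=" -- the base64 of the raw
    # 32-byte SHA-256 digest, not of its hex form.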

    @staticmethod
    def parse_dom_fingerprint(js_text: str) -> str:
        if not has_bs4:
            # Fallback if BeautifulSoup is not available
            return "1000"
        try:
            html_snippet = js_text.split("e.innerHTML = '")[1].split("';")[0]
            offset_value = js_text.split("return String(")[1].split(" ")[0]
            soup = BeautifulSoup(html_snippet, "html.parser")
            corrected_inner_html = soup.body.decode_contents()
            inner_html_length = len(corrected_inner_html)
            fingerprint = int(offset_value) + inner_html_length
            return str(fingerprint)
        except Exception:
            # Return a fallback value if parsing fails
            return "1000"

    @staticmethod
    def parse_server_hashes(js_text: str) -> list:
        try:
            return js_text.split('server_hashes: ["', maxsplit=1)[1].split('"]', maxsplit=1)[0].split('","')
        except Exception:
            # Return a fallback value if parsing fails
            return ["1", "2"]

    @classmethod
    def build_x_vqd_hash_1(cls, vqd_hash_1: str, headers: dict) -> str:
        """Build the x-vqd-hash-1 header value."""
        try:
            # If we received a valid base64 string, try to decode it
            if vqd_hash_1 and len(vqd_hash_1) > 20:
                try:
                    # Try to decode and parse as JSON first
                    decoded_json = json.loads(base64.b64decode(vqd_hash_1).decode())
                    # If it's already a complete structure with meta, return it as is
                    if isinstance(decoded_json, dict) and "meta" in decoded_json:
                        return vqd_hash_1
                    # Otherwise, extract what we can from it
                    if isinstance(decoded_json, dict) and "server_hashes" in decoded_json:
                        server_hashes = decoded_json.get("server_hashes", ["1", "2"])
                    else:
                        # Fall back to parsing from string
                        decoded = base64.b64decode(vqd_hash_1).decode()
                        server_hashes = cls.parse_server_hashes(decoded)
                except (json.JSONDecodeError, UnicodeDecodeError):
                    # If it's not valid JSON, try to parse it as a string
                    decoded = base64.b64decode(vqd_hash_1).decode()
                    server_hashes = cls.parse_server_hashes(decoded)
            else:
                # Default server hashes if we can't extract them
                server_hashes = ["1", "2"]

            # Generate fingerprints
            dom_fingerprint = "1000"  # Default value
            ua_fingerprint = headers.get("User-Agent", "") + headers.get("sec-ch-ua", "")
            ua_hash = cls.sha256_base64(ua_fingerprint)
            dom_hash = cls.sha256_base64(dom_fingerprint)

            # Create a challenge ID (random hex string)
            challenge_id = ''.join(random.choice('0123456789abcdef') for _ in range(40)) + 'h8jbt'

            # Build the complete structure including meta
            final_result = {
                "server_hashes": server_hashes,
                "client_hashes": [ua_hash, dom_hash],
                "signals": {},
                "meta": {
                    "v": "1",
                    "challenge_id": challenge_id,
                    "origin": "https://duckduckgo.com",
                    "stack": "Error\nat ke (https://duckduckgo.com/dist/wpm.chat.js:1:29526)\nat async dispatchServiceInitialVQD (https://duckduckgo.com/dist/wpm.chat.js:1:45076)"
                }
            }
            base64_final_result = base64.b64encode(json.dumps(final_result).encode()).decode()
            return base64_final_result
        except Exception:
            # If anything fails, return an empty string
            return ""

    @classmethod
    def validate_model(cls, model: str) -> str:
        """Validates and returns the correct model name for the API"""
        if not model:
            return cls.default_model
        # Check aliases first
        if model in cls.model_aliases:
            model = cls.model_aliases[model]
        # Check if it's a valid model name
        if model not in cls.models:
            raise ModelNotSupportedError(f"Model {model} not supported. Available models: {cls.models}")
        return model

    @classmethod
    async def sleep(cls, multiplier=1.0):
        """Implements rate limiting between requests"""
        now = time.time()
        if cls.last_request_time > 0:
            delay = max(0.0, 1.5 - (now - cls.last_request_time)) * multiplier
            if delay > 0:
                await asyncio.sleep(delay)
        cls.last_request_time = time.time()

    @classmethod
    async def get_default_cookies(cls, session: ClientSession) -> dict:
        """Obtains default cookies needed for API requests"""
        try:
            await cls.sleep()
            # Make initial request to get cookies
            async with session.get(cls.url) as response:
                # Set the required cookies
                cookies = {}
                cookies_dict = {'dcs': '1', 'dcm': '3'}

                # Add any cookies from the response
                for cookie in response.cookies.values():
                    cookies[cookie.key] = cookie.value

                # Ensure our required cookies are set
                for name, value in cookies_dict.items():
                    cookies[name] = value
                    url_obj = URL(cls.url)
                    session.cookie_jar.update_cookies({name: value}, url_obj)

            # Make a second request to the status endpoint to get any additional cookies
            headers = {
                "accept": "text/event-stream",
                "accept-language": "en",
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
                "origin": "https://duckduckgo.com",
                "referer": "https://duckduckgo.com/",
            }
            await cls.sleep()
            async with session.get(cls.status_url, headers=headers) as status_response:
                # Add any cookies from the status response
                for cookie in status_response.cookies.values():
                    cookies[cookie.key] = cookie.value
                    url_obj = URL(cls.url)
                    session.cookie_jar.update_cookies({cookie.key: cookie.value}, url_obj)

            return cookies
        except Exception:
            # Return at least the required cookies on error
            cookies = {'dcs': '1', 'dcm': '3'}
            url_obj = URL(cls.url)
            for name, value in cookies.items():
                session.cookie_jar.update_cookies({name: value}, url_obj)
            return cookies

    @classmethod
    async def fetch_fe_version(cls, session: ClientSession) -> str:
        """Fetches the fe-version from the initial page load."""
        if cls._chat_xfe:
            return cls._chat_xfe
        try:
            url = "https://duckduckgo.com/?q=DuckDuckGo+AI+Chat&ia=chat&duckai=1"
            await cls.sleep()
            async with session.get(url) as response:
                await raise_for_status(response)
                content = await response.text()

                # Extract the x-fe-version components
                try:
                    xfe1 = content.split('__DDG_BE_VERSION__="', 1)[1].split('"', 1)[0]
                    xfe2 = content.split('__DDG_FE_CHAT_HASH__="', 1)[1].split('"', 1)[0]
                    # Format it like "serp_YYYYMMDD_HHMMSS_ET-hash"
                    current_date = datetime.now().strftime("%Y%m%d_%H%M%S")
                    cls._chat_xfe = f"serp_{current_date}_ET-{xfe2}"
                    return cls._chat_xfe
                except Exception:
                    # Fallback to a default format if extraction fails
                    current_date = datetime.now().strftime("%Y%m%d_%H%M%S")
                    cls._chat_xfe = f"serp_{current_date}_ET-78c2e87e3d286691cc21"
                    return cls._chat_xfe
        except Exception:
            # Fallback to a default format if the request fails
            current_date = datetime.now().strftime("%Y%m%d_%H%M%S")
            cls._chat_xfe = f"serp_{current_date}_ET-78c2e87e3d286691cc21"
            return cls._chat_xfe

    @classmethod
    async def fetch_vqd_and_hash(cls, session: ClientSession, retry_count: int = 0) -> tuple[str, str]:
        """Fetches the required VQD token and hash for the chat session with retries."""
        headers = {
            "accept": "text/event-stream",
            "accept-language": "en",
            "cache-control": "no-cache",
            "pragma": "no-cache",
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
            "origin": "https://duckduckgo.com",
            "referer": "https://duckduckgo.com/",
            "x-vqd-accept": "1",
        }

        # Make sure we have cookies first
        if len(session.cookie_jar) == 0:
            await cls.get_default_cookies(session)

        try:
            await cls.sleep(multiplier=1.0 + retry_count * 0.5)
            async with session.get(cls.status_url, headers=headers) as response:
                await raise_for_status(response)
                vqd = response.headers.get("x-vqd-4", "")
                vqd_hash_1 = response.headers.get("x-vqd-hash-1", "")
                if vqd:
                    # Return the fetched vqd and vqd_hash_1
                    return vqd, vqd_hash_1
                response_text = await response.text()
                raise RuntimeError(f"Failed to fetch VQD token and hash: {response.status} {response_text}")
        except Exception as e:
            if retry_count < cls.max_retries:
                wait_time = cls.base_delay * (2 ** retry_count) * (1 + random.random())
                await asyncio.sleep(wait_time)
                return await cls.fetch_vqd_and_hash(session, retry_count + 1)
            else:
                raise RuntimeError(f"Failed to fetch VQD token and hash after {cls.max_retries} attempts: {str(e)}")
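
    # Retries in fetch_vqd_and_hash (above) and create_async_generator (below)
    # back off exponentially: wait_time = base_delay * 2**retry_count * (1 + random()),
    # so with base_delay = 2 each retry waits roughly twice as long as the previous
    # one, for at most max_retries attempts.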

    @classmethod
    async def create_async_generator(
        cls,
        model: str,
        messages: Messages,
        proxy: str = None,
        timeout: int = 60,
        cookies: Cookies = None,
        conversation: Conversation = None,
        return_conversation: bool = False,
        **kwargs
    ) -> AsyncResult:
        model = cls.validate_model(model)
        retry_count = 0

        while retry_count <= cls.max_retries:
            try:
                session_timeout = ClientTimeout(total=timeout)
                async with ClientSession(timeout=session_timeout, cookies=cookies) as session:
                    # Step 1: Ensure we have the fe_version
                    if not cls._chat_xfe:
                        cls._chat_xfe = await cls.fetch_fe_version(session)

                    # Step 2: Initialize or update conversation
                    if conversation is None:
                        # Get initial cookies if not provided
                        if not cookies:
                            await cls.get_default_cookies(session)

                        # Create a new conversation
                        conversation = Conversation(model)
                        conversation.fe_version = cls._chat_xfe

                        # Step 3: Get VQD tokens
                        vqd, vqd_hash_1 = await cls.fetch_vqd_and_hash(session)
                        conversation.vqd = vqd
                        conversation.vqd_hash_1 = vqd_hash_1
                        conversation.message_history = [{"role": "user", "content": format_prompt(messages)}]
                    else:
                        # Update existing conversation with new message
                        last_message = get_last_user_message(messages.copy())
                        conversation.message_history.append({"role": "user", "content": last_message})

                    # Step 4: Prepare headers with proper x-vqd-hash-1
                    headers = {
                        "accept": "text/event-stream",
                        "accept-language": "en",
                        "cache-control": "no-cache",
                        "content-type": "application/json",
                        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
                        "origin": "https://duckduckgo.com",
                        "referer": "https://duckduckgo.com/",
                        "pragma": "no-cache",
                        "priority": "u=1, i",
                        "sec-ch-ua": '"Not:A-Brand";v="24", "Chromium";v="134"',
                        "sec-ch-ua-mobile": "?0",
                        "sec-ch-ua-platform": '"Linux"',
                        "sec-fetch-dest": "empty",
                        "sec-fetch-mode": "cors",
                        "sec-fetch-site": "same-origin",
                        "x-fe-version": conversation.fe_version or cls._chat_xfe,
                        "x-vqd-4": conversation.vqd,
                    }

                    # For the first request, send an empty x-vqd-hash-1 header
                    # This matches the behavior in the duckduckgo_search module
                    headers["x-vqd-hash-1"] = ""

                    # Step 5: Prepare the request data
                    # Convert the user-friendly model name to the API model name
                    api_model = cls._chat_models.get(model, model)
                    data = {
                        "model": api_model,
                        "messages": conversation.message_history,
                    }
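
                    # For reference, the JSON body posted to /duckchat/v1/chat looks
                    # roughly like this (illustrative values):
                    # {"model": "gpt-4o-mini",
                    #  "messages": [{"role": "user", "content": "Hello"}]}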

                    # Step 6: Send the request
                    await cls.sleep(multiplier=1.0 + retry_count * 0.5)
                    async with session.post(cls.api_endpoint, json=data, headers=headers, proxy=proxy) as response:
                        # Handle 429 and 418 errors specifically
                        if response.status == 429:
                            response_text = await response.text()
                            if retry_count < cls.max_retries:
                                retry_count += 1
                                wait_time = cls.base_delay * (2 ** retry_count) * (1 + random.random())
                                await asyncio.sleep(wait_time)
                                # Get fresh tokens and cookies
                                cookies = await cls.get_default_cookies(session)
                                continue
                            else:
                                raise RateLimitError(f"Rate limited after {cls.max_retries} retries")
                        elif response.status == 418:
                            # Check if it's a challenge error
                            try:
                                response_text = await response.text()
                                try:
                                    response_json = json.loads(response_text)
                                    # Extract challenge data if available
                                    challenge_data = None
                                    if response_json.get("type") == "ERR_CHALLENGE" and "cd" in response_json:
                                        challenge_data = response_json["cd"]
                                    if retry_count < cls.max_retries:
                                        retry_count += 1
                                        wait_time = cls.base_delay * (2 ** retry_count) * (1 + random.random())
                                        await asyncio.sleep(wait_time)
                                        # Reset tokens and try again with a fresh session
                                        conversation = None
                                        cls._chat_xfe = ""
                                        # Get fresh cookies
                                        cookies = await cls.get_default_cookies(session)
                                        # If we have challenge data, try to use it
                                        if challenge_data and isinstance(challenge_data, dict):
                                            # Extract any useful information from challenge data
                                            # This could be used to build a better response in the future
                                            pass
                                        continue
                                    else:
                                        raise DuckDuckGoChallengeError(f"Challenge error after {cls.max_retries} retries")
                                except json.JSONDecodeError:
                                    # If we can't parse the JSON, assume it's a challenge error anyway
                                    if retry_count < cls.max_retries:
                                        retry_count += 1
                                        wait_time = cls.base_delay * (2 ** retry_count) * (1 + random.random())
                                        await asyncio.sleep(wait_time)
                                        # Reset tokens and try again with a fresh session
                                        conversation = None
                                        cls._chat_xfe = ""
                                        cookies = await cls.get_default_cookies(session)
                                        continue
                                    else:
                                        raise DuckDuckGoChallengeError(f"Challenge error after {cls.max_retries} retries")
                            except Exception as e:
                                # If any other error occurs during handling, still try to recover
                                if retry_count < cls.max_retries:
                                    retry_count += 1
                                    wait_time = cls.base_delay * (2 ** retry_count) * (1 + random.random())
                                    await asyncio.sleep(wait_time)
                                    # Reset tokens and try again with a fresh session
                                    conversation = None
                                    cls._chat_xfe = ""
                                    cookies = await cls.get_default_cookies(session)
                                    continue
                                else:
                                    raise DuckDuckGoChallengeError(f"Challenge error after {cls.max_retries} retries: {str(e)}")

                        # For other status codes, use the standard error handler
                        await raise_for_status(response)

                        reason = None
                        full_message = ""

                        # Step 7: Process the streaming response
                        async for line in response.content:
                            line = line.decode("utf-8").strip()
                            if line.startswith("data:"):
                                try:
                                    message = json.loads(line[5:].strip())
                                except json.JSONDecodeError:
                                    continue

                                if "action" in message and message["action"] == "error":
                                    error_type = message.get("type", "")
                                    if message.get("status") == 429:
                                        if error_type == "ERR_CONVERSATION_LIMIT":
                                            raise ConversationLimitError(error_type)
                                        raise RateLimitError(error_type)
                                    elif message.get("status") == 418 and error_type == "ERR_CHALLENGE":
                                        # Handle challenge error by refreshing tokens and retrying
                                        if retry_count < cls.max_retries:
                                            # Don't raise here, let the outer exception handler retry
                                            raise DuckDuckGoChallengeError(f"Challenge detected: {error_type}")
                                    raise DuckDuckGoSearchException(error_type)

                                if "message" in message:
                                    if message["message"]:
                                        yield message["message"]
                                        full_message += message["message"]
                                        reason = "length"
                                    else:
                                        reason = "stop"

                        # Step 8: Update conversation with response information
                        # Always update the VQD tokens from the response headers
                        conversation.vqd = response.headers.get("x-vqd-4", conversation.vqd)
                        conversation.vqd_hash_1 = response.headers.get("x-vqd-hash-1", conversation.vqd_hash_1)

                        # Update cookies
                        conversation.cookies = {
                            n: c.value
                            for n, c in session.cookie_jar.filter_cookies(URL(cls.url)).items()
                        }

                        # If requested, return the updated conversation
                        if return_conversation:
                            conversation.message_history.append({"role": "assistant", "content": full_message})
                            yield conversation

                        if reason is not None:
                            yield FinishReason(reason)

                        # If we got here, the request was successful
                        break
            except (RateLimitError, ResponseStatusError, DuckDuckGoChallengeError) as e:
                if ("429" in str(e) or isinstance(e, DuckDuckGoChallengeError)) and retry_count < cls.max_retries:
                    retry_count += 1
                    wait_time = cls.base_delay * (2 ** retry_count) * (1 + random.random())
                    await asyncio.sleep(wait_time)
                    # For challenge errors, refresh tokens and cookies
                    if isinstance(e, DuckDuckGoChallengeError):
                        # Reset conversation to force new token acquisition
                        conversation = None
                        # Clear class cache to force refresh
                        cls._chat_xfe = ""
                else:
                    raise
            except asyncio.TimeoutError as e:
                raise TimeoutError(f"Request timed out: {str(e)}")
            except Exception:
                raise
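

# Minimal local smoke test (illustrative sketch, not part of the provider API).
# Run it as a module so the relative imports resolve, e.g.:
#     python -m g4f.Provider.DDG
if __name__ == "__main__":
    async def _demo():
        messages = [{"role": "user", "content": "Hello, which model are you?"}]
        async for chunk in DDG.create_async_generator(DDG.default_model, messages):
            # The generator yields text chunks plus a FinishReason marker;
            # only print the text here.
            if isinstance(chunk, str):
                print(chunk, end="", flush=True)
        print()

    asyncio.run(_demo())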