feat: Switch to httpx-curl-cffi for better scraping (#7254)

2026-04-18 10:55:36 -04:00 · 2026-03-15 18:30:23 -05:00
parent 69d740e100
commit 60d4a62f0a
7 changed files with 101 additions and 119 deletions
--- a/mealie/pkgs/safehttp/transport.py
+++ b/mealie/pkgs/safehttp/transport.py
@@ -3,6 +3,7 @@ import logging
 import socket

 import httpx
+from httpx_curl_cffi import AsyncCurlTransport


 class ForcedTimeoutException(Exception):
@@ -21,9 +22,9 @@ class InvalidDomainError(Exception):
    ...


-class AsyncSafeTransport(httpx.AsyncBaseTransport):
+class AsyncSafeTransport(AsyncCurlTransport):
    """
-    A wrapper around the httpx transport class that enforces a timeout value
+    A wrapper around the httpx-curl-cffi transport class that enforces a timeout value
    and that the request is not made to a local IP address.
    """

@@ -31,10 +32,10 @@ class AsyncSafeTransport(httpx.AsyncBaseTransport):

    def __init__(self, log: logging.Logger | None = None, **kwargs):
        self.timeout = kwargs.pop("timeout", self.timeout)
-        self._wrapper = httpx.AsyncHTTPTransport(**kwargs)
        self._log = log
+        super().__init__(**kwargs)

-    async def handle_async_request(self, request) -> httpx.Response:
+    async def handle_async_request(self, request: httpx.Request) -> httpx.Response:
        # override timeout value for _all_ requests
        request.extensions["timeout"] = httpx.Timeout(self.timeout, pool=self.timeout).as_dict()

@@ -72,7 +73,4 @@ class AsyncSafeTransport(httpx.AsyncBaseTransport):
                self._log.warning(f"invalid request on local resource: {request.url} -> {ip}")
            raise InvalidDomainError(f"invalid request on local resource: {request.url} -> {ip}")

-        return await self._wrapper.handle_async_request(request)
-
-    async def aclose(self):
-        await self._wrapper.aclose()
+        return await super().handle_async_request(request)
--- a/mealie/services/recipe/recipe_data_service.py
+++ b/mealie/services/recipe/recipe_data_service.py
@@ -7,11 +7,9 @@ from httpx import AsyncClient, Response
 from pydantic import UUID4

 from mealie.pkgs import img, safehttp
-from mealie.pkgs.safehttp.transport import AsyncSafeTransport
 from mealie.schema.recipe.recipe import Recipe
 from mealie.schema.recipe.recipe_image_types import RecipeImageTypes
 from mealie.services._base_service import BaseService
-from mealie.services.scraper.user_agents_manager import get_user_agents_manager


 async def gather_with_concurrency(n, *coros, ignore_exceptions=False):
@@ -28,17 +26,15 @@ async def gather_with_concurrency(n, *coros, ignore_exceptions=False):


 async def largest_content_len(urls: list[str]) -> tuple[str, int]:
-    user_agent_manager = get_user_agents_manager()
-
    largest_url = ""
    largest_len = 0

    max_concurrency = 10

    async def do(client: AsyncClient, url: str) -> Response:
-        return await client.head(url, headers=user_agent_manager.get_scrape_headers())
+        return await client.head(url)

-    async with AsyncClient(transport=safehttp.AsyncSafeTransport()) as client:
+    async with AsyncClient(transport=safehttp.AsyncSafeTransport(impersonate="chrome")) as client:
        tasks = [do(client, url) for url in urls]
        responses: list[Response] = await gather_with_concurrency(max_concurrency, *tasks, ignore_exceptions=True)
        for response in responses:
@@ -117,7 +113,6 @@ class RecipeDataService(BaseService):

    async def scrape_image(self, image_url: str | dict[str, str] | list[str]) -> None:
        self.logger.info(f"Image URL: {image_url}")
-        user_agent = get_user_agents_manager().user_agents[0]

        image_url_str = ""

@@ -146,9 +141,9 @@ class RecipeDataService(BaseService):
        file_name = f"{self.recipe_id!s}.{ext}"
        file_path = Recipe.directory_from_id(self.recipe_id).joinpath("images", file_name)

-        async with AsyncClient(transport=AsyncSafeTransport()) as client:
+        async with AsyncClient(transport=safehttp.AsyncSafeTransport(impersonate="chrome")) as client:
            try:
-                r = await client.get(image_url_str, headers={"User-Agent": user_agent})
+                r = await client.get(image_url_str)
            except Exception:
                self.logger.exception("Fatal Image Request Exception")
                return None
--- a/mealie/services/scraper/scraper_strategies.py
+++ b/mealie/services/scraper/scraper_strategies.py
@@ -32,9 +32,16 @@ from mealie.services.openai import OpenAIService
 from mealie.services.scraper.scraped_extras import ScrapedExtras

 from . import cleaner
-from .user_agents_manager import get_user_agents_manager

 SCRAPER_TIMEOUT = 15
+
+BROWSER_IMPERSONATIONS = [
+    "chrome",
+    "firefox",
+    "safari",
+    "edge",
+]
+
 logger = get_logger()


@@ -51,29 +58,40 @@ class ForceTimeoutException(Exception):
 async def safe_scrape_html(url: str) -> str:
    """
    Scrapes the html from a url but will cancel the request
-    if the request takes longer than 15 seconds. This is used to mitigate
+    if the request takes longer than SCRAPER_TIMEOUT seconds. This is used to mitigate
    DDOS attacks from users providing a url with arbitrary large content.
+
+    Cycles through browser TLS impersonations (via httpx-curl-cffi) to bypass
+    bot-detection systems that fingerprint the TLS handshake (JA3/JA4),
+    such as Cloudflare.
    """
-    user_agents_manager = get_user_agents_manager()
-
    logger.debug(f"Scraping URL: {url}")
-    async with AsyncClient(transport=safehttp.AsyncSafeTransport()) as client:
-        for user_agent in user_agents_manager.user_agents:
-            logger.debug(f'Trying User-Agent: "{user_agent}"')

-            response: Response | None = None
-            html_bytes = b""
+    html_bytes = b""
+    response: Response | None = None
+
+    for impersonation in BROWSER_IMPERSONATIONS:
+        logger.debug(f'Trying browser impersonation: "{impersonation}"')
+
+        html_bytes = b""
+        response = None
+
+        transport = safehttp.AsyncSafeTransport(impersonate=impersonation)
+        async with AsyncClient(transport=transport) as client:
            async with client.stream(
                "GET",
                url,
                timeout=SCRAPER_TIMEOUT,
-                headers=user_agents_manager.get_scrape_headers(user_agent),
                follow_redirects=True,
            ) as resp:
-                if resp.status_code >= status.HTTP_400_BAD_REQUEST:
-                    logger.debug(f'Error status code {resp.status_code} with User-Agent: "{user_agent}"')
+                if resp.status_code == 403:
+                    logger.debug(f'403 Forbidden with impersonation "{impersonation}", trying next')
                    continue

+                if resp.status_code >= 400:
+                    logger.debug(f'Error status code {resp.status_code} with impersonation "{impersonation}"')
+                    break
+
                start_time = time.time()

                async for chunk in resp.aiter_bytes(chunk_size=1024):
@@ -85,33 +103,32 @@ async def safe_scrape_html(url: str) -> str:
                response = resp
                break

-        if not (response and html_bytes):
-            return ""
+    if not (response and html_bytes):
+        return ""

-        # =====================================
-        # Copied from requests text property
+    # =====================================
+    # Copied from requests text property

-        # Try charset from content-type
-        content = None
-        encoding = response.encoding
+    # Try charset from content-type
+    encoding = response.encoding

-        # Fallback to auto-detected encoding.
-        if encoding is None:
-            encoding = response.apparent_encoding
+    # Fallback to auto-detected encoding.
+    if encoding is None:
+        encoding = response.apparent_encoding

-        # Decode unicode from given encoding.
-        try:
-            content = str(html_bytes, encoding, errors="replace")
-        except (LookupError, TypeError):
-            # A LookupError is raised if the encoding was not found which could
-            # indicate a misspelling or similar mistake.
-            #
-            # A TypeError can be raised if encoding is None
-            #
-            # So we try blindly encoding.
-            content = str(html_bytes, errors="replace")
+    # Decode unicode from given encoding.
+    try:
+        content = str(html_bytes, encoding, errors="replace")
+    except (LookupError, TypeError):
+        # A LookupError is raised if the encoding was not found which could
+        # indicate a misspelling or similar mistake.
+        #
+        # A TypeError can be raised if encoding is None
+        #
+        # So we try blindly encoding.
+        content = str(html_bytes, errors="replace")

-        return content
+    return content


 class ABCScraperStrategy(ABC):
--- a/mealie/services/scraper/user-agents.txt
+++ b/mealie/services/scraper/user-agents.txt
@@ -1,3 +0,0 @@
-Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/26.3.1 Safari/605.1.15
-Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:148.0) Gecko/20100101 Firefox/148.0
-Mozilla/5.0 (Linux; Android 10; HD1913) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.6834.164 Mobile Safari/537.36 EdgA/131.0.2903.87
--- a/mealie/services/scraper/user_agents_manager.py
+++ b/mealie/services/scraper/user_agents_manager.py
@@ -1,65 +0,0 @@
-from __future__ import annotations
-
-import os
-import random
-
-_USER_AGENTS_MANAGER: UserAgentsManager | None = None
-
-
-def get_user_agents_manager() -> UserAgentsManager:
-    global _USER_AGENTS_MANAGER
-
-    if not _USER_AGENTS_MANAGER:
-        _USER_AGENTS_MANAGER = UserAgentsManager()
-
-    return _USER_AGENTS_MANAGER
-
-
-class UserAgentsManager:
-    def __init__(self) -> None:
-        self._user_agents: list[str] | None = None
-        self._user_agents_text_path = os.path.join(os.path.dirname(__file__), "user-agents.txt")
-
-    def get_scrape_headers(self, user_agent: str | None = None) -> dict[str, str]:
-        # From: https://scrapeops.io/web-scraping-playbook/403-forbidden-error-web-scraping/#optimize-request-headers
-        if user_agent is None:
-            user_agent = random.choice(self.user_agents)
-
-        return {
-            "User-Agent": user_agent,
-            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
-            "Accept-Language": "en-US,en;q=0.5",
-            "Accept-Encoding": "gzip, deflate",
-            "Connection": "keep-alive",
-            "Upgrade-Insecure-Requests": "1",
-            "Sec-Fetch-Dest": "document",
-            "Sec-Fetch-Mode": "navigate",
-            "Sec-Fetch-Site": "none",
-            "Sec-Fetch-User": "?1",
-            "Cache-Control": "max-age=0",
-        }
-
-    @property
-    def user_agents(self) -> list[str]:
-        if not self._user_agents:
-            self._user_agents = self._fetch_user_agents()
-
-        return self._user_agents
-
-    def _fetch_user_agents(self) -> list[str]:
-        user_agents: list[str] = []
-
-        try:
-            from recipe_scrapers._abstract import HEADERS
-
-            user_agents.append(HEADERS["User-Agent"])
-        except (ImportError, KeyError):
-            user_agents.append("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/128.0")
-
-        with open(self._user_agents_text_path) as f:
-            for line in f:
-                if not line:
-                    continue
-                user_agents.append(line.strip())
-
-        return user_agents