From 60d4a62f0a4333f064dd61c86c1855b15d8ef0d3 Mon Sep 17 00:00:00 2001 From: Michael Genson <71845777+michael-genson@users.noreply.github.com> Date: Sun, 15 Mar 2026 18:30:23 -0500 Subject: [PATCH] feat: Switch to httpx-curl-cffi for better scraping (#7254) --- mealie/pkgs/safehttp/transport.py | 14 ++- mealie/services/recipe/recipe_data_service.py | 13 +-- mealie/services/scraper/scraper_strategies.py | 85 +++++++++++-------- mealie/services/scraper/user-agents.txt | 3 - .../services/scraper/user_agents_manager.py | 65 -------------- pyproject.toml | 1 + uv.lock | 39 +++++++++ 7 files changed, 101 insertions(+), 119 deletions(-) delete mode 100644 mealie/services/scraper/user-agents.txt delete mode 100644 mealie/services/scraper/user_agents_manager.py diff --git a/mealie/pkgs/safehttp/transport.py b/mealie/pkgs/safehttp/transport.py index b0352b87e..264c8ef91 100644 --- a/mealie/pkgs/safehttp/transport.py +++ b/mealie/pkgs/safehttp/transport.py @@ -3,6 +3,7 @@ import logging import socket import httpx +from httpx_curl_cffi import AsyncCurlTransport class ForcedTimeoutException(Exception): @@ -21,9 +22,9 @@ class InvalidDomainError(Exception): ... -class AsyncSafeTransport(httpx.AsyncBaseTransport): +class AsyncSafeTransport(AsyncCurlTransport): """ - A wrapper around the httpx transport class that enforces a timeout value + A wrapper around the httpx-curl-cffi transport class that enforces a timeout value and that the request is not made to a local IP address. """ @@ -31,10 +32,10 @@ class AsyncSafeTransport(httpx.AsyncBaseTransport): def __init__(self, log: logging.Logger | None = None, **kwargs): self.timeout = kwargs.pop("timeout", self.timeout) - self._wrapper = httpx.AsyncHTTPTransport(**kwargs) self._log = log + super().__init__(**kwargs) - async def handle_async_request(self, request) -> httpx.Response: + async def handle_async_request(self, request: httpx.Request) -> httpx.Response: # override timeout value for _all_ requests request.extensions["timeout"] = httpx.Timeout(self.timeout, pool=self.timeout).as_dict() @@ -72,7 +73,4 @@ class AsyncSafeTransport(httpx.AsyncBaseTransport): self._log.warning(f"invalid request on local resource: {request.url} -> {ip}") raise InvalidDomainError(f"invalid request on local resource: {request.url} -> {ip}") - return await self._wrapper.handle_async_request(request) - - async def aclose(self): - await self._wrapper.aclose() + return await super().handle_async_request(request) diff --git a/mealie/services/recipe/recipe_data_service.py b/mealie/services/recipe/recipe_data_service.py index 9eb864773..d269d5d1e 100644 --- a/mealie/services/recipe/recipe_data_service.py +++ b/mealie/services/recipe/recipe_data_service.py @@ -7,11 +7,9 @@ from httpx import AsyncClient, Response from pydantic import UUID4 from mealie.pkgs import img, safehttp -from mealie.pkgs.safehttp.transport import AsyncSafeTransport from mealie.schema.recipe.recipe import Recipe from mealie.schema.recipe.recipe_image_types import RecipeImageTypes from mealie.services._base_service import BaseService -from mealie.services.scraper.user_agents_manager import get_user_agents_manager async def gather_with_concurrency(n, *coros, ignore_exceptions=False): @@ -28,17 +26,15 @@ async def gather_with_concurrency(n, *coros, ignore_exceptions=False): async def largest_content_len(urls: list[str]) -> tuple[str, int]: - user_agent_manager = get_user_agents_manager() - largest_url = "" largest_len = 0 max_concurrency = 10 async def do(client: AsyncClient, url: str) -> Response: - return await client.head(url, headers=user_agent_manager.get_scrape_headers()) + return await client.head(url) - async with AsyncClient(transport=safehttp.AsyncSafeTransport()) as client: + async with AsyncClient(transport=safehttp.AsyncSafeTransport(impersonate="chrome")) as client: tasks = [do(client, url) for url in urls] responses: list[Response] = await gather_with_concurrency(max_concurrency, *tasks, ignore_exceptions=True) for response in responses: @@ -117,7 +113,6 @@ class RecipeDataService(BaseService): async def scrape_image(self, image_url: str | dict[str, str] | list[str]) -> None: self.logger.info(f"Image URL: {image_url}") - user_agent = get_user_agents_manager().user_agents[0] image_url_str = "" @@ -146,9 +141,9 @@ class RecipeDataService(BaseService): file_name = f"{self.recipe_id!s}.{ext}" file_path = Recipe.directory_from_id(self.recipe_id).joinpath("images", file_name) - async with AsyncClient(transport=AsyncSafeTransport()) as client: + async with AsyncClient(transport=safehttp.AsyncSafeTransport(impersonate="chrome")) as client: try: - r = await client.get(image_url_str, headers={"User-Agent": user_agent}) + r = await client.get(image_url_str) except Exception: self.logger.exception("Fatal Image Request Exception") return None diff --git a/mealie/services/scraper/scraper_strategies.py b/mealie/services/scraper/scraper_strategies.py index d6b7e489e..b33a441d6 100644 --- a/mealie/services/scraper/scraper_strategies.py +++ b/mealie/services/scraper/scraper_strategies.py @@ -32,9 +32,16 @@ from mealie.services.openai import OpenAIService from mealie.services.scraper.scraped_extras import ScrapedExtras from . import cleaner -from .user_agents_manager import get_user_agents_manager SCRAPER_TIMEOUT = 15 + +BROWSER_IMPERSONATIONS = [ + "chrome", + "firefox", + "safari", + "edge", +] + logger = get_logger() @@ -51,29 +58,40 @@ class ForceTimeoutException(Exception): async def safe_scrape_html(url: str) -> str: """ Scrapes the html from a url but will cancel the request - if the request takes longer than 15 seconds. This is used to mitigate + if the request takes longer than SCRAPER_TIMEOUT seconds. This is used to mitigate DDOS attacks from users providing a url with arbitrary large content. + + Cycles through browser TLS impersonations (via httpx-curl-cffi) to bypass + bot-detection systems that fingerprint the TLS handshake (JA3/JA4), + such as Cloudflare. """ - user_agents_manager = get_user_agents_manager() - logger.debug(f"Scraping URL: {url}") - async with AsyncClient(transport=safehttp.AsyncSafeTransport()) as client: - for user_agent in user_agents_manager.user_agents: - logger.debug(f'Trying User-Agent: "{user_agent}"') - response: Response | None = None - html_bytes = b"" + html_bytes = b"" + response: Response | None = None + + for impersonation in BROWSER_IMPERSONATIONS: + logger.debug(f'Trying browser impersonation: "{impersonation}"') + + html_bytes = b"" + response = None + + transport = safehttp.AsyncSafeTransport(impersonate=impersonation) + async with AsyncClient(transport=transport) as client: async with client.stream( "GET", url, timeout=SCRAPER_TIMEOUT, - headers=user_agents_manager.get_scrape_headers(user_agent), follow_redirects=True, ) as resp: - if resp.status_code >= status.HTTP_400_BAD_REQUEST: - logger.debug(f'Error status code {resp.status_code} with User-Agent: "{user_agent}"') + if resp.status_code == 403: + logger.debug(f'403 Forbidden with impersonation "{impersonation}", trying next') continue + if resp.status_code >= 400: + logger.debug(f'Error status code {resp.status_code} with impersonation "{impersonation}"') + break + start_time = time.time() async for chunk in resp.aiter_bytes(chunk_size=1024): @@ -85,33 +103,32 @@ async def safe_scrape_html(url: str) -> str: response = resp break - if not (response and html_bytes): - return "" + if not (response and html_bytes): + return "" - # ===================================== - # Copied from requests text property + # ===================================== + # Copied from requests text property - # Try charset from content-type - content = None - encoding = response.encoding + # Try charset from content-type + encoding = response.encoding - # Fallback to auto-detected encoding. - if encoding is None: - encoding = response.apparent_encoding + # Fallback to auto-detected encoding. + if encoding is None: + encoding = response.apparent_encoding - # Decode unicode from given encoding. - try: - content = str(html_bytes, encoding, errors="replace") - except (LookupError, TypeError): - # A LookupError is raised if the encoding was not found which could - # indicate a misspelling or similar mistake. - # - # A TypeError can be raised if encoding is None - # - # So we try blindly encoding. - content = str(html_bytes, errors="replace") + # Decode unicode from given encoding. + try: + content = str(html_bytes, encoding, errors="replace") + except (LookupError, TypeError): + # A LookupError is raised if the encoding was not found which could + # indicate a misspelling or similar mistake. + # + # A TypeError can be raised if encoding is None + # + # So we try blindly encoding. + content = str(html_bytes, errors="replace") - return content + return content class ABCScraperStrategy(ABC): diff --git a/mealie/services/scraper/user-agents.txt b/mealie/services/scraper/user-agents.txt deleted file mode 100644 index fdd70aa5c..000000000 --- a/mealie/services/scraper/user-agents.txt +++ /dev/null @@ -1,3 +0,0 @@ -Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/26.3.1 Safari/605.1.15 -Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:148.0) Gecko/20100101 Firefox/148.0 -Mozilla/5.0 (Linux; Android 10; HD1913) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.6834.164 Mobile Safari/537.36 EdgA/131.0.2903.87 diff --git a/mealie/services/scraper/user_agents_manager.py b/mealie/services/scraper/user_agents_manager.py deleted file mode 100644 index 416ad08f2..000000000 --- a/mealie/services/scraper/user_agents_manager.py +++ /dev/null @@ -1,65 +0,0 @@ -from __future__ import annotations - -import os -import random - -_USER_AGENTS_MANAGER: UserAgentsManager | None = None - - -def get_user_agents_manager() -> UserAgentsManager: - global _USER_AGENTS_MANAGER - - if not _USER_AGENTS_MANAGER: - _USER_AGENTS_MANAGER = UserAgentsManager() - - return _USER_AGENTS_MANAGER - - -class UserAgentsManager: - def __init__(self) -> None: - self._user_agents: list[str] | None = None - self._user_agents_text_path = os.path.join(os.path.dirname(__file__), "user-agents.txt") - - def get_scrape_headers(self, user_agent: str | None = None) -> dict[str, str]: - # From: https://scrapeops.io/web-scraping-playbook/403-forbidden-error-web-scraping/#optimize-request-headers - if user_agent is None: - user_agent = random.choice(self.user_agents) - - return { - "User-Agent": user_agent, - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", - "Accept-Language": "en-US,en;q=0.5", - "Accept-Encoding": "gzip, deflate", - "Connection": "keep-alive", - "Upgrade-Insecure-Requests": "1", - "Sec-Fetch-Dest": "document", - "Sec-Fetch-Mode": "navigate", - "Sec-Fetch-Site": "none", - "Sec-Fetch-User": "?1", - "Cache-Control": "max-age=0", - } - - @property - def user_agents(self) -> list[str]: - if not self._user_agents: - self._user_agents = self._fetch_user_agents() - - return self._user_agents - - def _fetch_user_agents(self) -> list[str]: - user_agents: list[str] = [] - - try: - from recipe_scrapers._abstract import HEADERS - - user_agents.append(HEADERS["User-Agent"]) - except (ImportError, KeyError): - user_agents.append("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/128.0") - - with open(self._user_agents_text_path) as f: - for line in f: - if not line: - continue - user_agents.append(line.strip()) - - return user_agents diff --git a/pyproject.toml b/pyproject.toml index 8d1e272e0..698b55be1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,7 @@ dependencies = [ "yt-dlp==2026.3.13", "ingredient-parser-nlp==2.5.0", "pint==0.25.2", + "httpx-curl-cffi==0.1.5", ] [project.scripts] diff --git a/uv.lock b/uv.lock index 4520b4d7c..f31d8febe 100644 --- a/uv.lock +++ b/uv.lock @@ -351,6 +351,29 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e8/cb/2da4cc83f5edb9c3257d09e1e7ab7b23f049c7962cae8d842bbef0a9cec9/cryptography-46.0.3-cp38-abi3-win_arm64.whl", hash = "sha256:d89c3468de4cdc4f08a57e214384d0471911a3830fcdaf7a8cc587e42a866372", size = 2918740, upload-time = "2025-10-15T23:18:12.277Z" }, ] +[[package]] +name = "curl-cffi" +version = "0.14.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "cffi" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9b/c9/0067d9a25ed4592b022d4558157fcdb6e123516083700786d38091688767/curl_cffi-0.14.0.tar.gz", hash = "sha256:5ffbc82e59f05008ec08ea432f0e535418823cda44178ee518906a54f27a5f0f", size = 162633, upload-time = "2025-12-16T03:25:07.931Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/aa/f0/0f21e9688eaac85e705537b3a87a5588d0cefb2f09d83e83e0e8be93aa99/curl_cffi-0.14.0-cp39-abi3-macosx_14_0_arm64.whl", hash = "sha256:e35e89c6a69872f9749d6d5fda642ed4fc159619329e99d577d0104c9aad5893", size = 3087277, upload-time = "2025-12-16T03:24:49.607Z" }, + { url = "https://files.pythonhosted.org/packages/ba/a3/0419bd48fce5b145cb6a2344c6ac17efa588f5b0061f212c88e0723da026/curl_cffi-0.14.0-cp39-abi3-macosx_15_0_x86_64.whl", hash = "sha256:5945478cd28ad7dfb5c54473bcfb6743ee1d66554d57951fdf8fc0e7d8cf4e45", size = 5804650, upload-time = "2025-12-16T03:24:51.518Z" }, + { url = "https://files.pythonhosted.org/packages/e2/07/a238dd062b7841b8caa2fa8a359eb997147ff3161288f0dd46654d898b4d/curl_cffi-0.14.0-cp39-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c42e8fa3c667db9ccd2e696ee47adcd3cd5b0838d7282f3fc45f6c0ef3cfdfa7", size = 8231918, upload-time = "2025-12-16T03:24:52.862Z" }, + { url = "https://files.pythonhosted.org/packages/7c/d2/ce907c9b37b5caf76ac08db40cc4ce3d9f94c5500db68a195af3513eacbc/curl_cffi-0.14.0-cp39-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:060fe2c99c41d3cb7f894de318ddf4b0301b08dca70453d769bd4e74b36b8483", size = 8654624, upload-time = "2025-12-16T03:24:54.579Z" }, + { url = "https://files.pythonhosted.org/packages/f2/ae/6256995b18c75e6ef76b30753a5109e786813aa79088b27c8eabb1ef85c9/curl_cffi-0.14.0-cp39-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:b158c41a25388690dd0d40b5bc38d1e0f512135f17fdb8029868cbc1993d2e5b", size = 8010654, upload-time = "2025-12-16T03:24:56.507Z" }, + { url = "https://files.pythonhosted.org/packages/fb/10/ff64249e516b103cb762e0a9dca3ee0f04cf25e2a1d5d9838e0f1273d071/curl_cffi-0.14.0-cp39-abi3-manylinux_2_28_i686.whl", hash = "sha256:1439fbef3500fb723333c826adf0efb0e2e5065a703fb5eccce637a2250db34a", size = 7781969, upload-time = "2025-12-16T03:24:57.885Z" }, + { url = "https://files.pythonhosted.org/packages/51/76/d6f7bb76c2d12811aa7ff16f5e17b678abdd1b357b9a8ac56310ceccabd5/curl_cffi-0.14.0-cp39-abi3-manylinux_2_34_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e7176f2c2d22b542e3cf261072a81deb018cfa7688930f95dddef215caddb469", size = 7969133, upload-time = "2025-12-16T03:24:59.261Z" }, + { url = "https://files.pythonhosted.org/packages/23/7c/cca39c0ed4e1772613d3cba13091c0e9d3b89365e84b9bf9838259a3cd8f/curl_cffi-0.14.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:03f21ade2d72978c2bb8670e9b6de5260e2755092b02d94b70b906813662998d", size = 9080167, upload-time = "2025-12-16T03:25:00.946Z" }, + { url = "https://files.pythonhosted.org/packages/75/03/a942d7119d3e8911094d157598ae0169b1c6ca1bd3f27d7991b279bcc45b/curl_cffi-0.14.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:58ebf02de64ee5c95613209ddacb014c2d2f86298d7080c0a1c12ed876ee0690", size = 9520464, upload-time = "2025-12-16T03:25:02.922Z" }, + { url = "https://files.pythonhosted.org/packages/a2/77/78900e9b0833066d2274bda75cba426fdb4cef7fbf6a4f6a6ca447607bec/curl_cffi-0.14.0-cp39-abi3-win_amd64.whl", hash = "sha256:6e503f9a103f6ae7acfb3890c843b53ec030785a22ae7682a22cc43afb94123e", size = 1677416, upload-time = "2025-12-16T03:25:04.902Z" }, + { url = "https://files.pythonhosted.org/packages/5c/7c/d2ba86b0b3e1e2830bd94163d047de122c69a8df03c5c7c36326c456ad82/curl_cffi-0.14.0-cp39-abi3-win_arm64.whl", hash = "sha256:2eed50a969201605c863c4c31269dfc3e0da52916086ac54553cfa353022425c", size = 1425067, upload-time = "2025-12-16T03:25:06.454Z" }, +] + [[package]] name = "dill" version = "0.4.0" @@ -575,6 +598,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, ] +[[package]] +name = "httpx-curl-cffi" +version = "0.1.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "curl-cffi" }, + { name = "httpx" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/64/1f/158975d2541effa30f0d7634542dce50e8280ab283e7efc8d221ebf8a949/httpx_curl_cffi-0.1.5.tar.gz", hash = "sha256:177ee9968e9da142407017816cc3fb08ab281b134f773a9359b6a4650a6c81f3", size = 7937, upload-time = "2025-12-02T08:59:13.656Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6e/13/82039e3df58e0d52a6f82cc73d958400a2777d78c6cd6378c937a707afd0/httpx_curl_cffi-0.1.5-py3-none-any.whl", hash = "sha256:be414a97ac1f627693f4c8a8631f2852bb1c09456e61ff8ad996ad050a11fb53", size = 8933, upload-time = "2025-12-02T08:59:12.447Z" }, +] + [[package]] name = "identify" version = "2.6.15" @@ -840,6 +877,7 @@ dependencies = [ { name = "fastapi" }, { name = "html2text" }, { name = "httpx" }, + { name = "httpx-curl-cffi" }, { name = "ingredient-parser-nlp" }, { name = "isodate" }, { name = "itsdangerous" }, @@ -915,6 +953,7 @@ requires-dist = [ { name = "fastapi", specifier = "==0.135.1" }, { name = "html2text", specifier = "==2025.4.15" }, { name = "httpx", specifier = "==0.28.1" }, + { name = "httpx-curl-cffi", specifier = "==0.1.5" }, { name = "ingredient-parser-nlp", specifier = "==2.5.0" }, { name = "isodate", specifier = "==0.7.2" }, { name = "itsdangerous", specifier = "==2.2.0" },