feat: Switch to httpx-curl-cffi for better scraping (#7254)

This commit is contained in:
Michael Genson
2026-03-15 18:30:23 -05:00
committed by GitHub
parent 69d740e100
commit 60d4a62f0a
7 changed files with 101 additions and 119 deletions

View File

@@ -3,6 +3,7 @@ import logging
import socket
import httpx
from httpx_curl_cffi import AsyncCurlTransport
class ForcedTimeoutException(Exception):
@@ -21,9 +22,9 @@ class InvalidDomainError(Exception):
...
class AsyncSafeTransport(httpx.AsyncBaseTransport): class AsyncSafeTransport(AsyncCurlTransport):
""" """
A wrapper around the httpx transport class that enforces a timeout value A wrapper around the httpx-curl-cffi transport class that enforces a timeout value
and that the request is not made to a local IP address. and that the request is not made to a local IP address.
""" """
@@ -31,10 +32,10 @@ class AsyncSafeTransport(httpx.AsyncBaseTransport):
def __init__(self, log: logging.Logger | None = None, **kwargs):
    self.timeout = kwargs.pop("timeout", self.timeout)
self._wrapper = httpx.AsyncHTTPTransport(**kwargs)
    self._log = log
super().__init__(**kwargs)
async def handle_async_request(self, request) -> httpx.Response: async def handle_async_request(self, request: httpx.Request) -> httpx.Response:
# override timeout value for _all_ requests
request.extensions["timeout"] = httpx.Timeout(self.timeout, pool=self.timeout).as_dict()
@@ -72,7 +73,4 @@ class AsyncSafeTransport(httpx.AsyncBaseTransport):
self._log.warning(f"invalid request on local resource: {request.url} -> {ip}")
raise InvalidDomainError(f"invalid request on local resource: {request.url} -> {ip}")
return await self._wrapper.handle_async_request(request) return await super().handle_async_request(request)
async def aclose(self):
await self._wrapper.aclose()

View File

@@ -7,11 +7,9 @@ from httpx import AsyncClient, Response
from pydantic import UUID4
from mealie.pkgs import img, safehttp
from mealie.pkgs.safehttp.transport import AsyncSafeTransport
from mealie.schema.recipe.recipe import Recipe
from mealie.schema.recipe.recipe_image_types import RecipeImageTypes
from mealie.services._base_service import BaseService
from mealie.services.scraper.user_agents_manager import get_user_agents_manager
async def gather_with_concurrency(n, *coros, ignore_exceptions=False):
@@ -28,17 +26,15 @@ async def gather_with_concurrency(n, *coros, ignore_exceptions=False):
async def largest_content_len(urls: list[str]) -> tuple[str, int]:
user_agent_manager = get_user_agents_manager()
largest_url = ""
largest_len = 0
max_concurrency = 10

async def do(client: AsyncClient, url: str) -> Response:
return await client.head(url, headers=user_agent_manager.get_scrape_headers()) return await client.head(url)
async with AsyncClient(transport=safehttp.AsyncSafeTransport()) as client: async with AsyncClient(transport=safehttp.AsyncSafeTransport(impersonate="chrome")) as client:
tasks = [do(client, url) for url in urls]
responses: list[Response] = await gather_with_concurrency(max_concurrency, *tasks, ignore_exceptions=True)
for response in responses:
@@ -117,7 +113,6 @@ class RecipeDataService(BaseService):
async def scrape_image(self, image_url: str | dict[str, str] | list[str]) -> None:
    self.logger.info(f"Image URL: {image_url}")
user_agent = get_user_agents_manager().user_agents[0]
image_url_str = "" image_url_str = ""
@@ -146,9 +141,9 @@ class RecipeDataService(BaseService):
file_name = f"{self.recipe_id!s}.{ext}"
file_path = Recipe.directory_from_id(self.recipe_id).joinpath("images", file_name)
async with AsyncClient(transport=AsyncSafeTransport()) as client: async with AsyncClient(transport=safehttp.AsyncSafeTransport(impersonate="chrome")) as client:
try: try:
r = await client.get(image_url_str, headers={"User-Agent": user_agent}) r = await client.get(image_url_str)
except Exception:
    self.logger.exception("Fatal Image Request Exception")
    return None

View File

@@ -32,9 +32,16 @@ from mealie.services.openai import OpenAIService
from mealie.services.scraper.scraped_extras import ScrapedExtras
from . import cleaner
from .user_agents_manager import get_user_agents_manager
SCRAPER_TIMEOUT = 15 SCRAPER_TIMEOUT = 15
BROWSER_IMPERSONATIONS = [
"chrome",
"firefox",
"safari",
"edge",
]
logger = get_logger() logger = get_logger()
@@ -51,29 +58,40 @@ class ForceTimeoutException(Exception):
async def safe_scrape_html(url: str) -> str: async def safe_scrape_html(url: str) -> str:
""" """
Scrapes the html from a url but will cancel the request Scrapes the html from a url but will cancel the request
if the request takes longer than 15 seconds. This is used to mitigate if the request takes longer than SCRAPER_TIMEOUT seconds. This is used to mitigate
DDOS attacks from users providing a url with arbitrary large content. DDOS attacks from users providing a url with arbitrary large content.
Cycles through browser TLS impersonations (via httpx-curl-cffi) to bypass
bot-detection systems that fingerprint the TLS handshake (JA3/JA4),
such as Cloudflare.
""" """
user_agents_manager = get_user_agents_manager()
logger.debug(f"Scraping URL: {url}") logger.debug(f"Scraping URL: {url}")
async with AsyncClient(transport=safehttp.AsyncSafeTransport()) as client:
for user_agent in user_agents_manager.user_agents:
logger.debug(f'Trying User-Agent: "{user_agent}"')
response: Response | None = None
html_bytes = b"" html_bytes = b""
response: Response | None = None
for impersonation in BROWSER_IMPERSONATIONS:
logger.debug(f'Trying browser impersonation: "{impersonation}"')
html_bytes = b""
response = None
transport = safehttp.AsyncSafeTransport(impersonate=impersonation)
async with AsyncClient(transport=transport) as client:
async with client.stream(
    "GET",
    url,
    timeout=SCRAPER_TIMEOUT,
headers=user_agents_manager.get_scrape_headers(user_agent),
follow_redirects=True, follow_redirects=True,
) as resp: ) as resp:
if resp.status_code >= status.HTTP_400_BAD_REQUEST: if resp.status_code == 403:
logger.debug(f'Error status code {resp.status_code} with User-Agent: "{user_agent}"') logger.debug(f'403 Forbidden with impersonation "{impersonation}", trying next')
continue continue
if resp.status_code >= 400:
logger.debug(f'Error status code {resp.status_code} with impersonation "{impersonation}"')
break
start_time = time.time()
async for chunk in resp.aiter_bytes(chunk_size=1024):
@@ -92,7 +110,6 @@ async def safe_scrape_html(url: str) -> str:
# Copied from requests text property # Copied from requests text property
# Try charset from content-type # Try charset from content-type
content = None
encoding = response.encoding
# Fallback to auto-detected encoding.

View File

@@ -1,3 +0,0 @@
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/26.3.1 Safari/605.1.15
Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:148.0) Gecko/20100101 Firefox/148.0
Mozilla/5.0 (Linux; Android 10; HD1913) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.6834.164 Mobile Safari/537.36 EdgA/131.0.2903.87

View File

@@ -1,65 +0,0 @@
from __future__ import annotations
import os
import random
_USER_AGENTS_MANAGER: UserAgentsManager | None = None
def get_user_agents_manager() -> UserAgentsManager:
global _USER_AGENTS_MANAGER
if not _USER_AGENTS_MANAGER:
_USER_AGENTS_MANAGER = UserAgentsManager()
return _USER_AGENTS_MANAGER
class UserAgentsManager:
def __init__(self) -> None:
self._user_agents: list[str] | None = None
self._user_agents_text_path = os.path.join(os.path.dirname(__file__), "user-agents.txt")
def get_scrape_headers(self, user_agent: str | None = None) -> dict[str, str]:
# From: https://scrapeops.io/web-scraping-playbook/403-forbidden-error-web-scraping/#optimize-request-headers
if user_agent is None:
user_agent = random.choice(self.user_agents)
return {
"User-Agent": user_agent,
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Fetch-User": "?1",
"Cache-Control": "max-age=0",
}
@property
def user_agents(self) -> list[str]:
if not self._user_agents:
self._user_agents = self._fetch_user_agents()
return self._user_agents
def _fetch_user_agents(self) -> list[str]:
user_agents: list[str] = []
try:
from recipe_scrapers._abstract import HEADERS
user_agents.append(HEADERS["User-Agent"])
except (ImportError, KeyError):
user_agents.append("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/128.0")
with open(self._user_agents_text_path) as f:
for line in f:
if not line:
continue
user_agents.append(line.strip())
return user_agents

View File

@@ -48,6 +48,7 @@ dependencies = [
"yt-dlp==2026.3.13", "yt-dlp==2026.3.13",
"ingredient-parser-nlp==2.5.0", "ingredient-parser-nlp==2.5.0",
"pint==0.25.2", "pint==0.25.2",
"httpx-curl-cffi==0.1.5",
] ]
[project.scripts] [project.scripts]

39
uv.lock generated
View File

@@ -351,6 +351,29 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/e8/cb/2da4cc83f5edb9c3257d09e1e7ab7b23f049c7962cae8d842bbef0a9cec9/cryptography-46.0.3-cp38-abi3-win_arm64.whl", hash = "sha256:d89c3468de4cdc4f08a57e214384d0471911a3830fcdaf7a8cc587e42a866372", size = 2918740, upload-time = "2025-10-15T23:18:12.277Z" }, { url = "https://files.pythonhosted.org/packages/e8/cb/2da4cc83f5edb9c3257d09e1e7ab7b23f049c7962cae8d842bbef0a9cec9/cryptography-46.0.3-cp38-abi3-win_arm64.whl", hash = "sha256:d89c3468de4cdc4f08a57e214384d0471911a3830fcdaf7a8cc587e42a866372", size = 2918740, upload-time = "2025-10-15T23:18:12.277Z" },
] ]
[[package]]
name = "curl-cffi"
version = "0.14.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "certifi" },
{ name = "cffi" },
]
sdist = { url = "https://files.pythonhosted.org/packages/9b/c9/0067d9a25ed4592b022d4558157fcdb6e123516083700786d38091688767/curl_cffi-0.14.0.tar.gz", hash = "sha256:5ffbc82e59f05008ec08ea432f0e535418823cda44178ee518906a54f27a5f0f", size = 162633, upload-time = "2025-12-16T03:25:07.931Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/aa/f0/0f21e9688eaac85e705537b3a87a5588d0cefb2f09d83e83e0e8be93aa99/curl_cffi-0.14.0-cp39-abi3-macosx_14_0_arm64.whl", hash = "sha256:e35e89c6a69872f9749d6d5fda642ed4fc159619329e99d577d0104c9aad5893", size = 3087277, upload-time = "2025-12-16T03:24:49.607Z" },
{ url = "https://files.pythonhosted.org/packages/ba/a3/0419bd48fce5b145cb6a2344c6ac17efa588f5b0061f212c88e0723da026/curl_cffi-0.14.0-cp39-abi3-macosx_15_0_x86_64.whl", hash = "sha256:5945478cd28ad7dfb5c54473bcfb6743ee1d66554d57951fdf8fc0e7d8cf4e45", size = 5804650, upload-time = "2025-12-16T03:24:51.518Z" },
{ url = "https://files.pythonhosted.org/packages/e2/07/a238dd062b7841b8caa2fa8a359eb997147ff3161288f0dd46654d898b4d/curl_cffi-0.14.0-cp39-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c42e8fa3c667db9ccd2e696ee47adcd3cd5b0838d7282f3fc45f6c0ef3cfdfa7", size = 8231918, upload-time = "2025-12-16T03:24:52.862Z" },
{ url = "https://files.pythonhosted.org/packages/7c/d2/ce907c9b37b5caf76ac08db40cc4ce3d9f94c5500db68a195af3513eacbc/curl_cffi-0.14.0-cp39-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:060fe2c99c41d3cb7f894de318ddf4b0301b08dca70453d769bd4e74b36b8483", size = 8654624, upload-time = "2025-12-16T03:24:54.579Z" },
{ url = "https://files.pythonhosted.org/packages/f2/ae/6256995b18c75e6ef76b30753a5109e786813aa79088b27c8eabb1ef85c9/curl_cffi-0.14.0-cp39-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:b158c41a25388690dd0d40b5bc38d1e0f512135f17fdb8029868cbc1993d2e5b", size = 8010654, upload-time = "2025-12-16T03:24:56.507Z" },
{ url = "https://files.pythonhosted.org/packages/fb/10/ff64249e516b103cb762e0a9dca3ee0f04cf25e2a1d5d9838e0f1273d071/curl_cffi-0.14.0-cp39-abi3-manylinux_2_28_i686.whl", hash = "sha256:1439fbef3500fb723333c826adf0efb0e2e5065a703fb5eccce637a2250db34a", size = 7781969, upload-time = "2025-12-16T03:24:57.885Z" },
{ url = "https://files.pythonhosted.org/packages/51/76/d6f7bb76c2d12811aa7ff16f5e17b678abdd1b357b9a8ac56310ceccabd5/curl_cffi-0.14.0-cp39-abi3-manylinux_2_34_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e7176f2c2d22b542e3cf261072a81deb018cfa7688930f95dddef215caddb469", size = 7969133, upload-time = "2025-12-16T03:24:59.261Z" },
{ url = "https://files.pythonhosted.org/packages/23/7c/cca39c0ed4e1772613d3cba13091c0e9d3b89365e84b9bf9838259a3cd8f/curl_cffi-0.14.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:03f21ade2d72978c2bb8670e9b6de5260e2755092b02d94b70b906813662998d", size = 9080167, upload-time = "2025-12-16T03:25:00.946Z" },
{ url = "https://files.pythonhosted.org/packages/75/03/a942d7119d3e8911094d157598ae0169b1c6ca1bd3f27d7991b279bcc45b/curl_cffi-0.14.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:58ebf02de64ee5c95613209ddacb014c2d2f86298d7080c0a1c12ed876ee0690", size = 9520464, upload-time = "2025-12-16T03:25:02.922Z" },
{ url = "https://files.pythonhosted.org/packages/a2/77/78900e9b0833066d2274bda75cba426fdb4cef7fbf6a4f6a6ca447607bec/curl_cffi-0.14.0-cp39-abi3-win_amd64.whl", hash = "sha256:6e503f9a103f6ae7acfb3890c843b53ec030785a22ae7682a22cc43afb94123e", size = 1677416, upload-time = "2025-12-16T03:25:04.902Z" },
{ url = "https://files.pythonhosted.org/packages/5c/7c/d2ba86b0b3e1e2830bd94163d047de122c69a8df03c5c7c36326c456ad82/curl_cffi-0.14.0-cp39-abi3-win_arm64.whl", hash = "sha256:2eed50a969201605c863c4c31269dfc3e0da52916086ac54553cfa353022425c", size = 1425067, upload-time = "2025-12-16T03:25:06.454Z" },
]
[[package]] [[package]]
name = "dill" name = "dill"
version = "0.4.0" version = "0.4.0"
@@ -575,6 +598,20 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" },
] ]
[[package]]
name = "httpx-curl-cffi"
version = "0.1.5"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "curl-cffi" },
{ name = "httpx" },
{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/64/1f/158975d2541effa30f0d7634542dce50e8280ab283e7efc8d221ebf8a949/httpx_curl_cffi-0.1.5.tar.gz", hash = "sha256:177ee9968e9da142407017816cc3fb08ab281b134f773a9359b6a4650a6c81f3", size = 7937, upload-time = "2025-12-02T08:59:13.656Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/6e/13/82039e3df58e0d52a6f82cc73d958400a2777d78c6cd6378c937a707afd0/httpx_curl_cffi-0.1.5-py3-none-any.whl", hash = "sha256:be414a97ac1f627693f4c8a8631f2852bb1c09456e61ff8ad996ad050a11fb53", size = 8933, upload-time = "2025-12-02T08:59:12.447Z" },
]
[[package]] [[package]]
name = "identify" name = "identify"
version = "2.6.15" version = "2.6.15"
@@ -840,6 +877,7 @@ dependencies = [
{ name = "fastapi" }, { name = "fastapi" },
{ name = "html2text" }, { name = "html2text" },
{ name = "httpx" }, { name = "httpx" },
{ name = "httpx-curl-cffi" },
{ name = "ingredient-parser-nlp" }, { name = "ingredient-parser-nlp" },
{ name = "isodate" }, { name = "isodate" },
{ name = "itsdangerous" }, { name = "itsdangerous" },
@@ -915,6 +953,7 @@ requires-dist = [
{ name = "fastapi", specifier = "==0.135.1" }, { name = "fastapi", specifier = "==0.135.1" },
{ name = "html2text", specifier = "==2025.4.15" }, { name = "html2text", specifier = "==2025.4.15" },
{ name = "httpx", specifier = "==0.28.1" }, { name = "httpx", specifier = "==0.28.1" },
{ name = "httpx-curl-cffi", specifier = "==0.1.5" },
{ name = "ingredient-parser-nlp", specifier = "==2.5.0" }, { name = "ingredient-parser-nlp", specifier = "==2.5.0" },
{ name = "isodate", specifier = "==0.7.2" }, { name = "isodate", specifier = "==0.7.2" },
{ name = "itsdangerous", specifier = "==2.2.0" }, { name = "itsdangerous", specifier = "==2.2.0" },