Mirror of https://github.com/mealie-recipes/mealie.git (synced 2026-04-18 10:55:36 -04:00)
feat: Switch to httpx-curl-cffi for better scraping (#7254)
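For orientation, a minimal sketch (not part of this commit) of what the new transport layer does: httpx-curl-cffi plugs a curl_cffi-backed transport into httpx so outgoing requests present a real browser's TLS fingerprint. The "chrome" profile mirrors the value used in the diff below; the URL is a placeholder.

import asyncio

import httpx
from httpx_curl_cffi import AsyncCurlTransport


async def fetch(url: str) -> str:
    # The curl_cffi backend performs the TLS handshake like a real Chrome build,
    # so JA3/JA4 fingerprinting sees a browser rather than a generic Python client.
    transport = AsyncCurlTransport(impersonate="chrome")
    async with httpx.AsyncClient(transport=transport) as client:
        resp = await client.get(url, follow_redirects=True)
        resp.raise_for_status()
        return resp.text


if __name__ == "__main__":
    print(asyncio.run(fetch("https://example.com")))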
@@ -3,6 +3,7 @@ import logging
 import socket

 import httpx
+from httpx_curl_cffi import AsyncCurlTransport


 class ForcedTimeoutException(Exception):
@@ -21,9 +22,9 @@ class InvalidDomainError(Exception):
     ...


-class AsyncSafeTransport(httpx.AsyncBaseTransport):
+class AsyncSafeTransport(AsyncCurlTransport):
     """
-    A wrapper around the httpx transport class that enforces a timeout value
+    A wrapper around the httpx-curl-cffi transport class that enforces a timeout value
     and that the request is not made to a local IP address.
     """

@@ -31,10 +32,10 @@ class AsyncSafeTransport(httpx.AsyncBaseTransport):

     def __init__(self, log: logging.Logger | None = None, **kwargs):
         self.timeout = kwargs.pop("timeout", self.timeout)
-        self._wrapper = httpx.AsyncHTTPTransport(**kwargs)
         self._log = log
+        super().__init__(**kwargs)

-    async def handle_async_request(self, request) -> httpx.Response:
+    async def handle_async_request(self, request: httpx.Request) -> httpx.Response:
         # override timeout value for _all_ requests
         request.extensions["timeout"] = httpx.Timeout(self.timeout, pool=self.timeout).as_dict()

@@ -72,7 +73,4 @@ class AsyncSafeTransport(httpx.AsyncBaseTransport):
                 self._log.warning(f"invalid request on local resource: {request.url} -> {ip}")
             raise InvalidDomainError(f"invalid request on local resource: {request.url} -> {ip}")

-        return await self._wrapper.handle_async_request(request)
-
-    async def aclose(self):
-        await self._wrapper.aclose()
+        return await super().handle_async_request(request)
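The hunks above rework mealie/pkgs/safehttp/transport.py: AsyncSafeTransport now subclasses AsyncCurlTransport instead of wrapping httpx.AsyncHTTPTransport, while keeping the forced timeout and the local-address guard. A rough usage sketch based on what the diff shows (the timeout kwarg is popped by __init__; remaining kwargs such as impersonate are forwarded to the curl transport):

import httpx

from mealie.pkgs.safehttp.transport import AsyncSafeTransport


async def fetch_remote_only(url: str) -> httpx.Response:
    # timeout is consumed by AsyncSafeTransport itself; impersonate is passed
    # through to the underlying AsyncCurlTransport.
    transport = AsyncSafeTransport(timeout=10, impersonate="chrome")
    async with httpx.AsyncClient(transport=transport) as client:
        # Requests that resolve to a local address raise InvalidDomainError.
        return await client.get(url)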
@@ -7,11 +7,9 @@ from httpx import AsyncClient, Response
 from pydantic import UUID4

 from mealie.pkgs import img, safehttp
-from mealie.pkgs.safehttp.transport import AsyncSafeTransport
 from mealie.schema.recipe.recipe import Recipe
 from mealie.schema.recipe.recipe_image_types import RecipeImageTypes
 from mealie.services._base_service import BaseService
-from mealie.services.scraper.user_agents_manager import get_user_agents_manager


 async def gather_with_concurrency(n, *coros, ignore_exceptions=False):
@@ -28,17 +26,15 @@ async def gather_with_concurrency(n, *coros, ignore_exceptions=False):


 async def largest_content_len(urls: list[str]) -> tuple[str, int]:
-    user_agent_manager = get_user_agents_manager()
-
     largest_url = ""
     largest_len = 0

     max_concurrency = 10

     async def do(client: AsyncClient, url: str) -> Response:
-        return await client.head(url, headers=user_agent_manager.get_scrape_headers())
+        return await client.head(url)

-    async with AsyncClient(transport=safehttp.AsyncSafeTransport()) as client:
+    async with AsyncClient(transport=safehttp.AsyncSafeTransport(impersonate="chrome")) as client:
         tasks = [do(client, url) for url in urls]
         responses: list[Response] = await gather_with_concurrency(max_concurrency, *tasks, ignore_exceptions=True)
         for response in responses:
@@ -117,7 +113,6 @@ class RecipeDataService(BaseService):

     async def scrape_image(self, image_url: str | dict[str, str] | list[str]) -> None:
         self.logger.info(f"Image URL: {image_url}")
-        user_agent = get_user_agents_manager().user_agents[0]

         image_url_str = ""

@@ -146,9 +141,9 @@ class RecipeDataService(BaseService):
         file_name = f"{self.recipe_id!s}.{ext}"
         file_path = Recipe.directory_from_id(self.recipe_id).joinpath("images", file_name)

-        async with AsyncClient(transport=AsyncSafeTransport()) as client:
+        async with AsyncClient(transport=safehttp.AsyncSafeTransport(impersonate="chrome")) as client:
             try:
-                r = await client.get(image_url_str, headers={"User-Agent": user_agent})
+                r = await client.get(image_url_str)
             except Exception:
                 self.logger.exception("Fatal Image Request Exception")
                 return None
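These hunks drop the hand-rolled User-Agent handling from the recipe data service and rely on the impersonating transport instead. The HEAD requests in largest_content_len are still fanned out through gather_with_concurrency; a semaphore-based helper along these lines (a sketch, not necessarily mealie's exact implementation) is the usual way to cap in-flight requests:

import asyncio


async def gather_with_concurrency(n: int, *coros, ignore_exceptions: bool = False):
    # Allow at most n coroutines to run at the same time.
    semaphore = asyncio.Semaphore(n)

    async def sem_coro(coro):
        async with semaphore:
            return await coro

    results = await asyncio.gather(*(sem_coro(c) for c in coros), return_exceptions=ignore_exceptions)
    if ignore_exceptions:
        # Drop failures so callers only see successful results.
        results = [r for r in results if not isinstance(r, BaseException)]
    return results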
@@ -32,9 +32,16 @@ from mealie.services.openai import OpenAIService
 from mealie.services.scraper.scraped_extras import ScrapedExtras

 from . import cleaner
-from .user_agents_manager import get_user_agents_manager

 SCRAPER_TIMEOUT = 15

+BROWSER_IMPERSONATIONS = [
+    "chrome",
+    "firefox",
+    "safari",
+    "edge",
+]
+
 logger = get_logger()


@@ -51,29 +58,40 @@ class ForceTimeoutException(Exception):
 async def safe_scrape_html(url: str) -> str:
     """
     Scrapes the html from a url but will cancel the request
-    if the request takes longer than 15 seconds. This is used to mitigate
+    if the request takes longer than SCRAPER_TIMEOUT seconds. This is used to mitigate
     DDOS attacks from users providing a url with arbitrary large content.
+
+    Cycles through browser TLS impersonations (via httpx-curl-cffi) to bypass
+    bot-detection systems that fingerprint the TLS handshake (JA3/JA4),
+    such as Cloudflare.
     """
-    user_agents_manager = get_user_agents_manager()
-
     logger.debug(f"Scraping URL: {url}")
-    async with AsyncClient(transport=safehttp.AsyncSafeTransport()) as client:
-        for user_agent in user_agents_manager.user_agents:
-            logger.debug(f'Trying User-Agent: "{user_agent}"')
-
-            response: Response | None = None
-            html_bytes = b""
+    html_bytes = b""
+    response: Response | None = None
+
+    for impersonation in BROWSER_IMPERSONATIONS:
+        logger.debug(f'Trying browser impersonation: "{impersonation}"')
+
+        html_bytes = b""
+        response = None
+
+        transport = safehttp.AsyncSafeTransport(impersonate=impersonation)
+        async with AsyncClient(transport=transport) as client:
             async with client.stream(
                 "GET",
                 url,
                 timeout=SCRAPER_TIMEOUT,
-                headers=user_agents_manager.get_scrape_headers(user_agent),
                 follow_redirects=True,
             ) as resp:
-                if resp.status_code >= status.HTTP_400_BAD_REQUEST:
-                    logger.debug(f'Error status code {resp.status_code} with User-Agent: "{user_agent}"')
+                if resp.status_code == 403:
+                    logger.debug(f'403 Forbidden with impersonation "{impersonation}", trying next')
                     continue

+                if resp.status_code >= 400:
+                    logger.debug(f'Error status code {resp.status_code} with impersonation "{impersonation}"')
+                    break
+
                 start_time = time.time()

                 async for chunk in resp.aiter_bytes(chunk_size=1024):
@@ -85,33 +103,32 @@ async def safe_scrape_html(url: str) -> str:
                     response = resp
                     break

-        if not (response and html_bytes):
-            return ""
+    if not (response and html_bytes):
+        return ""

-        # =====================================
-        # Copied from requests text property
-        # =====================================
+    # Copied from requests text property

-        # Try charset from content-type
-        content = None
-        encoding = response.encoding
+    # Try charset from content-type
+    encoding = response.encoding

-        # Fallback to auto-detected encoding.
-        if encoding is None:
-            encoding = response.apparent_encoding
+    # Fallback to auto-detected encoding.
+    if encoding is None:
+        encoding = response.apparent_encoding

-        # Decode unicode from given encoding.
-        try:
-            content = str(html_bytes, encoding, errors="replace")
-        except (LookupError, TypeError):
-            # A LookupError is raised if the encoding was not found which could
-            # indicate a misspelling or similar mistake.
-            #
-            # A TypeError can be raised if encoding is None
-            #
-            # So we try blindly encoding.
-            content = str(html_bytes, errors="replace")
+    # Decode unicode from given encoding.
+    try:
+        content = str(html_bytes, encoding, errors="replace")
+    except (LookupError, TypeError):
+        # A LookupError is raised if the encoding was not found which could
+        # indicate a misspelling or similar mistake.
+        #
+        # A TypeError can be raised if encoding is None
+        #
+        # So we try blindly encoding.
+        content = str(html_bytes, errors="replace")

-        return content
+    return content


 class ABCScraperStrategy(ABC):
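The scraper now retries with a different browser profile whenever a site answers 403, instead of rotating User-Agent strings. A condensed, standalone sketch of the same pattern (hypothetical names; mealie's ForceTimeoutException is replaced by a plain TimeoutError here, and no mealie internals are used):

import time

import httpx
from httpx_curl_cffi import AsyncCurlTransport

BROWSER_IMPERSONATIONS = ["chrome", "firefox", "safari", "edge"]
SCRAPER_TIMEOUT = 15


async def scrape_html(url: str) -> str:
    # Try each impersonation profile until one gets past TLS fingerprinting.
    for impersonation in BROWSER_IMPERSONATIONS:
        transport = AsyncCurlTransport(impersonate=impersonation)
        async with httpx.AsyncClient(transport=transport) as client:
            async with client.stream("GET", url, timeout=SCRAPER_TIMEOUT, follow_redirects=True) as resp:
                if resp.status_code == 403:
                    continue  # fingerprint rejected; try the next browser profile
                if resp.status_code >= 400:
                    break  # a real error; other profiles will not help

                # Stream the body with a wall-clock cap so an arbitrarily large
                # response cannot tie up the worker.
                html_bytes = b""
                start_time = time.time()
                async for chunk in resp.aiter_bytes(chunk_size=1024):
                    html_bytes += chunk
                    if time.time() - start_time > SCRAPER_TIMEOUT:
                        raise TimeoutError("download exceeded the scraping time budget")
                return html_bytes.decode(resp.encoding or "utf-8", errors="replace")
    return ""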
@@ -1,3 +0,0 @@
-Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/26.3.1 Safari/605.1.15
-Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:148.0) Gecko/20100101 Firefox/148.0
-Mozilla/5.0 (Linux; Android 10; HD1913) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.6834.164 Mobile Safari/537.36 EdgA/131.0.2903.87
@@ -1,65 +0,0 @@
-from __future__ import annotations
-
-import os
-import random
-
-_USER_AGENTS_MANAGER: UserAgentsManager | None = None
-
-
-def get_user_agents_manager() -> UserAgentsManager:
-    global _USER_AGENTS_MANAGER
-
-    if not _USER_AGENTS_MANAGER:
-        _USER_AGENTS_MANAGER = UserAgentsManager()
-
-    return _USER_AGENTS_MANAGER
-
-
-class UserAgentsManager:
-    def __init__(self) -> None:
-        self._user_agents: list[str] | None = None
-        self._user_agents_text_path = os.path.join(os.path.dirname(__file__), "user-agents.txt")
-
-    def get_scrape_headers(self, user_agent: str | None = None) -> dict[str, str]:
-        # From: https://scrapeops.io/web-scraping-playbook/403-forbidden-error-web-scraping/#optimize-request-headers
-        if user_agent is None:
-            user_agent = random.choice(self.user_agents)
-
-        return {
-            "User-Agent": user_agent,
-            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
-            "Accept-Language": "en-US,en;q=0.5",
-            "Accept-Encoding": "gzip, deflate",
-            "Connection": "keep-alive",
-            "Upgrade-Insecure-Requests": "1",
-            "Sec-Fetch-Dest": "document",
-            "Sec-Fetch-Mode": "navigate",
-            "Sec-Fetch-Site": "none",
-            "Sec-Fetch-User": "?1",
-            "Cache-Control": "max-age=0",
-        }
-
-    @property
-    def user_agents(self) -> list[str]:
-        if not self._user_agents:
-            self._user_agents = self._fetch_user_agents()
-
-        return self._user_agents
-
-    def _fetch_user_agents(self) -> list[str]:
-        user_agents: list[str] = []
-
-        try:
-            from recipe_scrapers._abstract import HEADERS
-
-            user_agents.append(HEADERS["User-Agent"])
-        except (ImportError, KeyError):
-            user_agents.append("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/128.0")
-
-        with open(self._user_agents_text_path) as f:
-            for line in f:
-                if not line:
-                    continue
-                user_agents.append(line.strip())
-
-        return user_agents