From 6a3b38a31e437447568ef5a19a2b3bf3841ab7bd Mon Sep 17 00:00:00 2001 From: Dean Malan Date: Thu, 12 Mar 2026 15:58:40 +0200 Subject: [PATCH] fix: Don't continue parsing recipes with errored HTTP status codes (#7230) --- mealie/services/scraper/recipe_scraper.py | 4 ++++ mealie/services/scraper/scraper_strategies.py | 4 ++-- .../test_recipe_create_from_video.py | 2 +- .../user_recipe_tests/test_recipe_crud.py | 14 +++++++------- 4 files changed, 14 insertions(+), 10 deletions(-) diff --git a/mealie/services/scraper/recipe_scraper.py b/mealie/services/scraper/recipe_scraper.py index 5223378aa..f4bca66df 100644 --- a/mealie/services/scraper/recipe_scraper.py +++ b/mealie/services/scraper/recipe_scraper.py @@ -44,6 +44,10 @@ class RecipeScraper: """ raw_html = html or await safe_scrape_html(url) + + if not raw_html: + return None, None + for ScraperClass in self.scrapers: scraper = ScraperClass(url, self.translator, raw_html=raw_html) if not scraper.can_scrape(): diff --git a/mealie/services/scraper/scraper_strategies.py b/mealie/services/scraper/scraper_strategies.py index b15c9c09c..03057723b 100644 --- a/mealie/services/scraper/scraper_strategies.py +++ b/mealie/services/scraper/scraper_strategies.py @@ -70,8 +70,8 @@ async def safe_scrape_html(url: str) -> str: headers=user_agents_manager.get_scrape_headers(user_agent), follow_redirects=True, ) as resp: - if resp.status_code == status.HTTP_403_FORBIDDEN: - logger.debug(f'403 Forbidden with User-Agent: "{user_agent}"') + if resp.status_code >= status.HTTP_400_BAD_REQUEST: + logger.debug(f'Error status code {resp.status_code} with User-Agent: "{user_agent}"') continue start_time = time.time() diff --git a/tests/integration_tests/user_recipe_tests/test_recipe_create_from_video.py b/tests/integration_tests/user_recipe_tests/test_recipe_create_from_video.py index d54443086..9313ad55b 100644 --- a/tests/integration_tests/user_recipe_tests/test_recipe_create_from_video.py +++ b/tests/integration_tests/user_recipe_tests/test_recipe_create_from_video.py @@ -33,7 +33,7 @@ def video_scraper_setup(monkeypatch: pytest.MonkeyPatch): # Prevent any real HTTP calls during scraping async def mock_safe_scrape_html(url: str) -> str: - return "" + return "" monkeypatch.setattr(recipe_scraper_module, "safe_scrape_html", mock_safe_scrape_html) diff --git a/tests/integration_tests/user_recipe_tests/test_recipe_crud.py b/tests/integration_tests/user_recipe_tests/test_recipe_crud.py index 7203527df..ba1974649 100644 --- a/tests/integration_tests/user_recipe_tests/test_recipe_crud.py +++ b/tests/integration_tests/user_recipe_tests/test_recipe_crud.py @@ -14,11 +14,11 @@ from bs4 import BeautifulSoup from fastapi.testclient import TestClient from httpx import Response from pytest import MonkeyPatch -from recipe_scrapers._abstract import AbstractScraper from recipe_scrapers._schemaorg import SchemaOrg from recipe_scrapers.plugins import SchemaOrgFillPlugin from slugify import slugify +import mealie.services.scraper.recipe_scraper as recipe_scraper_module from mealie.db.models.recipe import RecipeModel from mealie.pkgs.safehttp.transport import AsyncSafeTransport from mealie.schema.cookbook.cookbook import SaveCookBook @@ -102,12 +102,12 @@ def test_create_by_url( monkeypatch: MonkeyPatch, ): for recipe_data in recipe_test_data: - # Override init function for AbstractScraper to use the test html instead of calling the url - monkeypatch.setattr( - AbstractScraper, - "__init__", - get_init(recipe_data.html_file), - ) + # Prevent any real HTTP calls during scraping + async def mock_safe_scrape_html(url: str) -> str: + return "" + + monkeypatch.setattr(recipe_scraper_module, "safe_scrape_html", mock_safe_scrape_html) + # Override the get_html method of the RecipeScraperOpenGraph to return the test html for scraper_cls in DEFAULT_SCRAPER_STRATEGIES: monkeypatch.setattr(