fix: Don't continue parsing recipes with errored HTTP status codes (#7230)

This commit is contained in:
Dean Malan
2026-03-12 15:58:40 +02:00
committed by GitHub
parent 8189416495
commit 6a3b38a31e
4 changed files with 14 additions and 10 deletions

View File

@@ -44,6 +44,10 @@ class RecipeScraper:
         """
         raw_html = html or await safe_scrape_html(url)
+        if not raw_html:
+            return None, None
+
         for ScraperClass in self.scrapers:
             scraper = ScraperClass(url, self.translator, raw_html=raw_html)
             if not scraper.can_scrape():

View File

@@ -70,8 +70,8 @@ async def safe_scrape_html(url: str) -> str:
             headers=user_agents_manager.get_scrape_headers(user_agent),
             follow_redirects=True,
         ) as resp:
-            if resp.status_code == status.HTTP_403_FORBIDDEN:
-                logger.debug(f'403 Forbidden with User-Agent: "{user_agent}"')
+            if resp.status_code >= status.HTTP_400_BAD_REQUEST:
+                logger.debug(f'Error status code {resp.status_code} with User-Agent: "{user_agent}"')
                 continue
             start_time = time.time()

View File

@@ -33,7 +33,7 @@ def video_scraper_setup(monkeypatch: pytest.MonkeyPatch):
     # Prevent any real HTTP calls during scraping
     async def mock_safe_scrape_html(url: str) -> str:
-        return ""
+        return "<html></html>"

     monkeypatch.setattr(recipe_scraper_module, "safe_scrape_html", mock_safe_scrape_html)

View File

@@ -14,11 +14,11 @@ from bs4 import BeautifulSoup
 from fastapi.testclient import TestClient
 from httpx import Response
 from pytest import MonkeyPatch
-from recipe_scrapers._abstract import AbstractScraper
 from recipe_scrapers._schemaorg import SchemaOrg
 from recipe_scrapers.plugins import SchemaOrgFillPlugin
 from slugify import slugify

+import mealie.services.scraper.recipe_scraper as recipe_scraper_module
 from mealie.db.models.recipe import RecipeModel
 from mealie.pkgs.safehttp.transport import AsyncSafeTransport
 from mealie.schema.cookbook.cookbook import SaveCookBook
@@ -102,12 +102,12 @@ def test_create_by_url(
     monkeypatch: MonkeyPatch,
 ):
     for recipe_data in recipe_test_data:
-        # Override init function for AbstractScraper to use the test html instead of calling the url
-        monkeypatch.setattr(
-            AbstractScraper,
-            "__init__",
-            get_init(recipe_data.html_file),
-        )
+        # Prevent any real HTTP calls during scraping
+        async def mock_safe_scrape_html(url: str) -> str:
+            return "<html></html>"
+
+        monkeypatch.setattr(recipe_scraper_module, "safe_scrape_html", mock_safe_scrape_html)
+
         # Override the get_html method of the RecipeScraperOpenGraph to return the test html
         for scraper_cls in DEFAULT_SCRAPER_STRATEGIES:
             monkeypatch.setattr(