mirror of
https://github.com/mealie-recipes/mealie.git
synced 2026-04-05 12:35:35 -04:00
fix: Don't continue parsing recipes with errored HTTP status codes (#7230)
This commit is contained in:
@@ -44,6 +44,10 @@ class RecipeScraper:
|
||||
"""
|
||||
|
||||
raw_html = html or await safe_scrape_html(url)
|
||||
|
||||
+ if not raw_html:
|
||||
+ return None, None
|
||||
|
||||
for ScraperClass in self.scrapers:
|
||||
scraper = ScraperClass(url, self.translator, raw_html=raw_html)
|
||||
if not scraper.can_scrape():
|
||||
|
||||
@@ -70,8 +70,8 @@ async def safe_scrape_html(url: str) -> str:
|
||||
headers=user_agents_manager.get_scrape_headers(user_agent),
|
||||
follow_redirects=True,
|
||||
) as resp:
|
||||
- if resp.status_code == status.HTTP_403_FORBIDDEN:
|
||||
- logger.debug(f'403 Forbidden with User-Agent: "{user_agent}"')
|
||||
+ if resp.status_code >= status.HTTP_400_BAD_REQUEST:
|
||||
+ logger.debug(f'Error status code {resp.status_code} with User-Agent: "{user_agent}"')
|
||||
continue
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
@@ -33,7 +33,7 @@ def video_scraper_setup(monkeypatch: pytest.MonkeyPatch):
|
||||
|
||||
# Prevent any real HTTP calls during scraping
|
||||
async def mock_safe_scrape_html(url: str) -> str:
|
||||
- return ""
|
||||
+ return "<html></html>"
|
||||
|
||||
monkeypatch.setattr(recipe_scraper_module, "safe_scrape_html", mock_safe_scrape_html)
|
||||
|
||||
|
||||
@@ -14,11 +14,11 @@ from bs4 import BeautifulSoup
|
||||
from fastapi.testclient import TestClient
|
||||
from httpx import Response
|
||||
from pytest import MonkeyPatch
|
||||
- from recipe_scrapers._abstract import AbstractScraper
|
||||
from recipe_scrapers._schemaorg import SchemaOrg
|
||||
from recipe_scrapers.plugins import SchemaOrgFillPlugin
|
||||
from slugify import slugify
|
||||
|
||||
+ import mealie.services.scraper.recipe_scraper as recipe_scraper_module
|
||||
from mealie.db.models.recipe import RecipeModel
|
||||
from mealie.pkgs.safehttp.transport import AsyncSafeTransport
|
||||
from mealie.schema.cookbook.cookbook import SaveCookBook
|
||||
@@ -102,12 +102,12 @@ def test_create_by_url(
|
||||
monkeypatch: MonkeyPatch,
|
||||
):
|
||||
for recipe_data in recipe_test_data:
|
||||
- # Override init function for AbstractScraper to use the test html instead of calling the url
|
||||
- monkeypatch.setattr(
|
||||
- AbstractScraper,
|
||||
- "__init__",
|
||||
- get_init(recipe_data.html_file),
|
||||
- )
|
||||
+ # Prevent any real HTTP calls during scraping
|
||||
+ async def mock_safe_scrape_html(url: str) -> str:
|
||||
+ return "<html></html>"
|
||||
|
||||
+ monkeypatch.setattr(recipe_scraper_module, "safe_scrape_html", mock_safe_scrape_html)
|
||||
|
||||
# Override the get_html method of the RecipeScraperOpenGraph to return the test html
|
||||
for scraper_cls in DEFAULT_SCRAPER_STRATEGIES:
|
||||
monkeypatch.setattr(
|
||||
|
||||
Reference in New Issue
Block a user