Mirror of https://github.com/mealie-recipes/mealie.git (synced 2026-04-13 00:15:34 -04:00)
fix: Don't continue parsing recipes with errored HTTP status codes (#7230)
This commit is contained in:
@@ -44,6 +44,10 @@ class RecipeScraper:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
raw_html = html or await safe_scrape_html(url)
|
raw_html = html or await safe_scrape_html(url)
|
||||||
|
|
||||||
|
if not raw_html:
|
||||||
|
return None, None
|
||||||
|
|
||||||
for ScraperClass in self.scrapers:
|
for ScraperClass in self.scrapers:
|
||||||
scraper = ScraperClass(url, self.translator, raw_html=raw_html)
|
scraper = ScraperClass(url, self.translator, raw_html=raw_html)
|
||||||
if not scraper.can_scrape():
|
if not scraper.can_scrape():
|
||||||
|
|||||||
@@ -70,8 +70,8 @@ async def safe_scrape_html(url: str) -> str:
|
|||||||
headers=user_agents_manager.get_scrape_headers(user_agent),
|
headers=user_agents_manager.get_scrape_headers(user_agent),
|
||||||
follow_redirects=True,
|
follow_redirects=True,
|
||||||
) as resp:
|
) as resp:
|
||||||
if resp.status_code == status.HTTP_403_FORBIDDEN:
|
if resp.status_code >= status.HTTP_400_BAD_REQUEST:
|
||||||
logger.debug(f'403 Forbidden with User-Agent: "{user_agent}"')
|
logger.debug(f'Error status code {resp.status_code} with User-Agent: "{user_agent}"')
|
||||||
continue
|
continue
|
||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|||||||
@@ -33,7 +33,7 @@ def video_scraper_setup(monkeypatch: pytest.MonkeyPatch):
|
|||||||
|
|
||||||
# Prevent any real HTTP calls during scraping
|
# Prevent any real HTTP calls during scraping
|
||||||
async def mock_safe_scrape_html(url: str) -> str:
|
async def mock_safe_scrape_html(url: str) -> str:
|
||||||
return ""
|
return "<html></html>"
|
||||||
|
|
||||||
monkeypatch.setattr(recipe_scraper_module, "safe_scrape_html", mock_safe_scrape_html)
|
monkeypatch.setattr(recipe_scraper_module, "safe_scrape_html", mock_safe_scrape_html)
|
||||||
|
|
||||||
|
|||||||
@@ -14,11 +14,11 @@ from bs4 import BeautifulSoup
|
|||||||
from fastapi.testclient import TestClient
|
from fastapi.testclient import TestClient
|
||||||
from httpx import Response
|
from httpx import Response
|
||||||
from pytest import MonkeyPatch
|
from pytest import MonkeyPatch
|
||||||
from recipe_scrapers._abstract import AbstractScraper
|
|
||||||
from recipe_scrapers._schemaorg import SchemaOrg
|
from recipe_scrapers._schemaorg import SchemaOrg
|
||||||
from recipe_scrapers.plugins import SchemaOrgFillPlugin
|
from recipe_scrapers.plugins import SchemaOrgFillPlugin
|
||||||
from slugify import slugify
|
from slugify import slugify
|
||||||
|
|
||||||
|
import mealie.services.scraper.recipe_scraper as recipe_scraper_module
|
||||||
from mealie.db.models.recipe import RecipeModel
|
from mealie.db.models.recipe import RecipeModel
|
||||||
from mealie.pkgs.safehttp.transport import AsyncSafeTransport
|
from mealie.pkgs.safehttp.transport import AsyncSafeTransport
|
||||||
from mealie.schema.cookbook.cookbook import SaveCookBook
|
from mealie.schema.cookbook.cookbook import SaveCookBook
|
||||||
@@ -102,12 +102,12 @@ def test_create_by_url(
|
|||||||
monkeypatch: MonkeyPatch,
|
monkeypatch: MonkeyPatch,
|
||||||
):
|
):
|
||||||
for recipe_data in recipe_test_data:
|
for recipe_data in recipe_test_data:
|
||||||
# Override init function for AbstractScraper to use the test html instead of calling the url
|
# Prevent any real HTTP calls during scraping
|
||||||
monkeypatch.setattr(
|
async def mock_safe_scrape_html(url: str) -> str:
|
||||||
AbstractScraper,
|
return "<html></html>"
|
||||||
"__init__",
|
|
||||||
get_init(recipe_data.html_file),
|
monkeypatch.setattr(recipe_scraper_module, "safe_scrape_html", mock_safe_scrape_html)
|
||||||
)
|
|
||||||
# Override the get_html method of the RecipeScraperOpenGraph to return the test html
|
# Override the get_html method of the RecipeScraperOpenGraph to return the test html
|
||||||
for scraper_cls in DEFAULT_SCRAPER_STRATEGIES:
|
for scraper_cls in DEFAULT_SCRAPER_STRATEGIES:
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
|
|||||||
Reference in New Issue
Block a user