Mirror of https://github.com/mealie-recipes/mealie.git (synced 2025-10-31 10:13:32 -04:00)
	Feature/Capture Scraper Improvement PRs (#749)
* capture #732 changes
* capture PR #733
* capture PR #736
* capture pr #745

Co-authored-by: Hayden <hay-kot@pm.me>
		| @@ -4,7 +4,6 @@ from fastapi import Depends, File | ||||
| from fastapi.datastructures import UploadFile | ||||
| from fastapi.encoders import jsonable_encoder | ||||
| from fastapi.responses import JSONResponse | ||||
| from scrape_schema_recipe import scrape_url | ||||
| from sqlalchemy.orm.session import Session | ||||
| from starlette.responses import FileResponse | ||||
|  | ||||
| @@ -16,7 +15,7 @@ from mealie.routes.routers import UserAPIRouter | ||||
| from mealie.schema.recipe import CreateRecipeByURL, Recipe, RecipeImageTypes | ||||
| from mealie.schema.recipe.recipe import CreateRecipe, RecipeSummary | ||||
| from mealie.services.recipe.recipe_service import RecipeService | ||||
| from mealie.services.scraper.scraper import create_from_url | ||||
| from mealie.services.scraper.scraper import create_from_url, scrape_from_url | ||||
|  | ||||
| user_router = UserAPIRouter() | ||||
| logger = get_logger() | ||||
| @@ -44,8 +43,11 @@ def parse_recipe_url(url: CreateRecipeByURL, recipe_service: RecipeService = Dep | ||||
|  | ||||
| @user_router.post("/test-scrape-url") | ||||
| def test_parse_recipe_url(url: CreateRecipeByURL): | ||||
|     # TODO: Replace with more current implementation of testing schema | ||||
|     return scrape_url(url.url) | ||||
|     # Debugger should produce the same result as the scraper sees before cleaning | ||||
|     scraped_data = scrape_from_url(url.url) | ||||
|     if scraped_data: | ||||
|         return scraped_data.schema.data | ||||
|     return "recipe_scrapers was unable to scrape this URL" | ||||
|  | ||||
|  | ||||
| @user_router.post("/create-from-zip", status_code=201) | ||||
|   | ||||
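For reference, the reworked debug endpoint can be exercised with a plain HTTP client. Below is a minimal sketch; the `/api/recipes` prefix, port, and bearer-token auth are assumptions for illustration, and only the `/test-scrape-url` path and the `url` field are taken from the diff above.

```python
import requests

# Hypothetical base URL and token; the router prefix and auth scheme are
# assumptions, not shown in this diff.
BASE_URL = "http://localhost:9000/api/recipes"
TOKEN = "<access-token>"

resp = requests.post(
    f"{BASE_URL}/test-scrape-url",
    json={"url": "https://example.com/some-recipe"},
    headers={"Authorization": f"Bearer {TOKEN}"},
)

# On success the endpoint now returns the raw schema data the scraper saw
# (scraped_data.schema.data); otherwise a plain explanatory string.
print(resp.json())
```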
| @@ -44,10 +44,24 @@ def write_image(recipe_slug: str, file_data: bytes, extension: str) -> Path: | ||||
| def scrape_image(image_url: str, slug: str) -> Path: | ||||
|     logger.info(f"Image URL: {image_url}") | ||||
|     if isinstance(image_url, str):  # Handles String Types | ||||
|         image_url = image_url | ||||
|         pass | ||||
|  | ||||
|     if isinstance(image_url, list):  # Handles List Types | ||||
|         image_url = image_url[0] | ||||
|         # Multiple images have been defined in the schema - usually different resolutions | ||||
|         # Typically would be in smallest->biggest order, but can't be certain so test each. | ||||
|         # 'Google will pick the best image to display in Search results based on the aspect ratio and resolution.' | ||||
|  | ||||
|         all_image_requests = [] | ||||
|         for url in image_url: | ||||
|             try: | ||||
|                 r = requests.get(url, stream=True, headers={"User-Agent": ""}) | ||||
|             except Exception: | ||||
|                 logger.exception(f"Image {url} could not be requested") | ||||
|                 continue | ||||
|             if r.status_code == 200: | ||||
|                 all_image_requests.append((url, r)) | ||||
|  | ||||
|         image_url, _ = max(all_image_requests, key=lambda url_r: len(url_r[1].content), default=("", 0)) | ||||
|  | ||||
|     if isinstance(image_url, dict):  # Handles Dictionary Types | ||||
|         for key in image_url: | ||||
| @@ -70,6 +84,6 @@ def scrape_image(image_url: str, slug: str) -> Path: | ||||
|  | ||||
|         filename.unlink(missing_ok=True) | ||||
|  | ||||
|         return slug | ||||
|         return Path(slug) | ||||
|  | ||||
|     return None | ||||
|   | ||||
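The list-handling branch added above downloads every candidate image and keeps the one with the largest response body. The same idea as a standalone sketch (the empty User-Agent header mirrors the change above; `pick_largest_image` and `candidate_urls` are illustrative names, not part of the codebase):

```python
import requests


def pick_largest_image(candidate_urls: list[str]) -> str:
    """Fetch each candidate URL and return the one whose body is largest.

    Schemas often list the same image at several resolutions, and the ordering
    is not guaranteed, so every URL is downloaded and compared by size.
    """
    results = []
    for url in candidate_urls:
        try:
            r = requests.get(url, stream=True, headers={"User-Agent": ""})
        except requests.RequestException:
            continue  # unreachable candidates are skipped
        if r.status_code == 200:
            results.append((url, len(r.content)))

    # Fall back to an empty string when nothing could be fetched, matching the
    # default used in the diff above.
    best_url, _ = max(results, key=lambda pair: pair[1], default=("", 0))
    return best_url
```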
| @@ -2,7 +2,7 @@ import html | ||||
| import json | ||||
| import re | ||||
| from datetime import datetime, timedelta | ||||
| from typing import List | ||||
| from typing import List, Optional | ||||
|  | ||||
| from slugify import slugify | ||||
|  | ||||
| @@ -43,9 +43,13 @@ def clean_string(text: str) -> str: | ||||
|     if isinstance(text, list): | ||||
|         text = text[0] | ||||
|  | ||||
|     print(type(text)) | ||||
|  | ||||
|     if text == "" or text is None: | ||||
|         return "" | ||||
|  | ||||
|     print(text) | ||||
|  | ||||
|     cleaned_text = html.unescape(text) | ||||
|     cleaned_text = re.sub("<[^<]+?>", "", cleaned_text) | ||||
|     cleaned_text = re.sub(" +", " ", cleaned_text) | ||||
| @@ -67,6 +71,40 @@ def clean_html(raw_html): | ||||
|     return re.sub(cleanr, "", raw_html) | ||||
|  | ||||
|  | ||||
| def clean_nutrition(nutrition: Optional[dict]) -> dict[str, str]: | ||||
|     # Assumes that all units are supplied in grams, except sodium which may be in mg. | ||||
|  | ||||
|     # Fn only expects a dict[str,str]. Other structures should not be parsed. | ||||
|     if not isinstance(nutrition, dict): | ||||
|         return {} | ||||
|  | ||||
|     # Allow for commas as decimals (common in Europe) | ||||
|     # Compile once for efficiency | ||||
|     re_match_digits = re.compile(r"\d+([.,]\d+)?") | ||||
|  | ||||
|     output_nutrition = {} | ||||
|     for key, val in nutrition.items(): | ||||
|         # If the val contains digits matching the regex, add the first match to the output dict. | ||||
|         # Handle unexpected datastructures safely. | ||||
|         try: | ||||
|             if matched_digits := re_match_digits.search(val): | ||||
|                 output_nutrition[key] = matched_digits.group(0) | ||||
|         except Exception: | ||||
|             continue | ||||
|  | ||||
|     output_nutrition = {key: val.replace(",", ".") for key, val in output_nutrition.items()} | ||||
|  | ||||
|     if "sodiumContent" in nutrition and "m" not in nutrition["sodiumContent"] and "g" in nutrition["sodiumContent"]: | ||||
|         # Sodium is in grams. Parse its value, multiply by 1000 and convert back to a string. | ||||
|         try: | ||||
|             output_nutrition["sodiumContent"] = str(float(output_nutrition["sodiumContent"]) * 1000) | ||||
|         except ValueError: | ||||
|             # Could not parse sodium content as float, so don't touch it. | ||||
|             pass | ||||
|  | ||||
|     return output_nutrition | ||||
|  | ||||
|  | ||||
| def image(image=None) -> str: | ||||
|     if not image: | ||||
|         return "no image" | ||||
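To make the behaviour of `clean_nutrition` concrete, here is a short usage sketch; the expected values mirror the parametrized tests added later in this commit:

```python
# Values stay strings; sodium given in grams is converted to milligrams.
assert clean_nutrition(None) == {}
assert clean_nutrition({"calories": "105 kcal"}) == {"calories": "105"}
assert clean_nutrition({"sodiumContent": "5.1235g"}) == {"sodiumContent": "5123.5"}
assert clean_nutrition({"sodiumContent": "5mg"}) == {"sodiumContent": "5"}
```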
| @@ -167,9 +205,11 @@ def clean_time(time_entry): | ||||
|     elif isinstance(time_entry, datetime): | ||||
|         print(time_entry) | ||||
|     elif isinstance(time_entry, str): | ||||
|         if re.match("PT.*H.*M", time_entry): | ||||
|         try: | ||||
|             time_delta_object = parse_duration(time_entry) | ||||
|             return pretty_print_timedelta(time_delta_object) | ||||
|         except ValueError: | ||||
|             logger.error(f"Could not parse time_entry `{time_entry}`") | ||||
|     else: | ||||
|         return str(time_entry) | ||||
|  | ||||
| @@ -184,48 +224,34 @@ def parse_duration(iso_duration): | ||||
|     Returns: | ||||
|         a datetime.timedelta instance | ||||
|     """ | ||||
|     m = re.match(r"^P(?:(\d+)Y)?(?:(\d+)M)?(?:(\d+)D)?T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+(?:.\d+)?)S)?$", iso_duration) | ||||
|     m = re.match( | ||||
|         r"^P((\d+)Y)?((\d+)M)?((?P<days>\d+)D)?" | ||||
|         r"T((?P<hours>\d+)H)?((?P<minutes>\d+)M)?((?P<seconds>\d+(?:\.\d+)?)S)?$", | ||||
|         iso_duration, | ||||
|     ) | ||||
|     if m is None: | ||||
|         raise ValueError("invalid ISO 8601 duration string") | ||||
|  | ||||
|     days = 0 | ||||
|     hours = 0 | ||||
|     minutes = 0 | ||||
|     seconds = 0.0 | ||||
|  | ||||
|     # Years and months are not being utilized here, as there is not enough | ||||
|     # information provided to determine which year and which month. | ||||
|     # Python's time_delta class stores durations as days, seconds and | ||||
|     # microseconds internally, and therefore we'd have to | ||||
|     # convert parsed years and months to specific number of days. | ||||
|  | ||||
|     if m[3]: | ||||
|         days = int(m[3]) | ||||
|     if m[4]: | ||||
|         hours = int(m[4]) | ||||
|     if m[5]: | ||||
|         minutes = int(m[5]) | ||||
|     if m[6]: | ||||
|         seconds = float(m[6]) | ||||
|     times = {"days": 0, "hours": 0, "minutes": 0, "seconds": 0} | ||||
|     for unit, _ in times.items(): | ||||
|         if m.group(unit): | ||||
|             times[unit] = int(float(m.group(unit))) | ||||
|  | ||||
|     return timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds) | ||||
|     return timedelta(**times) | ||||
|  | ||||
|  | ||||
| def pretty_print_timedelta(t, max_components=None, max_decimal_places=2): | ||||
| def pretty_print_timedelta(t: timedelta, max_components=None, max_decimal_places=2): | ||||
|     """ | ||||
|     Print a pretty string for a timedelta. | ||||
|     For example datetime.timedelta(days=2, seconds=17280) will be printed as '2 days, 4 hours, 48 minutes'. Setting max_components to e.g. 1 will change this to '2.2 days', where the | ||||
|     For example datetime.timedelta(days=2, seconds=17280) will be printed as '2 days 4 Hours 48 Minutes'. Setting max_components to e.g. 1 will change this to '2.2 days', where the | ||||
|     number of decimal points can also be set. | ||||
|     """ | ||||
|     time_scales = [ | ||||
|         timedelta(days=365), | ||||
|         timedelta(days=1), | ||||
|         timedelta(hours=1), | ||||
|         timedelta(minutes=1), | ||||
|         timedelta(seconds=1), | ||||
|         timedelta(microseconds=1000), | ||||
|         timedelta(microseconds=1), | ||||
|     ] | ||||
|     time_scale_names_dict = { | ||||
|         timedelta(days=365): "year", | ||||
|         timedelta(days=1): "day", | ||||
| @@ -236,9 +262,8 @@ def pretty_print_timedelta(t, max_components=None, max_decimal_places=2): | ||||
|         timedelta(microseconds=1): "microsecond", | ||||
|     } | ||||
|     count = 0 | ||||
|     txt = "" | ||||
|     first = True | ||||
|     for scale in time_scales: | ||||
|     out_list = [] | ||||
|     for scale, scale_name in time_scale_names_dict.items(): | ||||
|         if t >= scale: | ||||
|             count += 1 | ||||
|             n = t / scale if count == max_components else int(t / scale) | ||||
| @@ -247,15 +272,9 @@ def pretty_print_timedelta(t, max_components=None, max_decimal_places=2): | ||||
|             n_txt = str(round(n, max_decimal_places)) | ||||
|             if n_txt[-2:] == ".0": | ||||
|                 n_txt = n_txt[:-2] | ||||
|             txt += "{}{} {}{}".format( | ||||
|                 "" if first else " ", | ||||
|                 n_txt, | ||||
|                 time_scale_names_dict[scale], | ||||
|                 "s" if n > 1 else "", | ||||
|             ) | ||||
|             if first: | ||||
|                 first = False | ||||
|  | ||||
|     if len(txt) == 0: | ||||
|         txt = "none" | ||||
|     return txt | ||||
|             out_list.append(f"{n_txt} {scale_name}{'s' if n > 1 else ''}") | ||||
|  | ||||
|     if out_list == []: | ||||
|         return "none" | ||||
|     return " ".join(out_list) | ||||
|   | ||||
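Taken together, the rewritten `parse_duration` and `pretty_print_timedelta` behave as in the sketch below; the values come from the tests added at the end of this commit, and fractional seconds are truncated because each matched group goes through `int(float(...))`:

```python
from datetime import timedelta

# parse_duration handles the ISO 8601 forms the scraper encounters,
# including a leading days component.
assert parse_duration("PT2H30M") == timedelta(hours=2, minutes=30)
assert parse_duration("P1DT1H1M1.53S") == timedelta(days=1, hours=1, minutes=1, seconds=1)

# pretty_print_timedelta renders a timedelta for display; clean_time wires
# the two functions together and logs instead of raising on bad input.
assert pretty_print_timedelta(timedelta(days=2, seconds=17280)) == "2 days 4 Hours 48 Minutes"
assert pretty_print_timedelta(timedelta(days=2, seconds=17280), max_components=1) == "2.2 days"
```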
| @@ -1,6 +1,5 @@ | ||||
| import json | ||||
| from enum import Enum | ||||
| from typing import Any, Callable | ||||
| from typing import Any, Callable, Optional | ||||
| from uuid import uuid4 | ||||
|  | ||||
| import requests | ||||
| @@ -8,17 +7,11 @@ from fastapi import HTTPException, status | ||||
| from recipe_scrapers import NoSchemaFoundInWildMode, SchemaScraperFactory, WebsiteNotImplementedError, scrape_me | ||||
| from slugify import slugify | ||||
|  | ||||
| from mealie.core.config import get_app_dirs | ||||
|  | ||||
| app_dirs = get_app_dirs() | ||||
| from mealie.core.root_logger import get_logger | ||||
| from mealie.schema.recipe import Recipe, RecipeStep | ||||
| from mealie.services.image.image import scrape_image | ||||
| from mealie.services.scraper import cleaner, open_graph | ||||
|  | ||||
| LAST_JSON = app_dirs.DEBUG_DIR.joinpath("last_recipe.json") | ||||
|  | ||||
|  | ||||
| logger = get_logger() | ||||
|  | ||||
|  | ||||
| @@ -32,7 +25,14 @@ def create_from_url(url: str) -> Recipe: | ||||
|     Returns: | ||||
|         Recipe: Recipe Object | ||||
|     """ | ||||
|     new_recipe = scrape_from_url(url) | ||||
|     # Try the different scrapers in order. | ||||
|     if scraped_data := scrape_from_url(url): | ||||
|         new_recipe = clean_scraper(scraped_data, url) | ||||
|     elif og_dict := extract_open_graph_values(url): | ||||
|         new_recipe = Recipe(**og_dict) | ||||
|     else: | ||||
|         raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.BAD_RECIPE_DATA.value}) | ||||
|  | ||||
|     logger.info(f"Image {new_recipe.image}") | ||||
|     new_recipe.image = download_image_for_recipe(new_recipe.slug, new_recipe.image) | ||||
|  | ||||
| @@ -49,16 +49,17 @@ class ParserErrors(str, Enum): | ||||
|     CONNECTION_ERROR = "CONNECTION_ERROR" | ||||
|  | ||||
|  | ||||
| def extract_open_graph_values(url) -> Recipe: | ||||
| def extract_open_graph_values(url) -> Optional[dict]: | ||||
|     r = requests.get(url) | ||||
|     recipe = open_graph.basic_recipe_from_opengraph(r.text, url) | ||||
|  | ||||
|     return Recipe(**recipe) | ||||
|     if recipe.get("name", "") == "": | ||||
|         return None | ||||
|     return recipe | ||||
|  | ||||
|  | ||||
| def scrape_from_url(url: str) -> Recipe: | ||||
|     """Entry function to generating are recipe obejct from a url | ||||
|     This will determine if a url can be parsed and raise an appropriate error keyword | ||||
| def scrape_from_url(url: str): | ||||
|     """Entry function to scrape a recipe from a url | ||||
|     This will determine if a url can be parsed and return None if not, to allow another parser to try. | ||||
|     This keyword is used on the frontend to reference a localized string to present on the UI. | ||||
|  | ||||
|     Args: | ||||
| @@ -68,7 +69,7 @@ def scrape_from_url(url: str) -> Recipe: | ||||
|         HTTPException: 400_BAD_REQUEST - See ParserErrors Class for Key Details | ||||
|  | ||||
|     Returns: | ||||
|         Recipe: Recipe Model | ||||
|         Optional[Scraped schema for cleaning] | ||||
|     """ | ||||
|     try: | ||||
|         scraped_schema = scrape_me(url) | ||||
| @@ -76,28 +77,26 @@ def scrape_from_url(url: str) -> Recipe: | ||||
|         try: | ||||
|             scraped_schema = scrape_me(url, wild_mode=True) | ||||
|         except (NoSchemaFoundInWildMode, AttributeError): | ||||
|             recipe = extract_open_graph_values(url) | ||||
|             if recipe.name != "": | ||||
|                 return recipe | ||||
|             raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.BAD_RECIPE_DATA.value}) | ||||
|             # Recipe_scraper was unable to extract a recipe. | ||||
|             return None | ||||
|  | ||||
|     except ConnectionError: | ||||
|         raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.CONNECTION_ERROR.value}) | ||||
|  | ||||
|     # Check to see if the recipe is valid | ||||
|     try: | ||||
|         ingredients = scraped_schema.ingredients() | ||||
|         instruct = scraped_schema.instructions() | ||||
|     except Exception: | ||||
|         ingredients = [] | ||||
|         instruct = [] | ||||
|  | ||||
|     try: | ||||
|         ing = scraped_schema.ingredients() | ||||
|     except Exception: | ||||
|         ing = [] | ||||
|     if instruct and ingredients: | ||||
|         return scraped_schema | ||||
|  | ||||
|     if not instruct and not ing: | ||||
|         raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.NO_RECIPE_DATA.value}) | ||||
|     else: | ||||
|         return clean_scraper(scraped_schema, url) | ||||
|     # recipe_scrapers did not get a valid recipe. | ||||
|     # Return None to let another scraper try. | ||||
|     return None | ||||
|  | ||||
|  | ||||
| def clean_scraper(scraped_data: SchemaScraperFactory.SchemaScraper, url: str) -> Recipe: | ||||
| @@ -135,17 +134,22 @@ def clean_scraper(scraped_data: SchemaScraperFactory.SchemaScraper, url: str) -> | ||||
|         except TypeError: | ||||
|             return [] | ||||
|  | ||||
|     cook_time = try_get_default(None, "performTime", None, cleaner.clean_time) or try_get_default( | ||||
|         None, "cookTime", None, cleaner.clean_time | ||||
|     ) | ||||
|  | ||||
|     return Recipe( | ||||
|         name=try_get_default(scraped_data.title, "name", "No Name Found", cleaner.clean_string), | ||||
|         slug="", | ||||
|         image=try_get_default(scraped_data.image, "image", None), | ||||
|         image=try_get_default(None, "image", None), | ||||
|         description=try_get_default(None, "description", "", cleaner.clean_string), | ||||
|         nutrition=try_get_default(None, "nutrition", None, cleaner.clean_nutrition), | ||||
|         recipe_yield=try_get_default(scraped_data.yields, "recipeYield", "1", cleaner.clean_string), | ||||
|         recipe_ingredient=try_get_default(scraped_data.ingredients, "recipeIngredient", [""], cleaner.ingredient), | ||||
|         recipe_instructions=get_instructions(), | ||||
|         total_time=try_get_default(None, "totalTime", None, cleaner.clean_time), | ||||
|         prep_time=try_get_default(None, "prepTime", None, cleaner.clean_time), | ||||
|         perform_time=try_get_default(None, "performTime", None, cleaner.clean_time), | ||||
|         perform_time=cook_time, | ||||
|         org_url=url, | ||||
|     ) | ||||
|  | ||||
| @@ -160,10 +164,3 @@ def download_image_for_recipe(slug, image_url) -> dict: | ||||
|         img_name = None | ||||
|  | ||||
|     return img_name or "no image" | ||||
|  | ||||
|  | ||||
| def dump_last_json(recipe_data: dict): | ||||
|     with open(LAST_JSON, "w") as f: | ||||
|         f.write(json.dumps(recipe_data, indent=4, default=str)) | ||||
|  | ||||
|     return | ||||
|   | ||||
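The net effect of the scraper changes is a straightforward fallback chain: recipe_scrapers first (strict, then wild mode), Open Graph metadata second, and a 400 only when both fail. A condensed sketch of that control flow, using only calls that appear in the diff above (`scrape_or_fallback` is an illustrative name, and connection-error handling is trimmed for brevity):

```python
from fastapi import HTTPException, status
from recipe_scrapers import NoSchemaFoundInWildMode, WebsiteNotImplementedError, scrape_me


def scrape_or_fallback(url: str):
    # 1. Strict scrape, then wild mode if the site has no dedicated scraper.
    scraped = None
    try:
        scraped = scrape_me(url)
    except WebsiteNotImplementedError:
        try:
            scraped = scrape_me(url, wild_mode=True)
        except (NoSchemaFoundInWildMode, AttributeError):
            scraped = None

    # 2. Accept the result only if it yields both ingredients and instructions.
    if scraped is not None:
        try:
            if scraped.ingredients() and scraped.instructions():
                return ("recipe_scrapers", scraped)
        except Exception:
            pass

    # 3. Otherwise fall back to Open Graph values, then give up with a 400.
    if og := extract_open_graph_values(url):  # helper defined in the diff above
        return ("open_graph", og)
    raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.BAD_RECIPE_DATA.value})
```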
							
								
								
									
765  poetry.lock  (generated)
File diff suppressed because it is too large
							| @@ -23,7 +23,6 @@ python-slugify = "^4.0.1" | ||||
| requests = "^2.25.1" | ||||
| PyYAML = "^5.3.1" | ||||
| extruct = "^0.12.0" | ||||
| scrape-schema-recipe = "^0.1.3" | ||||
| python-multipart = "^0.0.5" | ||||
| fastapi-camelcase = "^1.0.2" | ||||
| bcrypt = "^3.2.0" | ||||
| @@ -50,6 +49,7 @@ coverage = "^5.5" | ||||
| pydantic-to-typescript = "^1.0.7" | ||||
| rich = "^10.7.0" | ||||
| isort = "^5.9.3" | ||||
| regex = "2021.9.30" # TODO: Remove during Upgrade -> https://github.com/psf/black/issues/2524 | ||||
|  | ||||
| [build-system] | ||||
| requires = ["poetry-core>=1.0.0"] | ||||
|   | ||||
| @@ -1,5 +1,6 @@ | ||||
| import json | ||||
| import re | ||||
| from datetime import timedelta | ||||
|  | ||||
| import pytest | ||||
|  | ||||
| @@ -59,6 +60,24 @@ def test_clean_image(): | ||||
|     assert cleaner.image(["My URL!", "MY SECOND URL"]) == "My URL!" | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "nutrition,expected", | ||||
|     [ | ||||
|         (None, {}), | ||||
|         ({"calories": "105 kcal"}, {"calories": "105"}), | ||||
|         ({"calories": "105 kcal 104 sugar"}, {"calories": "105"}), | ||||
|         ({"calories": ""}, {}), | ||||
|         ({"calories": ["not just a string"], "sugarContent": "but still tries 555.321"}, {"sugarContent": "555.321"}), | ||||
|         ({"sodiumContent": "5.1235g"}, {"sodiumContent": "5123.5"}), | ||||
|         ({"sodiumContent": "5mg"}, {"sodiumContent": "5"}), | ||||
|         ({"sodiumContent": "10oz"}, {"sodiumContent": "10"}), | ||||
|         ({"sodiumContent": "10.1.2g"}, {"sodiumContent": "10100.0"}), | ||||
|     ], | ||||
| ) | ||||
| def test_clean_nutrition(nutrition, expected): | ||||
|     assert cleaner.clean_nutrition(nutrition) == expected | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "instructions", | ||||
|     [ | ||||
| @@ -90,9 +109,29 @@ def test_html_with_recipe_data(): | ||||
|     assert url_validation_regex.match(recipe_data["image"]) | ||||
|  | ||||
|  | ||||
| def test_time_cleaner(): | ||||
| @pytest.mark.parametrize( | ||||
|     "time_delta,expected", | ||||
|     [ | ||||
|         ("PT2H30M", "2 Hours 30 Minutes"), | ||||
|         ("PT30M", "30 Minutes"), | ||||
|         ("PT3H", "3 Hours"), | ||||
|         ("P1DT1H1M1S", "1 day 1 Hour 1 Minute 1 Second"), | ||||
|         ("P1DT1H1M1.53S", "1 day 1 Hour 1 Minute 1 Second"), | ||||
|         ("PT-3H", None), | ||||
|         ("PT", "none"), | ||||
|     ], | ||||
| ) | ||||
| def test_time_cleaner(time_delta, expected): | ||||
|     assert cleaner.clean_time(time_delta) == expected | ||||
|  | ||||
|     my_time_delta = "PT2H30M" | ||||
|     return_delta = cleaner.clean_time(my_time_delta) | ||||
|  | ||||
|     assert return_delta == "2 Hours 30 Minutes" | ||||
| @pytest.mark.parametrize( | ||||
|     "t,max_components,max_decimal_places,expected", | ||||
|     [ | ||||
|         (timedelta(days=2, seconds=17280), None, 2, "2 days 4 Hours 48 Minutes"), | ||||
|         (timedelta(days=2, seconds=17280), 1, 2, "2.2 days"), | ||||
|         (timedelta(days=365), None, 2, "1 year"), | ||||
|     ], | ||||
| ) | ||||
| def test_pretty_print_timedelta(t, max_components, max_decimal_places, expected): | ||||
|     assert cleaner.pretty_print_timedelta(t, max_components, max_decimal_places) == expected | ||||
|   | ||||