Mirror of https://github.com/mealie-recipes/mealie.git (synced 2025-10-31 10:13:32 -04:00)
	Feature/Capture Scraper Improvement PRs (#749)
* capture #732 changes
* capture PR #733
* capture PR #736
* capture pr #745

Co-authored-by: Hayden <hay-kot@pm.me>
		| @@ -4,7 +4,6 @@ from fastapi import Depends, File | ||||
| from fastapi.datastructures import UploadFile | ||||
| from fastapi.encoders import jsonable_encoder | ||||
| from fastapi.responses import JSONResponse | ||||
| from scrape_schema_recipe import scrape_url | ||||
| from sqlalchemy.orm.session import Session | ||||
| from starlette.responses import FileResponse | ||||
|  | ||||
| @@ -16,7 +15,7 @@ from mealie.routes.routers import UserAPIRouter | ||||
| from mealie.schema.recipe import CreateRecipeByURL, Recipe, RecipeImageTypes | ||||
| from mealie.schema.recipe.recipe import CreateRecipe, RecipeSummary | ||||
| from mealie.services.recipe.recipe_service import RecipeService | ||||
| from mealie.services.scraper.scraper import create_from_url | ||||
| from mealie.services.scraper.scraper import create_from_url, scrape_from_url | ||||
|  | ||||
| user_router = UserAPIRouter() | ||||
| logger = get_logger() | ||||
| @@ -44,8 +43,11 @@ def parse_recipe_url(url: CreateRecipeByURL, recipe_service: RecipeService = Dep | ||||
|  | ||||
| @user_router.post("/test-scrape-url") | ||||
| def test_parse_recipe_url(url: CreateRecipeByURL): | ||||
|     # TODO: Replace with more current implementation of testing schema | ||||
|     return scrape_url(url.url) | ||||
|     # Debugger should produce the same result as the scraper sees before cleaning | ||||
|     scraped_data = scrape_from_url(url.url) | ||||
|     if scraped_data: | ||||
|         return scraped_data.schema.data | ||||
|     return "recipe_scrapers was unable to scrape this URL" | ||||
|  | ||||
|  | ||||
| @user_router.post("/create-from-zip", status_code=201) | ||||
|   | ||||
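For reference, the reworked debug endpoint can be exercised with a plain HTTP client. Below is a minimal sketch; the `/api/recipes` prefix, port, and bearer-token auth are assumptions for illustration, and only the `/test-scrape-url` path and the `url` field are taken from the diff above.

```python
import requests

# Hypothetical base URL and token; the router prefix and auth scheme are
# assumptions, not shown in this diff.
BASE_URL = "http://localhost:9000/api/recipes"
TOKEN = "<access-token>"

resp = requests.post(
    f"{BASE_URL}/test-scrape-url",
    json={"url": "https://example.com/some-recipe"},
    headers={"Authorization": f"Bearer {TOKEN}"},
)

# On success the endpoint now returns the raw schema data the scraper saw
# (scraped_data.schema.data); otherwise a plain explanatory string.
print(resp.json())
```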
| @@ -44,10 +44,24 @@ def write_image(recipe_slug: str, file_data: bytes, extension: str) -> Path: | ||||
| def scrape_image(image_url: str, slug: str) -> Path: | ||||
|     logger.info(f"Image URL: {image_url}") | ||||
|     if isinstance(image_url, str):  # Handles String Types | ||||
|         image_url = image_url | ||||
|         pass | ||||
|  | ||||
|     if isinstance(image_url, list):  # Handles List Types | ||||
|         image_url = image_url[0] | ||||
|         # Multiple images have been defined in the schema - usually different resolutions | ||||
|         # Typically would be in smallest->biggest order, but can't be certain so test each. | ||||
|         # 'Google will pick the best image to display in Search results based on the aspect ratio and resolution.' | ||||
|  | ||||
|         all_image_requests = [] | ||||
|         for url in image_url: | ||||
|             try: | ||||
|                 r = requests.get(url, stream=True, headers={"User-Agent": ""}) | ||||
|             except Exception: | ||||
|                 logger.exception(f"Image {url} could not be requested") | ||||
|                 continue | ||||
|             if r.status_code == 200: | ||||
|                 all_image_requests.append((url, r)) | ||||
|  | ||||
|         image_url, _ = max(all_image_requests, key=lambda url_r: len(url_r[1].content), default=("", 0)) | ||||
|  | ||||
|     if isinstance(image_url, dict):  # Handles Dictionary Types | ||||
|         for key in image_url: | ||||
| @@ -70,6 +84,6 @@ def scrape_image(image_url: str, slug: str) -> Path: | ||||
|  | ||||
|         filename.unlink(missing_ok=True) | ||||
|  | ||||
|         return slug | ||||
|         return Path(slug) | ||||
|  | ||||
|     return None | ||||
|   | ||||
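The list-handling branch added above downloads every candidate image and keeps the one with the largest response body. The same idea as a standalone sketch (the empty User-Agent header mirrors the change above; `pick_largest_image` and `candidate_urls` are illustrative names, not part of the codebase):

```python
import requests


def pick_largest_image(candidate_urls: list[str]) -> str:
    """Fetch each candidate URL and return the one whose body is largest.

    Schemas often list the same image at several resolutions, and the ordering
    is not guaranteed, so every URL is downloaded and compared by size.
    """
    results = []
    for url in candidate_urls:
        try:
            r = requests.get(url, stream=True, headers={"User-Agent": ""})
        except requests.RequestException:
            continue  # unreachable candidates are skipped
        if r.status_code == 200:
            results.append((url, len(r.content)))

    # Fall back to an empty string when nothing could be fetched, matching the
    # default used in the diff above.
    best_url, _ = max(results, key=lambda pair: pair[1], default=("", 0))
    return best_url
```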
| @@ -2,7 +2,7 @@ import html | ||||
| import json | ||||
| import re | ||||
| from datetime import datetime, timedelta | ||||
| from typing import List | ||||
| from typing import List, Optional | ||||
|  | ||||
| from slugify import slugify | ||||
|  | ||||
| @@ -43,9 +43,13 @@ def clean_string(text: str) -> str: | ||||
|     if isinstance(text, list): | ||||
|         text = text[0] | ||||
|  | ||||
|     print(type(text)) | ||||
|  | ||||
|     if text == "" or text is None: | ||||
|         return "" | ||||
|  | ||||
|     print(text) | ||||
|  | ||||
|     cleaned_text = html.unescape(text) | ||||
|     cleaned_text = re.sub("<[^<]+?>", "", cleaned_text) | ||||
|     cleaned_text = re.sub(" +", " ", cleaned_text) | ||||
| @@ -67,6 +71,40 @@ def clean_html(raw_html): | ||||
|     return re.sub(cleanr, "", raw_html) | ||||
|  | ||||
|  | ||||
| def clean_nutrition(nutrition: Optional[dict]) -> dict[str, str]: | ||||
|     # Assumes that all units are supplied in grams, except sodium which may be in mg. | ||||
|  | ||||
|     # Fn only expects a dict[str,str]. Other structures should not be parsed. | ||||
|     if not isinstance(nutrition, dict): | ||||
|         return {} | ||||
|  | ||||
|     # Allow for commas as decimals (common in Europe) | ||||
|     # Compile once for efficiency | ||||
|     re_match_digits = re.compile(r"\d+([.,]\d+)?") | ||||
|  | ||||
|     output_nutrition = {} | ||||
|     for key, val in nutrition.items(): | ||||
|         # If the val contains digits matching the regex, add the first match to the output dict. | ||||
|         # Handle unexpected datastructures safely. | ||||
|         try: | ||||
|             if matched_digits := re_match_digits.search(val): | ||||
|                 output_nutrition[key] = matched_digits.group(0) | ||||
|         except Exception: | ||||
|             continue | ||||
|  | ||||
|     output_nutrition = {key: val.replace(",", ".") for key, val in output_nutrition.items()} | ||||
|  | ||||
|     if "sodiumContent" in nutrition and "m" not in nutrition["sodiumContent"] and "g" in nutrition["sodiumContent"]: | ||||
|         # Sodium is in grams. Parse its value, multiply by 1000 and convert back to a string. | ||||
|         try: | ||||
|             output_nutrition["sodiumContent"] = str(float(output_nutrition["sodiumContent"]) * 1000) | ||||
|         except ValueError: | ||||
|             # Could not parse sodium content as float, so don't touch it. | ||||
|             pass | ||||
|  | ||||
|     return output_nutrition | ||||
|  | ||||
|  | ||||
| def image(image=None) -> str: | ||||
|     if not image: | ||||
|         return "no image" | ||||
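To make the behaviour of `clean_nutrition` concrete, here is a short usage sketch; the expected values mirror the parametrized tests added later in this commit:

```python
# Values stay strings; sodium given in grams is converted to milligrams.
assert clean_nutrition(None) == {}
assert clean_nutrition({"calories": "105 kcal"}) == {"calories": "105"}
assert clean_nutrition({"sodiumContent": "5.1235g"}) == {"sodiumContent": "5123.5"}
assert clean_nutrition({"sodiumContent": "5mg"}) == {"sodiumContent": "5"}
```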
| @@ -167,9 +205,11 @@ def clean_time(time_entry): | ||||
|     elif isinstance(time_entry, datetime): | ||||
|         print(time_entry) | ||||
|     elif isinstance(time_entry, str): | ||||
|         if re.match("PT.*H.*M", time_entry): | ||||
|         try: | ||||
|             time_delta_object = parse_duration(time_entry) | ||||
|             return pretty_print_timedelta(time_delta_object) | ||||
|         except ValueError: | ||||
|             logger.error(f"Could not parse time_entry `{time_entry}`") | ||||
|     else: | ||||
|         return str(time_entry) | ||||
|  | ||||
| @@ -184,48 +224,34 @@ def parse_duration(iso_duration): | ||||
|     Returns: | ||||
|         a datetime.timedelta instance | ||||
|     """ | ||||
|     m = re.match(r"^P(?:(\d+)Y)?(?:(\d+)M)?(?:(\d+)D)?T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+(?:.\d+)?)S)?$", iso_duration) | ||||
|     m = re.match( | ||||
|         r"^P((\d+)Y)?((\d+)M)?((?P<days>\d+)D)?" | ||||
|         r"T((?P<hours>\d+)H)?((?P<minutes>\d+)M)?((?P<seconds>\d+(?:\.\d+)?)S)?$", | ||||
|         iso_duration, | ||||
|     ) | ||||
|     if m is None: | ||||
|         raise ValueError("invalid ISO 8601 duration string") | ||||
|  | ||||
|     days = 0 | ||||
|     hours = 0 | ||||
|     minutes = 0 | ||||
|     seconds = 0.0 | ||||
|  | ||||
|     # Years and months are not being utilized here, as there is not enough | ||||
|     # information provided to determine which year and which month. | ||||
|     # Python's time_delta class stores durations as days, seconds and | ||||
|     # microseconds internally, and therefore we'd have to | ||||
|     # convert parsed years and months to specific number of days. | ||||
|  | ||||
|     if m[3]: | ||||
|         days = int(m[3]) | ||||
|     if m[4]: | ||||
|         hours = int(m[4]) | ||||
|     if m[5]: | ||||
|         minutes = int(m[5]) | ||||
|     if m[6]: | ||||
|         seconds = float(m[6]) | ||||
|     times = {"days": 0, "hours": 0, "minutes": 0, "seconds": 0} | ||||
|     for unit, _ in times.items(): | ||||
|         if m.group(unit): | ||||
|             times[unit] = int(float(m.group(unit))) | ||||
|  | ||||
|     return timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds) | ||||
|     return timedelta(**times) | ||||
|  | ||||
|  | ||||
| def pretty_print_timedelta(t, max_components=None, max_decimal_places=2): | ||||
| def pretty_print_timedelta(t: timedelta, max_components=None, max_decimal_places=2): | ||||
|     """ | ||||
|     Print a pretty string for a timedelta. | ||||
|     For example datetime.timedelta(days=2, seconds=17280) will be printed as '2 days, 4 hours, 48 minutes'. Setting max_components to e.g. 1 will change this to '2.2 days', where the | ||||
|     For example datetime.timedelta(days=2, seconds=17280) will be printed as '2 days 4 Hours 48 Minutes'. Setting max_components to e.g. 1 will change this to '2.2 days', where the | ||||
|     number of decimal points can also be set. | ||||
|     """ | ||||
|     time_scales = [ | ||||
|         timedelta(days=365), | ||||
|         timedelta(days=1), | ||||
|         timedelta(hours=1), | ||||
|         timedelta(minutes=1), | ||||
|         timedelta(seconds=1), | ||||
|         timedelta(microseconds=1000), | ||||
|         timedelta(microseconds=1), | ||||
|     ] | ||||
|     time_scale_names_dict = { | ||||
|         timedelta(days=365): "year", | ||||
|         timedelta(days=1): "day", | ||||
| @@ -236,9 +262,8 @@ def pretty_print_timedelta(t, max_components=None, max_decimal_places=2): | ||||
|         timedelta(microseconds=1): "microsecond", | ||||
|     } | ||||
|     count = 0 | ||||
|     txt = "" | ||||
|     first = True | ||||
|     for scale in time_scales: | ||||
|     out_list = [] | ||||
|     for scale, scale_name in time_scale_names_dict.items(): | ||||
|         if t >= scale: | ||||
|             count += 1 | ||||
|             n = t / scale if count == max_components else int(t / scale) | ||||
| @@ -247,15 +272,9 @@ def pretty_print_timedelta(t, max_components=None, max_decimal_places=2): | ||||
|             n_txt = str(round(n, max_decimal_places)) | ||||
|             if n_txt[-2:] == ".0": | ||||
|                 n_txt = n_txt[:-2] | ||||
|             txt += "{}{} {}{}".format( | ||||
|                 "" if first else " ", | ||||
|                 n_txt, | ||||
|                 time_scale_names_dict[scale], | ||||
|                 "s" if n > 1 else "", | ||||
|             ) | ||||
|             if first: | ||||
|                 first = False | ||||
|  | ||||
|     if len(txt) == 0: | ||||
|         txt = "none" | ||||
|     return txt | ||||
|             out_list.append(f"{n_txt} {scale_name}{'s' if n > 1 else ''}") | ||||
|  | ||||
|     if out_list == []: | ||||
|         return "none" | ||||
|     return " ".join(out_list) | ||||
|   | ||||
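Taken together, the rewritten `parse_duration` and `pretty_print_timedelta` behave as in the sketch below; the values come from the tests added at the end of this commit, and fractional seconds are truncated because each matched group goes through `int(float(...))`:

```python
from datetime import timedelta

# parse_duration handles the ISO 8601 forms the scraper encounters,
# including a leading days component.
assert parse_duration("PT2H30M") == timedelta(hours=2, minutes=30)
assert parse_duration("P1DT1H1M1.53S") == timedelta(days=1, hours=1, minutes=1, seconds=1)

# pretty_print_timedelta renders a timedelta for display; clean_time wires
# the two functions together and logs instead of raising on bad input.
assert pretty_print_timedelta(timedelta(days=2, seconds=17280)) == "2 days 4 Hours 48 Minutes"
assert pretty_print_timedelta(timedelta(days=2, seconds=17280), max_components=1) == "2.2 days"
```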
| @@ -1,6 +1,5 @@ | ||||
| import json | ||||
| from enum import Enum | ||||
| from typing import Any, Callable | ||||
| from typing import Any, Callable, Optional | ||||
| from uuid import uuid4 | ||||
|  | ||||
| import requests | ||||
| @@ -8,17 +7,11 @@ from fastapi import HTTPException, status | ||||
| from recipe_scrapers import NoSchemaFoundInWildMode, SchemaScraperFactory, WebsiteNotImplementedError, scrape_me | ||||
| from slugify import slugify | ||||
|  | ||||
| from mealie.core.config import get_app_dirs | ||||
|  | ||||
| app_dirs = get_app_dirs() | ||||
| from mealie.core.root_logger import get_logger | ||||
| from mealie.schema.recipe import Recipe, RecipeStep | ||||
| from mealie.services.image.image import scrape_image | ||||
| from mealie.services.scraper import cleaner, open_graph | ||||
|  | ||||
| LAST_JSON = app_dirs.DEBUG_DIR.joinpath("last_recipe.json") | ||||
|  | ||||
|  | ||||
| logger = get_logger() | ||||
|  | ||||
|  | ||||
| @@ -32,7 +25,14 @@ def create_from_url(url: str) -> Recipe: | ||||
|     Returns: | ||||
|         Recipe: Recipe Object | ||||
|     """ | ||||
|     new_recipe = scrape_from_url(url) | ||||
|     # Try the different scrapers in order. | ||||
|     if scraped_data := scrape_from_url(url): | ||||
|         new_recipe = clean_scraper(scraped_data, url) | ||||
|     elif og_dict := extract_open_graph_values(url): | ||||
|         new_recipe = Recipe(**og_dict) | ||||
|     else: | ||||
|         raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.BAD_RECIPE_DATA.value}) | ||||
|  | ||||
|     logger.info(f"Image {new_recipe.image}") | ||||
|     new_recipe.image = download_image_for_recipe(new_recipe.slug, new_recipe.image) | ||||
|  | ||||
| @@ -49,16 +49,17 @@ class ParserErrors(str, Enum): | ||||
|     CONNECTION_ERROR = "CONNECTION_ERROR" | ||||
|  | ||||
|  | ||||
| def extract_open_graph_values(url) -> Recipe: | ||||
| def extract_open_graph_values(url) -> Optional[dict]: | ||||
|     r = requests.get(url) | ||||
|     recipe = open_graph.basic_recipe_from_opengraph(r.text, url) | ||||
|  | ||||
|     return Recipe(**recipe) | ||||
|     if recipe.get("name", "") == "": | ||||
|         return None | ||||
|     return recipe | ||||
|  | ||||
|  | ||||
| def scrape_from_url(url: str) -> Recipe: | ||||
|     """Entry function to generating are recipe obejct from a url | ||||
|     This will determine if a url can be parsed and raise an appropriate error keyword | ||||
| def scrape_from_url(url: str): | ||||
|     """Entry function to scrape a recipe from a url | ||||
|     This will determine if a url can be parsed and return None if not, to allow another parser to try. | ||||
|     This keyword is used on the frontend to reference a localized string to present on the UI. | ||||
|  | ||||
|     Args: | ||||
| @@ -68,7 +69,7 @@ def scrape_from_url(url: str) -> Recipe: | ||||
|         HTTPException: 400_BAD_REQUEST - See ParserErrors Class for Key Details | ||||
|  | ||||
|     Returns: | ||||
|         Recipe: Recipe Model | ||||
|         Optional[Scraped schema for cleaning] | ||||
|     """ | ||||
|     try: | ||||
|         scraped_schema = scrape_me(url) | ||||
| @@ -76,28 +77,26 @@ def scrape_from_url(url: str) -> Recipe: | ||||
|         try: | ||||
|             scraped_schema = scrape_me(url, wild_mode=True) | ||||
|         except (NoSchemaFoundInWildMode, AttributeError): | ||||
|             recipe = extract_open_graph_values(url) | ||||
|             if recipe.name != "": | ||||
|                 return recipe | ||||
|             raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.BAD_RECIPE_DATA.value}) | ||||
|             # Recipe_scraper was unable to extract a recipe. | ||||
|             return None | ||||
|  | ||||
|     except ConnectionError: | ||||
|         raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.CONNECTION_ERROR.value}) | ||||
|  | ||||
|     # Check to see if the recipe is valid | ||||
|     try: | ||||
|         ingredients = scraped_schema.ingredients() | ||||
|         instruct = scraped_schema.instructions() | ||||
|     except Exception: | ||||
|         ingredients = [] | ||||
|         instruct = [] | ||||
|  | ||||
|     try: | ||||
|         ing = scraped_schema.ingredients() | ||||
|     except Exception: | ||||
|         ing = [] | ||||
|     if instruct and ingredients: | ||||
|         return scraped_schema | ||||
|  | ||||
|     if not instruct and not ing: | ||||
|         raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.NO_RECIPE_DATA.value}) | ||||
|     else: | ||||
|         return clean_scraper(scraped_schema, url) | ||||
|     # recipe_scrapers did not get a valid recipe. | ||||
|     # Return None to let another scraper try. | ||||
|     return None | ||||
|  | ||||
|  | ||||
| def clean_scraper(scraped_data: SchemaScraperFactory.SchemaScraper, url: str) -> Recipe: | ||||
| @@ -135,17 +134,22 @@ def clean_scraper(scraped_data: SchemaScraperFactory.SchemaScraper, url: str) -> | ||||
|         except TypeError: | ||||
|             return [] | ||||
|  | ||||
|     cook_time = try_get_default(None, "performTime", None, cleaner.clean_time) or try_get_default( | ||||
|         None, "cookTime", None, cleaner.clean_time | ||||
|     ) | ||||
|  | ||||
|     return Recipe( | ||||
|         name=try_get_default(scraped_data.title, "name", "No Name Found", cleaner.clean_string), | ||||
|         slug="", | ||||
|         image=try_get_default(scraped_data.image, "image", None), | ||||
|         image=try_get_default(None, "image", None), | ||||
|         description=try_get_default(None, "description", "", cleaner.clean_string), | ||||
|         nutrition=try_get_default(None, "nutrition", None, cleaner.clean_nutrition), | ||||
|         recipe_yield=try_get_default(scraped_data.yields, "recipeYield", "1", cleaner.clean_string), | ||||
|         recipe_ingredient=try_get_default(scraped_data.ingredients, "recipeIngredient", [""], cleaner.ingredient), | ||||
|         recipe_instructions=get_instructions(), | ||||
|         total_time=try_get_default(None, "totalTime", None, cleaner.clean_time), | ||||
|         prep_time=try_get_default(None, "prepTime", None, cleaner.clean_time), | ||||
|         perform_time=try_get_default(None, "performTime", None, cleaner.clean_time), | ||||
|         perform_time=cook_time, | ||||
|         org_url=url, | ||||
|     ) | ||||
|  | ||||
| @@ -160,10 +164,3 @@ def download_image_for_recipe(slug, image_url) -> dict: | ||||
|         img_name = None | ||||
|  | ||||
|     return img_name or "no image" | ||||
|  | ||||
|  | ||||
| def dump_last_json(recipe_data: dict): | ||||
|     with open(LAST_JSON, "w") as f: | ||||
|         f.write(json.dumps(recipe_data, indent=4, default=str)) | ||||
|  | ||||
|     return | ||||
|   | ||||
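The net effect of the scraper changes is a straightforward fallback chain: recipe_scrapers first (strict, then wild mode), Open Graph metadata second, and a 400 only when both fail. A condensed sketch of that control flow, using only calls that appear in the diff above (`scrape_or_fallback` is an illustrative name, and connection-error handling is trimmed for brevity):

```python
from fastapi import HTTPException, status
from recipe_scrapers import NoSchemaFoundInWildMode, WebsiteNotImplementedError, scrape_me


def scrape_or_fallback(url: str):
    # 1. Strict scrape, then wild mode if the site has no dedicated scraper.
    scraped = None
    try:
        scraped = scrape_me(url)
    except WebsiteNotImplementedError:
        try:
            scraped = scrape_me(url, wild_mode=True)
        except (NoSchemaFoundInWildMode, AttributeError):
            scraped = None

    # 2. Accept the result only if it yields both ingredients and instructions.
    if scraped is not None:
        try:
            if scraped.ingredients() and scraped.instructions():
                return ("recipe_scrapers", scraped)
        except Exception:
            pass

    # 3. Otherwise fall back to Open Graph values, then give up with a 400.
    if og := extract_open_graph_values(url):  # helper defined in the diff above
        return ("open_graph", og)
    raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.BAD_RECIPE_DATA.value})
```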
							
								
								
									
765  poetry.lock  (generated)
File diff suppressed because it is too large
							| @@ -23,7 +23,6 @@ python-slugify = "^4.0.1" | ||||
| requests = "^2.25.1" | ||||
| PyYAML = "^5.3.1" | ||||
| extruct = "^0.12.0" | ||||
| scrape-schema-recipe = "^0.1.3" | ||||
| python-multipart = "^0.0.5" | ||||
| fastapi-camelcase = "^1.0.2" | ||||
| bcrypt = "^3.2.0" | ||||
| @@ -50,6 +49,7 @@ coverage = "^5.5" | ||||
| pydantic-to-typescript = "^1.0.7" | ||||
| rich = "^10.7.0" | ||||
| isort = "^5.9.3" | ||||
| regex = "2021.9.30" # TODO: Remove during Upgrade -> https://github.com/psf/black/issues/2524 | ||||
|  | ||||
| [build-system] | ||||
| requires = ["poetry-core>=1.0.0"] | ||||
|   | ||||
| @@ -1,5 +1,6 @@ | ||||
| import json | ||||
| import re | ||||
| from datetime import timedelta | ||||
|  | ||||
| import pytest | ||||
|  | ||||
| @@ -59,6 +60,24 @@ def test_clean_image(): | ||||
|     assert cleaner.image(["My URL!", "MY SECOND URL"]) == "My URL!" | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "nutrition,expected", | ||||
|     [ | ||||
|         (None, {}), | ||||
|         ({"calories": "105 kcal"}, {"calories": "105"}), | ||||
|         ({"calories": "105 kcal 104 sugar"}, {"calories": "105"}), | ||||
|         ({"calories": ""}, {}), | ||||
|         ({"calories": ["not just a string"], "sugarContent": "but still tries 555.321"}, {"sugarContent": "555.321"}), | ||||
|         ({"sodiumContent": "5.1235g"}, {"sodiumContent": "5123.5"}), | ||||
|         ({"sodiumContent": "5mg"}, {"sodiumContent": "5"}), | ||||
|         ({"sodiumContent": "10oz"}, {"sodiumContent": "10"}), | ||||
|         ({"sodiumContent": "10.1.2g"}, {"sodiumContent": "10100.0"}), | ||||
|     ], | ||||
| ) | ||||
| def test_clean_nutrition(nutrition, expected): | ||||
|     assert cleaner.clean_nutrition(nutrition) == expected | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "instructions", | ||||
|     [ | ||||
| @@ -90,9 +109,29 @@ def test_html_with_recipe_data(): | ||||
|     assert url_validation_regex.match(recipe_data["image"]) | ||||
|  | ||||
|  | ||||
| def test_time_cleaner(): | ||||
| @pytest.mark.parametrize( | ||||
|     "time_delta,expected", | ||||
|     [ | ||||
|         ("PT2H30M", "2 Hours 30 Minutes"), | ||||
|         ("PT30M", "30 Minutes"), | ||||
|         ("PT3H", "3 Hours"), | ||||
|         ("P1DT1H1M1S", "1 day 1 Hour 1 Minute 1 Second"), | ||||
|         ("P1DT1H1M1.53S", "1 day 1 Hour 1 Minute 1 Second"), | ||||
|         ("PT-3H", None), | ||||
|         ("PT", "none"), | ||||
|     ], | ||||
| ) | ||||
| def test_time_cleaner(time_delta, expected): | ||||
|     assert cleaner.clean_time(time_delta) == expected | ||||
|  | ||||
|     my_time_delta = "PT2H30M" | ||||
|     return_delta = cleaner.clean_time(my_time_delta) | ||||
|  | ||||
|     assert return_delta == "2 Hours 30 Minutes" | ||||
| @pytest.mark.parametrize( | ||||
|     "t,max_components,max_decimal_places,expected", | ||||
|     [ | ||||
|         (timedelta(days=2, seconds=17280), None, 2, "2 days 4 Hours 48 Minutes"), | ||||
|         (timedelta(days=2, seconds=17280), 1, 2, "2.2 days"), | ||||
|         (timedelta(days=365), None, 2, "1 year"), | ||||
|     ], | ||||
| ) | ||||
| def test_pretty_print_timedelta(t, max_components, max_decimal_places, expected): | ||||
|     assert cleaner.pretty_print_timedelta(t, max_components, max_decimal_places) == expected | ||||
|   | ||||