	feat: Improve Recipe Imports with Cleaner (#4517)
This commit routes every recipe import path (web scrapers, migrations, and OpenAI image imports) through the shared cleaner, changes the cleaner to return a validated Recipe model instead of a dict, and hardens the scraper loop so one failing strategy no longer aborts the whole import.
@@ -268,6 +268,5 @@ class BaseMigrator(BaseService):
         with contextlib.suppress(KeyError):
             del recipe_dict["id"]
 
-        recipe_dict = cleaner.clean(recipe_dict, self.translator, url=recipe_dict.get("org_url", None))
-
-        return Recipe(**recipe_dict)
+        recipe = cleaner.clean(recipe_dict, self.translator, url=recipe_dict.get("org_url", None))
+        return recipe

@@ -32,6 +32,7 @@ from mealie.schema.user.user import PrivateUser, UserRatingCreate
 from mealie.services._base_service import BaseService
 from mealie.services.openai import OpenAIDataInjection, OpenAILocalImage, OpenAIService
 from mealie.services.recipe.recipe_data_service import RecipeDataService
+from mealie.services.scraper import cleaner
 
 from .template_service import TemplateService
 
@@ -297,6 +298,7 @@ class RecipeService(RecipeServiceBase):
             recipe_data = await openai_recipe_service.build_recipe_from_images(
                 local_images, translate_language=translate_language
             )
+            recipe_data = cleaner.clean(recipe_data, self.translator)
 
             recipe = self.create_one(recipe_data)
             data_service = RecipeDataService(recipe.id)

@@ -11,6 +11,7 @@ from slugify import slugify
 
 from mealie.core.root_logger import get_logger
 from mealie.lang.providers import Translator
+from mealie.schema.recipe.recipe import Recipe
 
 logger = get_logger("recipe-scraper")
 
@@ -33,16 +34,23 @@ MATCH_ERRONEOUS_WHITE_SPACE = re.compile(r"\n\s*\n")
 """ Matches multiple new lines and removes erroneous white space """
 
 
-def clean(recipe_data: dict, translator: Translator, url=None) -> dict:
+def clean(recipe_data: Recipe | dict, translator: Translator, url=None) -> Recipe:
     """Main entrypoint to clean a recipe extracted from the web
     and format the data into an accectable format for the database
 
     Args:
-        recipe_data (dict): raw recipe dicitonary
+        recipe_data (dict): raw recipe or recipe dictionary
 
     Returns:
         dict: cleaned recipe dictionary
     """
+    if not isinstance(recipe_data, dict):
+        # format the recipe like a scraped dictionary
+        recipe_data_dict = recipe_data.model_dump(by_alias=True)
+        recipe_data_dict["recipeIngredient"] = [ing.display for ing in recipe_data.recipe_ingredient]
+
+        recipe_data = recipe_data_dict
+
     recipe_data["description"] = clean_string(recipe_data.get("description", ""))
 
     # Times
@@ -59,7 +67,7 @@ def clean(recipe_data: dict, translator: Translator, url=None) -> dict:
     recipe_data["notes"] = clean_notes(recipe_data.get("notes"))
     recipe_data["rating"] = clean_int(recipe_data.get("rating"))
 
-    return recipe_data
+    return Recipe(**recipe_data)
 
 
 def clean_string(text: str | list | int) -> str:

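The two hunks above are the heart of the change: clean() now accepts either a validated Recipe model or a raw scraped dict, normalizes both to a plain dict for the key-based cleanup helpers, and returns a validated Recipe instead of a dict. A minimal self-contained sketch of that shape, using a hypothetical stand-in for Mealie's much larger Recipe schema:

# Sketch only: this Recipe is a stand-in, not Mealie's actual schema.
from pydantic import BaseModel, ConfigDict, Field


class Recipe(BaseModel):
    model_config = ConfigDict(populate_by_name=True)

    name: str = ""
    description: str | None = None
    recipe_ingredient: list[str] = Field(default_factory=list, alias="recipeIngredient")


def clean(recipe_data: Recipe | dict) -> Recipe:
    if not isinstance(recipe_data, dict):
        # round-trip the model through its aliased dict form so the same
        # key-based cleanup code below handles both input types
        recipe_data = recipe_data.model_dump(by_alias=True)

    recipe_data["description"] = (recipe_data.get("description") or "").strip()
    return Recipe(**recipe_data)


print(clean({"description": "  Hearty soup.  ", "recipeIngredient": ["1 onion"]}))
print(clean(Recipe(name="Bread")))

Round-tripping through model_dump(by_alias=True) keeps a single code path for both input types, at the cost of re-validating data that already arrived as a model.
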
@@ -1,5 +1,7 @@
+from mealie.core.root_logger import get_logger
 from mealie.lang.providers import Translator
 from mealie.schema.recipe.recipe import Recipe
+from mealie.services.scraper import cleaner
 from mealie.services.scraper.scraped_extras import ScrapedExtras
 
 from .scraper_strategies import (
@@ -31,6 +33,7 @@ class RecipeScraper:
 
         self.scrapers = scrapers
         self.translator = translator
+        self.logger = get_logger()
 
     async def scrape(self, url: str, html: str | None = None) -> tuple[Recipe, ScrapedExtras] | tuple[None, None]:
         """
@@ -41,9 +44,23 @@ class RecipeScraper:
         raw_html = html or await safe_scrape_html(url)
         for scraper_type in self.scrapers:
             scraper = scraper_type(url, self.translator, raw_html=raw_html)
-            result = await scraper.parse()
 
-            if result is not None:
-                return result
+            try:
+                result = await scraper.parse()
+            except Exception:
+                self.logger.exception(f"Failed to scrape HTML with {scraper.__class__.__name__}")
+                result = None
+
+            if result is None or result[0] is None:
+                continue
+
+            recipe_result, extras = result
+            try:
+                recipe = cleaner.clean(recipe_result, self.translator)
+            except Exception:
+                self.logger.exception(f"Failed to clean recipe data from {scraper.__class__.__name__}")
+                continue
+
+            return recipe, extras
 
         return None, None

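The rewritten scrape loop turns the strategy list into a fault-tolerant chain: a scraper that raises or returns nothing no longer aborts the import, and a result that fails cleaning falls through to the next strategy. A rough standalone sketch of that control flow; the strategy functions here are illustrative, not Mealie's API:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("recipe-scraper")


def scrape_with_fallbacks(html, strategies):
    """Try each strategy in order; log and fall through on any failure."""
    for strategy in strategies:
        try:
            result = strategy(html)
        except Exception:
            logger.exception("Failed to scrape HTML with %s", strategy.__name__)
            result = None

        if result is None:
            continue  # nothing usable from this strategy; try the next one

        return result  # first strategy to produce a result wins

    return None  # every strategy failed


def broken_strategy(html):
    raise ValueError("malformed markup")


def naive_strategy(html):
    return {"name": html.strip()} if html.strip() else None


print(scrape_with_fallbacks("Tomato Soup", [broken_strategy, naive_strategy]))
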
@@ -253,6 +253,18 @@ class RecipeScraperOpenAI(RecipeScraperPackage):
     rather than trying to scrape it directly.
     """
 
+    def extract_json_ld_data_from_html(self, soup: bs4.BeautifulSoup) -> str:
+        data_parts: list[str] = []
+        for script in soup.find_all("script", type="application/ld+json"):
+            try:
+                script_data = script.string
+                if script_data:
+                    data_parts.append(str(script_data))
+            except AttributeError:
+                pass
+
+        return "\n\n".join(data_parts)
+
     def find_image(self, soup: bs4.BeautifulSoup) -> str | None:
         # find the open graph image tag
         og_image = soup.find("meta", property="og:image")
@@ -285,8 +297,10 @@ class RecipeScraperOpenAI(RecipeScraperPackage):
         soup = bs4.BeautifulSoup(html, "lxml")
 
         text = soup.get_text(separator="\n", strip=True)
+        text += self.extract_json_ld_data_from_html(soup)
         if not text:
-            raise Exception("No text found in HTML")
+            raise Exception("No text or ld+json data found in HTML")
 
         try:
             image = self.find_image(soup)
         except Exception:

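extract_json_ld_data_from_html concatenates the contents of every application/ld+json script tag, so structured recipe data still reaches the OpenAI prompt even when the visible page text is empty or sparse. The same idea in standalone form, using bs4's real find_all API against a made-up HTML sample (html.parser swapped in for lxml to avoid the extra dependency):

import bs4

SAMPLE_HTML = """
<html><body>
  <script type="application/ld+json">{"@type": "Recipe", "name": "Focaccia"}</script>
  <p>Visible page text.</p>
</body></html>
"""


def extract_json_ld_data_from_html(soup: bs4.BeautifulSoup) -> str:
    data_parts: list[str] = []
    for script in soup.find_all("script", type="application/ld+json"):
        # .string is None when the tag has no single text child
        if script.string:
            data_parts.append(str(script.string))
    return "\n\n".join(data_parts)


soup = bs4.BeautifulSoup(SAMPLE_HTML, "html.parser")
text = soup.get_text(separator="\n", strip=True)
text += extract_json_ld_data_from_html(soup)
print(text)
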
@@ -40,7 +40,7 @@ test_cleaner_data = [
 def test_cleaner_clean(json_file: Path, num_steps):
     translator = local_provider()
     recipe_data = cleaner.clean(json.loads(json_file.read_text()), translator)
-    assert len(recipe_data["recipeInstructions"]) == num_steps
+    assert len(recipe_data.recipe_instructions or []) == num_steps
 
 
 def test_html_with_recipe_data():

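Because clean() now returns a Recipe model, the test asserts against the recipe_instructions attribute rather than a dict key, with `or []` guarding the case where the optional field validated to None. A toy check of the same guard against a stand-in pydantic model:

from pydantic import BaseModel


class Recipe(BaseModel):  # stand-in with only the field under test
    recipe_instructions: list[str] | None = None


def test_instruction_count_handles_none():
    assert len(Recipe().recipe_instructions or []) == 0
    assert len(Recipe(recipe_instructions=["mix", "bake"]).recipe_instructions or []) == 2


test_instruction_count_handles_none()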