feat: Create Recipe From HTML or JSON (#4274)

Author: Michael Genson
Co-authored-by: Kuchenpirat <24235032+Kuchenpirat@users.noreply.github.com>
Date: 2024-09-30 10:52:13 -05:00 (committed by GitHub)
commit 4c1d855690, parent edf420491f
23 changed files with 408 additions and 115 deletions

mealie/routes/recipe/recipe_crud_routes.py

@@ -40,7 +40,7 @@ from mealie.routes._base.mixins import HttpRepo
 from mealie.routes._base.routers import MealieCrudRoute, UserAPIRouter
 from mealie.schema.cookbook.cookbook import ReadCookBook
 from mealie.schema.make_dependable import make_dependable
-from mealie.schema.recipe import Recipe, RecipeImageTypes, ScrapeRecipe
+from mealie.schema.recipe import Recipe, RecipeImageTypes, ScrapeRecipe, ScrapeRecipeData
 from mealie.schema.recipe.recipe import (
     CreateRecipe,
     CreateRecipeByUrlBulk,
@@ -73,7 +73,7 @@ from mealie.services.recipe.recipe_service import RecipeService
 from mealie.services.recipe.template_service import TemplateService
 from mealie.services.scraper.recipe_bulk_scraper import RecipeBulkScraperService
 from mealie.services.scraper.scraped_extras import ScraperContext
-from mealie.services.scraper.scraper import create_from_url
+from mealie.services.scraper.scraper import create_from_html
 from mealie.services.scraper.scraper_strategies import (
     ForceTimeoutException,
     RecipeScraperOpenAI,
@@ -201,11 +201,31 @@ class RecipeController(BaseRecipeController):
     # =======================================================================
     # URL Scraping Operations
 
-    @router.post("/create-url", status_code=201, response_model=str)
+    @router.post("/create/html-or-json", status_code=201)
+    async def create_recipe_from_html_or_json(self, req: ScrapeRecipeData):
+        """Takes in raw HTML or a https://schema.org/Recipe object as a JSON string and parses it like a URL"""
+
+        if req.data.startswith("{"):
+            req.data = RecipeScraperPackage.ld_json_to_html(req.data)
+
+        return await self._create_recipe_from_web(req)
+
+    @router.post("/create/url", status_code=201, response_model=str)
     async def parse_recipe_url(self, req: ScrapeRecipe):
         """Takes in a URL and attempts to scrape data and load it into the database"""
+        return await self._create_recipe_from_web(req)
+
+    async def _create_recipe_from_web(self, req: ScrapeRecipe | ScrapeRecipeData):
+        if isinstance(req, ScrapeRecipeData):
+            html = req.data
+            url = ""
+        else:
+            html = None
+            url = req.url
+
         try:
-            recipe, extras = await create_from_url(req.url, self.translator)
+            recipe, extras = await create_from_html(url, self.translator, html)
         except ForceTimeoutException as e:
             raise HTTPException(
                 status_code=408, detail=ErrorResponse.respond(message="Recipe Scraping Timed Out")
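Usage sketch (not part of the diff): the refactor above splits recipe creation into two endpoints that share `_create_recipe_from_web`. The instance URL and token below are placeholders, and the `/api/recipes` prefix and camelCase field aliases are assumptions about Mealie's routing and schema conventions.

import requests

BASE = "https://mealie.example.com/api/recipes"  # placeholder instance, assumed route prefix
HEADERS = {"Authorization": "Bearer <API_TOKEN>"}  # placeholder token

# URL mode: the server fetches and scrapes the page, returning the new slug.
slug = requests.post(
    f"{BASE}/create/url",
    json={"url": "https://example.com/best-pancakes", "includeTags": True},
    headers=HEADERS,
).json()

# HTML-or-JSON mode: the client supplies raw page HTML or a schema.org Recipe
# JSON string; the server parses it without making any network request.
slug = requests.post(
    f"{BASE}/create/html-or-json",
    json={"data": "<html>...page source...</html>", "includeTags": False},
    headers=HEADERS,
).json()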
@@ -233,7 +253,7 @@ class RecipeController(BaseRecipeController):
         return new_recipe.slug
 
-    @router.post("/create-url/bulk", status_code=202)
+    @router.post("/create/url/bulk", status_code=202)
     def parse_recipe_url_bulk(self, bulk: CreateRecipeByUrlBulk, bg_tasks: BackgroundTasks):
         """Takes in a URL and attempts to scrape data and load it into the database"""
         bulk_scraper = RecipeBulkScraperService(self.service, self.repos, self.group, self.translator)
@@ -266,7 +286,7 @@ class RecipeController(BaseRecipeController):
     # ==================================================================================================================
     # Other Create Operations
 
-    @router.post("/create-from-zip", status_code=201)
+    @router.post("/create/zip", status_code=201)
     def create_recipe_from_zip(self, archive: UploadFile = File(...)):
         """Create recipe from archive"""
         with get_temporary_zip_path() as temp_path:
@@ -280,7 +300,7 @@ class RecipeController(BaseRecipeController):
         return recipe.slug
 
-    @router.post("/create-from-image", status_code=201)
+    @router.post("/create/image", status_code=201)
     async def create_recipe_from_image(
         self,
         images: list[UploadFile] = File(...),

mealie/schema/recipe/__init__.py

@@ -71,7 +71,7 @@ from .recipe_ingredient import (
 )
 from .recipe_notes import RecipeNote
 from .recipe_nutrition import Nutrition
-from .recipe_scraper import ScrapeRecipe, ScrapeRecipeTest
+from .recipe_scraper import ScrapeRecipe, ScrapeRecipeBase, ScrapeRecipeData, ScrapeRecipeTest
 from .recipe_settings import RecipeSettings
 from .recipe_share_token import RecipeShareToken, RecipeShareTokenCreate, RecipeShareTokenSave, RecipeShareTokenSummary
 from .recipe_step import IngredientReferences, RecipeStep
@@ -157,6 +157,8 @@ __all__ = [
     "RecipeTool",
     "RecipeToolPagination",
     "ScrapeRecipe",
+    "ScrapeRecipeBase",
+    "ScrapeRecipeData",
     "ScrapeRecipeTest",
     "AssignCategories",
     "AssignSettings",

mealie/schema/recipe/recipe_scraper.py

@@ -8,9 +8,12 @@ class ScrapeRecipeTest(MealieModel):
     use_openai: bool = Field(False, alias="useOpenAI")
 
-class ScrapeRecipe(MealieModel):
-    url: str
+class ScrapeRecipeBase(MealieModel):
+    include_tags: bool = False
+
+
+class ScrapeRecipe(ScrapeRecipeBase):
+    url: str
 
     model_config = ConfigDict(
         json_schema_extra={
             "example": {
@@ -19,3 +22,8 @@ class ScrapeRecipe(MealieModel):
             },
         }
     )
+
+
+class ScrapeRecipeData(ScrapeRecipeBase):
+    data: str
+    """HTML data or JSON string of a https://schema.org/Recipe object"""

mealie/services/scraper/recipe_bulk_scraper.py

@@ -15,7 +15,7 @@ from mealie.schema.reports.reports import (
 from mealie.schema.user.user import GroupInDB
 from mealie.services._base_service import BaseService
 from mealie.services.recipe.recipe_service import RecipeService
-from mealie.services.scraper.scraper import create_from_url
+from mealie.services.scraper.scraper import create_from_html
 
 
 class RecipeBulkScraperService(BaseService):
@@ -85,7 +85,7 @@ class RecipeBulkScraperService(BaseService):
         async def _do(url: str) -> Recipe | None:
             async with sem:
                 try:
-                    recipe, _ = await create_from_url(url, self.translator)
+                    recipe, _ = await create_from_html(url, self.translator)
                     return recipe
                 except Exception as e:
                     self.service.logger.error(f"failed to scrape url during bulk url import {url}")

mealie/services/scraper/recipe_scraper.py

@@ -32,12 +32,13 @@ class RecipeScraper:
         self.scrapers = scrapers
         self.translator = translator
 
-    async def scrape(self, url: str) -> tuple[Recipe, ScrapedExtras] | tuple[None, None]:
+    async def scrape(self, url: str, html: str | None = None) -> tuple[Recipe, ScrapedExtras] | tuple[None, None]:
         """
         Scrapes a recipe from the web.
+        Skips the network request if `html` is provided.
         """
 
-        raw_html = await safe_scrape_html(url)
+        raw_html = html or await safe_scrape_html(url)
         for scraper_type in self.scrapers:
             scraper = scraper_type(url, self.translator, raw_html=raw_html)
             result = await scraper.parse()

mealie/services/scraper/scraper.py

@@ -21,24 +21,28 @@ class ParserErrors(str, Enum):
     CONNECTION_ERROR = "CONNECTION_ERROR"
 
 
-async def create_from_url(url: str, translator: Translator) -> tuple[Recipe, ScrapedExtras | None]:
+async def create_from_html(
+    url: str, translator: Translator, html: str | None = None
+) -> tuple[Recipe, ScrapedExtras | None]:
     """Main entry point for generating a recipe from a URL. Pass in a URL and
-    a Recipe object will be returned if successful.
+    a Recipe object will be returned if successful. Optionally pass in the HTML to skip fetching it.
 
     Args:
         url (str): a valid string representing a URL
+        html (str | None): optional HTML string to skip network request. Defaults to None.
 
     Returns:
         Recipe: Recipe Object
     """
     scraper = RecipeScraper(translator)
 
-    extracted_url = regex_search(r"(https?://|www\.)[^\s]+", url)
+    if not html:
+        extracted_url = regex_search(r"(https?://|www\.)[^\s]+", url)
+        if not extracted_url:
+            raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.BAD_RECIPE_DATA.value})
+        url = extracted_url.group(0)
 
-    if not extracted_url:
-        raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.BAD_RECIPE_DATA.value})
-
-    new_recipe, extras = await scraper.scrape(extracted_url.group(0))
+    new_recipe, extras = await scraper.scrape(url, html)
     if not new_recipe:
         raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.BAD_RECIPE_DATA.value})
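The rename leaves two call modes (sketch, assuming an existing `Translator` instance named `translator`):

async def example(translator):
    # URL mode: the URL is validated with regex_search, then fetched downstream.
    recipe, extras = await create_from_html("https://example.com/recipe", translator)

    # HTML mode: validation and fetching are both skipped; the url may be empty.
    recipe, extras = await create_from_html("", translator, html="<html>...</html>")
    return recipe, extras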

mealie/services/scraper/scraper_strategies.py

@@ -119,6 +119,14 @@ class ABCScraperStrategy(ABC):
 
 
 class RecipeScraperPackage(ABCScraperStrategy):
+    @staticmethod
+    def ld_json_to_html(ld_json: str) -> str:
+        return (
+            "<!DOCTYPE html><html><head>"
+            f'<script type="application/ld+json">{ld_json}</script>'
+            "</head><body></body></html>"
+        )
+
     async def get_html(self, url: str) -> str:
         return self.raw_html or await safe_scrape_html(url)
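Roughly what the helper produces (illustrative values):

ld_json = '{"@context": "https://schema.org", "@type": "Recipe", "name": "Pancakes"}'
doc = RecipeScraperPackage.ld_json_to_html(ld_json)
# doc == '<!DOCTYPE html><html><head><script type="application/ld+json">'
#        '{"@context": ...}</script></head><body></body></html>'

This lets a bare schema.org Recipe object travel the same `scrape_html` path as a real page.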
@@ -192,7 +200,7 @@ class RecipeScraperPackage(ABCScraperStrategy):
             total_time=try_get_default(None, "totalTime", None, cleaner.clean_time, translator=self.translator),
             prep_time=try_get_default(None, "prepTime", None, cleaner.clean_time, translator=self.translator),
             perform_time=cook_time,
-            org_url=url,
+            org_url=url or try_get_default(None, "url", None, cleaner.clean_string),
         )
 
         return recipe, extras
@@ -201,7 +209,8 @@ class RecipeScraperPackage(ABCScraperStrategy):
recipe_html = await self.get_html(self.url)
try:
scraped_schema = scrape_html(recipe_html, org_url=self.url, supported_only=False)
# scrape_html requires a URL, but we might not have one, so we default to a dummy URL
scraped_schema = scrape_html(recipe_html, org_url=self.url or "https://example.com", supported_only=False)
except (NoSchemaFoundInWildMode, AttributeError):
self.logger.error(f"Recipe Scraper was unable to extract a recipe from {self.url}")
return None
@@ -300,11 +309,10 @@ class RecipeScraperOpenAI(RecipeScraperPackage):
             prompt = service.get_prompt("recipes.scrape-recipe")
 
             response_json = await service.get_response(prompt, text, force_json_response=True)
-            return (
-                "<!DOCTYPE html><html><head>"
-                f'<script type="application/ld+json">{response_json}</script>'
-                "</head><body></body></html>"
-            )
+            if not response_json:
+                raise Exception("OpenAI did not return any data")
+
+            return self.ld_json_to_html(response_json)
         except Exception:
             self.logger.exception(f"OpenAI was unable to extract a recipe from {url}")
             return ""
@@ -340,7 +348,7 @@ class RecipeScraperOpenGraph(ABCScraperStrategy):
             "recipeIngredient": ["Could not detect ingredients"],
             "recipeInstructions": [{"text": "Could not detect instructions"}],
             "slug": slugify(og_field(properties, "og:title")),
-            "orgURL": self.url,
+            "orgURL": self.url or og_field(properties, "og:url"),
             "categories": [],
             "tags": og_fields(properties, "og:article:tag"),
             "dateAdded": None,