feat: Create Recipe From HTML or JSON (#4274)
Co-authored-by: Kuchenpirat <24235032+Kuchenpirat@users.noreply.github.com>
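The diff below moves the recipe create endpoints under /create/* and adds a new endpoint that accepts raw HTML or a https://schema.org/Recipe object as a JSON string. As a rough client-side illustration (a sketch only, not part of this diff: the /api/recipes prefix, host, bearer-token auth, and camelCase field names are assumptions about a typical Mealie deployment):

import requests

BASE = "http://localhost:9000/api/recipes"      # assumed API prefix and host
HEADERS = {"Authorization": "Bearer <token>"}    # assumed auth scheme

# 1) Send raw page HTML you already have on hand
requests.post(
    f"{BASE}/create/html-or-json",
    headers=HEADERS,
    json={"data": "<html>...page source...</html>", "includeTags": True},
)

# 2) Or send a https://schema.org/Recipe object as a JSON string
requests.post(
    f"{BASE}/create/html-or-json",
    headers=HEADERS,
    json={"data": '{"@type": "Recipe", "name": "Pancakes"}', "includeTags": False},
)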
@@ -40,7 +40,7 @@ from mealie.routes._base.mixins import HttpRepo
 from mealie.routes._base.routers import MealieCrudRoute, UserAPIRouter
 from mealie.schema.cookbook.cookbook import ReadCookBook
 from mealie.schema.make_dependable import make_dependable
-from mealie.schema.recipe import Recipe, RecipeImageTypes, ScrapeRecipe
+from mealie.schema.recipe import Recipe, RecipeImageTypes, ScrapeRecipe, ScrapeRecipeData
 from mealie.schema.recipe.recipe import (
     CreateRecipe,
     CreateRecipeByUrlBulk,
@@ -73,7 +73,7 @@ from mealie.services.recipe.recipe_service import RecipeService
 from mealie.services.recipe.template_service import TemplateService
 from mealie.services.scraper.recipe_bulk_scraper import RecipeBulkScraperService
 from mealie.services.scraper.scraped_extras import ScraperContext
-from mealie.services.scraper.scraper import create_from_url
+from mealie.services.scraper.scraper import create_from_html
 from mealie.services.scraper.scraper_strategies import (
     ForceTimeoutException,
     RecipeScraperOpenAI,
@@ -201,11 +201,31 @@ class RecipeController(BaseRecipeController):
     # =======================================================================
     # URL Scraping Operations

-    @router.post("/create-url", status_code=201, response_model=str)
+    @router.post("/create/html-or-json", status_code=201)
+    async def create_recipe_from_html_or_json(self, req: ScrapeRecipeData):
+        """Takes in raw HTML or a https://schema.org/Recipe object as a JSON string and parses it like a URL"""
+
+        if req.data.startswith("{"):
+            req.data = RecipeScraperPackage.ld_json_to_html(req.data)
+
+        return await self._create_recipe_from_web(req)
+
+    @router.post("/create/url", status_code=201, response_model=str)
     async def parse_recipe_url(self, req: ScrapeRecipe):
         """Takes in a URL and attempts to scrape data and load it into the database"""
+
+        return await self._create_recipe_from_web(req)
+
+    async def _create_recipe_from_web(self, req: ScrapeRecipe | ScrapeRecipeData):
+        if isinstance(req, ScrapeRecipeData):
+            html = req.data
+            url = ""
+        else:
+            html = None
+            url = req.url
+
         try:
-            recipe, extras = await create_from_url(req.url, self.translator)
+            recipe, extras = await create_from_html(url, self.translator, html)
         except ForceTimeoutException as e:
             raise HTTPException(
                 status_code=408, detail=ErrorResponse.respond(message="Recipe Scraping Timed Out")
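To make the flow above concrete, here is a small sketch (not part of the diff; the example values are invented, and the import paths follow the imports shown earlier) of how the two endpoints feed the shared helper:

from mealie.schema.recipe import ScrapeRecipeData
from mealie.services.scraper.scraper_strategies import RecipeScraperPackage

# /create/html-or-json builds a ScrapeRecipeData; a payload starting with "{" is treated
# as a JSON-LD string and wrapped into a minimal HTML document before parsing.
req = ScrapeRecipeData(data='{"@type": "Recipe", "name": "Pancakes"}', include_tags=False)
if req.data.startswith("{"):
    req.data = RecipeScraperPackage.ld_json_to_html(req.data)

# _create_recipe_from_web then picks the inputs for create_from_html:
#   ScrapeRecipeData -> html=req.data, url=""       (no network request is made)
#   ScrapeRecipe     -> html=None,     url=req.url  (the scraper fetches the page)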
@@ -233,7 +253,7 @@ class RecipeController(BaseRecipeController):

         return new_recipe.slug

-    @router.post("/create-url/bulk", status_code=202)
+    @router.post("/create/url/bulk", status_code=202)
     def parse_recipe_url_bulk(self, bulk: CreateRecipeByUrlBulk, bg_tasks: BackgroundTasks):
         """Takes in a URL and attempts to scrape data and load it into the database"""
         bulk_scraper = RecipeBulkScraperService(self.service, self.repos, self.group, self.translator)
@@ -266,7 +286,7 @@ class RecipeController(BaseRecipeController):
     # ==================================================================================================================
     # Other Create Operations

-    @router.post("/create-from-zip", status_code=201)
+    @router.post("/create/zip", status_code=201)
     def create_recipe_from_zip(self, archive: UploadFile = File(...)):
         """Create recipe from archive"""
         with get_temporary_zip_path() as temp_path:
@@ -280,7 +300,7 @@ class RecipeController(BaseRecipeController):

         return recipe.slug

-    @router.post("/create-from-image", status_code=201)
+    @router.post("/create/image", status_code=201)
     async def create_recipe_from_image(
         self,
         images: list[UploadFile] = File(...),
@@ -71,7 +71,7 @@ from .recipe_ingredient import (
 )
 from .recipe_notes import RecipeNote
 from .recipe_nutrition import Nutrition
-from .recipe_scraper import ScrapeRecipe, ScrapeRecipeTest
+from .recipe_scraper import ScrapeRecipe, ScrapeRecipeBase, ScrapeRecipeData, ScrapeRecipeTest
 from .recipe_settings import RecipeSettings
 from .recipe_share_token import RecipeShareToken, RecipeShareTokenCreate, RecipeShareTokenSave, RecipeShareTokenSummary
 from .recipe_step import IngredientReferences, RecipeStep
@@ -157,6 +157,8 @@ __all__ = [
     "RecipeTool",
     "RecipeToolPagination",
     "ScrapeRecipe",
+    "ScrapeRecipeBase",
+    "ScrapeRecipeData",
     "ScrapeRecipeTest",
     "AssignCategories",
     "AssignSettings",
@@ -8,9 +8,12 @@ class ScrapeRecipeTest(MealieModel):
     use_openai: bool = Field(False, alias="useOpenAI")


-class ScrapeRecipe(MealieModel):
-    url: str
+class ScrapeRecipeBase(MealieModel):
     include_tags: bool = False
+
+
+class ScrapeRecipe(ScrapeRecipeBase):
+    url: str

     model_config = ConfigDict(
         json_schema_extra={
             "example": {
@@ -19,3 +22,8 @@ class ScrapeRecipe(MealieModel):
             },
         }
     )
+
+
+class ScrapeRecipeData(ScrapeRecipeBase):
+    data: str
+    """HTML data or JSON string of a https://schema.org/Recipe object"""
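In short: ScrapeRecipeBase carries the shared include_tags flag, ScrapeRecipe adds the url used by /create/url, and ScrapeRecipeData adds the data string used by /create/html-or-json. A sketch of the two request bodies (the values are invented, and the camelCase field names assume MealieModel's usual aliasing):

# Body for POST /create/url
url_body = {"url": "https://example.com/best-brownies", "includeTags": True}

# Body for POST /create/html-or-json (a schema.org Recipe object passed as a JSON string)
data_body = {
    "data": '{"@context": "https://schema.org", "@type": "Recipe", "name": "Pancakes"}',
    "includeTags": False,
}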
@@ -15,7 +15,7 @@ from mealie.schema.reports.reports import (
 from mealie.schema.user.user import GroupInDB
 from mealie.services._base_service import BaseService
 from mealie.services.recipe.recipe_service import RecipeService
-from mealie.services.scraper.scraper import create_from_url
+from mealie.services.scraper.scraper import create_from_html


 class RecipeBulkScraperService(BaseService):
@@ -85,7 +85,7 @@ class RecipeBulkScraperService(BaseService):
         async def _do(url: str) -> Recipe | None:
             async with sem:
                 try:
-                    recipe, _ = await create_from_url(url, self.translator)
+                    recipe, _ = await create_from_html(url, self.translator)
                     return recipe
                 except Exception as e:
                     self.service.logger.error(f"failed to scrape url during bulk url import {url}")
@@ -32,12 +32,13 @@ class RecipeScraper:
         self.scrapers = scrapers
         self.translator = translator

-    async def scrape(self, url: str) -> tuple[Recipe, ScrapedExtras] | tuple[None, None]:
+    async def scrape(self, url: str, html: str | None = None) -> tuple[Recipe, ScrapedExtras] | tuple[None, None]:
         """
         Scrapes a recipe from the web.
+        Skips the network request if `html` is provided.
         """

-        raw_html = await safe_scrape_html(url)
+        raw_html = html or await safe_scrape_html(url)
         for scraper_type in self.scrapers:
             scraper = scraper_type(url, self.translator, raw_html=raw_html)
             result = await scraper.parse()
@@ -21,24 +21,28 @@ class ParserErrors(str, Enum):
     CONNECTION_ERROR = "CONNECTION_ERROR"


-async def create_from_url(url: str, translator: Translator) -> tuple[Recipe, ScrapedExtras | None]:
+async def create_from_html(
+    url: str, translator: Translator, html: str | None = None
+) -> tuple[Recipe, ScrapedExtras | None]:
     """Main entry point for generating a recipe from a URL. Pass in a URL and
-    a Recipe object will be returned if successful.
+    a Recipe object will be returned if successful. Optionally pass in the HTML to skip fetching it.

     Args:
         url (str): a valid string representing a URL
+        html (str | None): optional HTML string to skip network request. Defaults to None.

     Returns:
         Recipe: Recipe Object
     """
     scraper = RecipeScraper(translator)

-    extracted_url = regex_search(r"(https?://|www\.)[^\s]+", url)
+    if not html:
+        extracted_url = regex_search(r"(https?://|www\.)[^\s]+", url)
+        if not extracted_url:
+            raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.BAD_RECIPE_DATA.value})
+        url = extracted_url.group(0)

-    if not extracted_url:
-        raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.BAD_RECIPE_DATA.value})
-
-    new_recipe, extras = await scraper.scrape(extracted_url.group(0))
+    new_recipe, extras = await scraper.scrape(url, html)

     if not new_recipe:
         raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.BAD_RECIPE_DATA.value})
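A minimal usage sketch of the reworked entry point (not part of the diff; the inputs are invented, and the translator is whatever Translator instance the caller already has, e.g. the request-scoped one in the controller). When html is supplied, the URL validation and the network fetch are skipped entirely, so an empty url is acceptable:

from mealie.services.scraper.scraper import create_from_html

async def demo(translator) -> None:
    # URL path: the string must contain a URL, which is extracted and then fetched
    recipe, extras = await create_from_html("https://example.com/best-brownies", translator)

    # HTML path: pass the markup directly and leave the URL empty (no network request is made)
    recipe, extras = await create_from_html("", translator, html="<html>...page source...</html>")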
@@ -119,6 +119,14 @@ class ABCScraperStrategy(ABC):


 class RecipeScraperPackage(ABCScraperStrategy):
+    @staticmethod
+    def ld_json_to_html(ld_json: str) -> str:
+        return (
+            "<!DOCTYPE html><html><head>"
+            f'<script type="application/ld+json">{ld_json}</script>'
+            "</head><body></body></html>"
+        )
+
     async def get_html(self, url: str) -> str:
         return self.raw_html or await safe_scrape_html(url)
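The new ld_json_to_html helper simply wraps a JSON-LD string in an otherwise empty HTML document so the existing HTML-based scrapers can parse it; both the /create/html-or-json endpoint and the OpenAI scraper further down reuse it. For example (the input is invented; the import path follows the imports shown earlier):

from mealie.services.scraper.scraper_strategies import RecipeScraperPackage

ld_json = '{"@context": "https://schema.org", "@type": "Recipe", "name": "Pancakes"}'
html = RecipeScraperPackage.ld_json_to_html(ld_json)
# -> '<!DOCTYPE html><html><head><script type="application/ld+json">{...}</script></head><body></body></html>'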
@@ -192,7 +200,7 @@ class RecipeScraperPackage(ABCScraperStrategy):
             total_time=try_get_default(None, "totalTime", None, cleaner.clean_time, translator=self.translator),
             prep_time=try_get_default(None, "prepTime", None, cleaner.clean_time, translator=self.translator),
             perform_time=cook_time,
-            org_url=url,
+            org_url=url or try_get_default(None, "url", None, cleaner.clean_string),
         )

         return recipe, extras
@@ -201,7 +209,8 @@ class RecipeScraperPackage(ABCScraperStrategy):
         recipe_html = await self.get_html(self.url)

         try:
-            scraped_schema = scrape_html(recipe_html, org_url=self.url, supported_only=False)
+            # scrape_html requires a URL, but we might not have one, so we default to a dummy URL
+            scraped_schema = scrape_html(recipe_html, org_url=self.url or "https://example.com", supported_only=False)
         except (NoSchemaFoundInWildMode, AttributeError):
             self.logger.error(f"Recipe Scraper was unable to extract a recipe from {self.url}")
             return None
@@ -300,11 +309,10 @@ class RecipeScraperOpenAI(RecipeScraperPackage):
             prompt = service.get_prompt("recipes.scrape-recipe")

             response_json = await service.get_response(prompt, text, force_json_response=True)
-            return (
-                "<!DOCTYPE html><html><head>"
-                f'<script type="application/ld+json">{response_json}</script>'
-                "</head><body></body></html>"
-            )
+            if not response_json:
+                raise Exception("OpenAI did not return any data")
+
+            return self.ld_json_to_html(response_json)
         except Exception:
             self.logger.exception(f"OpenAI was unable to extract a recipe from {url}")
             return ""
@@ -340,7 +348,7 @@ class RecipeScraperOpenGraph(ABCScraperStrategy):
             "recipeIngredient": ["Could not detect ingredients"],
             "recipeInstructions": [{"text": "Could not detect instructions"}],
             "slug": slugify(og_field(properties, "og:title")),
-            "orgURL": self.url,
+            "orgURL": self.url or og_field(properties, "og:url"),
             "categories": [],
             "tags": og_fields(properties, "og:article:tag"),
             "dateAdded": None,