feat: Create Recipe From HTML or JSON (#4274)

commit 4c1d855690
parent edf420491f
Author: Michael Genson
Date: 2024-09-30 10:52:13 -05:00
Committed by: GitHub
Co-authored-by: Kuchenpirat <24235032+Kuchenpirat@users.noreply.github.com>

23 changed files with 408 additions and 115 deletions


@@ -15,7 +15,7 @@ from mealie.schema.reports.reports import (
 from mealie.schema.user.user import GroupInDB
 from mealie.services._base_service import BaseService
 from mealie.services.recipe.recipe_service import RecipeService
-from mealie.services.scraper.scraper import create_from_url
+from mealie.services.scraper.scraper import create_from_html


 class RecipeBulkScraperService(BaseService):
@@ -85,7 +85,7 @@ class RecipeBulkScraperService(BaseService):
         async def _do(url: str) -> Recipe | None:
             async with sem:
                 try:
-                    recipe, _ = await create_from_url(url, self.translator)
+                    recipe, _ = await create_from_html(url, self.translator)
                     return recipe
                 except Exception as e:
                     self.service.logger.error(f"failed to scrape url during bulk url import {url}")
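
Note: the rename is a drop-in for URL-only callers such as this bulk importer, because the new `html` parameter defaults to None. A hedged sketch of such a caller (the `translator` wiring is assumed, not shown in this diff):

from mealie.services.scraper.scraper import create_from_html

async def import_one(url: str, translator) -> "Recipe | None":
    # html is omitted, so create_from_html still fetches the page itself
    try:
        recipe, _extras = await create_from_html(url, translator)
        return recipe
    except Exception:
        return None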


@@ -32,12 +32,13 @@ class RecipeScraper:
         self.scrapers = scrapers
         self.translator = translator

-    async def scrape(self, url: str) -> tuple[Recipe, ScrapedExtras] | tuple[None, None]:
+    async def scrape(self, url: str, html: str | None = None) -> tuple[Recipe, ScrapedExtras] | tuple[None, None]:
         """
         Scrapes a recipe from the web.
+        Skips the network request if `html` is provided.
         """

-        raw_html = await safe_scrape_html(url)
+        raw_html = html or await safe_scrape_html(url)
         for scraper_type in self.scrapers:
             scraper = scraper_type(url, self.translator, raw_html=raw_html)
             result = await scraper.parse()
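
The `html or await safe_scrape_html(url)` pattern means any truthy `html` string short-circuits the network fetch, while None (or an empty string) falls back to fetching. A standalone illustration with invented names:

import asyncio

async def fetch(url: str) -> str:
    return "<html>fetched from the network</html>"

async def get_raw_html(url: str, html: str | None = None) -> str:
    # Same pattern as the diff above: provided HTML wins, otherwise fetch.
    return html or await fetch(url)

print(asyncio.run(get_raw_html("https://example.com")))                         # fetched
print(asyncio.run(get_raw_html("https://example.com", "<html>given</html>")))   # given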


@@ -21,24 +21,28 @@ class ParserErrors(str, Enum):
     CONNECTION_ERROR = "CONNECTION_ERROR"


-async def create_from_url(url: str, translator: Translator) -> tuple[Recipe, ScrapedExtras | None]:
+async def create_from_html(
+    url: str, translator: Translator, html: str | None = None
+) -> tuple[Recipe, ScrapedExtras | None]:
     """Main entry point for generating a recipe from a URL. Pass in a URL and
-    a Recipe object will be returned if successful.
+    a Recipe object will be returned if successful. Optionally pass in the HTML to skip fetching it.

     Args:
         url (str): a valid string representing a URL
+        html (str | None): optional HTML string to skip network request. Defaults to None.

     Returns:
         Recipe: Recipe Object
     """
     scraper = RecipeScraper(translator)

-    extracted_url = regex_search(r"(https?://|www\.)[^\s]+", url)
-
-    if not extracted_url:
-        raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.BAD_RECIPE_DATA.value})
-
-    new_recipe, extras = await scraper.scrape(extracted_url.group(0))
+    if not html:
+        extracted_url = regex_search(r"(https?://|www\.)[^\s]+", url)
+        if not extracted_url:
+            raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.BAD_RECIPE_DATA.value})
+        url = extracted_url.group(0)
+
+    new_recipe, extras = await scraper.scrape(url, html)
     if not new_recipe:
         raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.BAD_RECIPE_DATA.value})
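
A hedged usage sketch of the new entry point (the empty-string URL is an assumption about how HTML-only imports call it; the keyword shape comes from the signature above). When `html` is provided, URL validation and fetching are both skipped:

from mealie.services.scraper.scraper import create_from_html

async def import_from_pasted_html(raw_html: str, translator) -> "Recipe":
    # url may be empty here; it is only validated when html is absent
    recipe, _extras = await create_from_html("", translator, html=raw_html)
    return recipe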


@@ -119,6 +119,14 @@ class ABCScraperStrategy(ABC):

 class RecipeScraperPackage(ABCScraperStrategy):
+    @staticmethod
+    def ld_json_to_html(ld_json: str) -> str:
+        return (
+            "<!DOCTYPE html><html><head>"
+            f'<script type="application/ld+json">{ld_json}</script>'
+            "</head><body></body></html>"
+        )
+
     async def get_html(self, url: str) -> str:
         return self.raw_html or await safe_scrape_html(url)
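
To illustrate the new helper (payload invented), any JSON-LD blob becomes a minimal HTML document that the schema.org-based scrapers can parse:

# RecipeScraperPackage as defined above; ld_json_to_html is a staticmethod
ld_json = '{"@context": "https://schema.org", "@type": "Recipe", "name": "Pancakes"}'
html = RecipeScraperPackage.ld_json_to_html(ld_json)
# -> '<!DOCTYPE html><html><head><script type="application/ld+json">{...}</script></head><body></body></html>'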
@@ -192,7 +200,7 @@ class RecipeScraperPackage(ABCScraperStrategy):
             total_time=try_get_default(None, "totalTime", None, cleaner.clean_time, translator=self.translator),
             prep_time=try_get_default(None, "prepTime", None, cleaner.clean_time, translator=self.translator),
             perform_time=cook_time,
-            org_url=url,
+            org_url=url or try_get_default(None, "url", None, cleaner.clean_string),
         )

         return recipe, extras
@@ -201,7 +209,8 @@ class RecipeScraperPackage(ABCScraperStrategy):
         recipe_html = await self.get_html(self.url)
         try:
-            scraped_schema = scrape_html(recipe_html, org_url=self.url, supported_only=False)
+            # scrape_html requires a URL, but we might not have one, so we default to a dummy URL
+            scraped_schema = scrape_html(recipe_html, org_url=self.url or "https://example.com", supported_only=False)
         except (NoSchemaFoundInWildMode, AttributeError):
             self.logger.error(f"Recipe Scraper was unable to extract a recipe from {self.url}")
             return None
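
For reference, `scrape_html` here is the recipe-scrapers library entry point, which needs some `org_url` even when parsing schema data alone. A hedged, self-contained sketch of the fallback (the HTML document is invented; behavior assumes a recipe-scrapers version matching this codebase):

from recipe_scrapers import scrape_html

html = (
    '<html><head><script type="application/ld+json">'
    '{"@context": "https://schema.org", "@type": "Recipe", "name": "Toast"}'
    "</script></head><body></body></html>"
)
scraper = scrape_html(html, org_url="https://example.com", supported_only=False)
print(scraper.title())  # should print "Toast", parsed from the JSON-LD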
@@ -300,11 +309,10 @@ class RecipeScraperOpenAI(RecipeScraperPackage):
             prompt = service.get_prompt("recipes.scrape-recipe")

             response_json = await service.get_response(prompt, text, force_json_response=True)
-            return (
-                "<!DOCTYPE html><html><head>"
-                f'<script type="application/ld+json">{response_json}</script>'
-                "</head><body></body></html>"
-            )
+            if not response_json:
+                raise Exception("OpenAI did not return any data")
+
+            return self.ld_json_to_html(response_json)
         except Exception:
             self.logger.exception(f"OpenAI was unable to extract a recipe from {url}")
             return ""
@@ -340,7 +348,7 @@ class RecipeScraperOpenGraph(ABCScraperStrategy):
             "recipeIngredient": ["Could not detect ingredients"],
             "recipeInstructions": [{"text": "Could not detect instructions"}],
             "slug": slugify(og_field(properties, "og:title")),
-            "orgURL": self.url,
+            "orgURL": self.url or og_field(properties, "og:url"),
             "categories": [],
             "tags": og_fields(properties, "og:article:tag"),
             "dateAdded": None,