mirror of https://github.com/mealie-recipes/mealie.git
feat: Migrate from CRF++ to Ingredient Parser (a Python package) (#5061)
This commit is contained in:
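For context, the parser this PR migrates to is the `ingredient_parser` package. A minimal usage sketch (not part of the diff), assuming the package version targeted here, where `ParsedIngredient` exposes the attributes the new `NLPParser` reads further down:

```python
# Illustration only -- not part of this commit. Attribute names follow the
# version of ingredient_parser used by the new NLPParser below (.sentence,
# .amount, .name, .size, .preparation, .comment).
from ingredient_parser import parse_ingredient

parsed = parse_ingredient("1 tablespoon olive oil, divided")
print(parsed.sentence)                                # the raw input sentence
if parsed.amount:                                     # list of IngredientAmount objects
    print(parsed.amount[0].quantity, parsed.amount[0].unit)
print(parsed.name.text if parsed.name else "")        # detected food name
print(parsed.comment.text if parsed.comment else "")  # trailing comment, if any
```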
@@ -250,7 +250,7 @@ class RepositoryGeneric(Generic[Schema, Model]):
        match_key = match_key or self.primary_key

        result = self._query_one(value, match_key)
        results_as_model = self.schema.model_validate(result)
        result_as_model = self.schema.model_validate(result)

        try:
            self.session.delete(result)
@@ -259,10 +259,10 @@ class RepositoryGeneric(Generic[Schema, Model]):
            self.session.rollback()
            raise e

        return results_as_model
        return result_as_model

    def delete_many(self, values: Iterable) -> Schema:
        query = self._query().filter(self.model.id.in_(values))  # type: ignore
    def delete_many(self, values: Iterable) -> list[Schema]:
        query = self._query().filter(self.model.id.in_(values))
        results = self.session.execute(query).unique().scalars().all()
        results_as_model = [self.schema.model_validate(result) for result in results]

@@ -277,7 +277,7 @@ class RepositoryGeneric(Generic[Schema, Model]):
            self.session.rollback()
            raise e

        return results_as_model  # type: ignore
        return results_as_model

    def delete_all(self) -> None:
        delete(self.model)
@@ -1,5 +1,5 @@
import re as re
from collections.abc import Sequence
from collections.abc import Iterable, Sequence
from random import randint
from typing import Self, cast
from uuid import UUID
@@ -103,6 +103,51 @@ class RepositoryRecipes(HouseholdRepositoryGeneric[Recipe, RecipeModel]):
                if i >= max_retries:
                    raise

    def _delete_recipe(self, recipe: RecipeModel) -> Recipe:
        recipe_as_model = self.schema.model_validate(recipe)

        # first remove UserToRecipe entries so we don't run into stale data errors
        try:
            user_to_recipe_delete_query = sa.delete(UserToRecipe).where(UserToRecipe.recipe_id == recipe.id)
            self.session.execute(user_to_recipe_delete_query)
            self.session.commit()
        except Exception:
            self.session.rollback()
            raise

        # remove the recipe
        try:
            self.session.delete(recipe)
            self.session.commit()
        except Exception:
            self.session.rollback()
            raise

        return recipe_as_model

    def delete(self, value, match_key: str | None = None) -> Recipe:
        match_key = match_key or self.primary_key
        recipe_in_db = self._query_one(value, match_key)
        return self._delete_recipe(recipe_in_db)

    def delete_many(self, values: Iterable) -> list[Recipe]:
        query = self._query().filter(self.model.id.in_(values))
        recipes_in_db = self.session.execute(query).unique().scalars().all()
        results: list[Recipe] = []

        # we create a delete statement for each row
        # we don't delete the whole query in one statement because postgres doesn't cascade correctly
        for recipe_in_db in recipes_in_db:
            results.append(self._delete_recipe(recipe_in_db))

        try:
            self.session.commit()
        except Exception as e:
            self.session.rollback()
            raise e

        return results

    def update_image(self, slug: str, _: str | None = None) -> int:
        entry: RecipeModel = self._query_one(match_value=slug)
        entry.image = randint(0, 255)
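The comment in `delete_many` above explains why recipes are deleted row by row. A generic SQLAlchemy sketch of the trade-off it refers to (illustrative only; `model` stands in for any mapped class with an `id` column):

```python
import sqlalchemy as sa
from sqlalchemy.orm import Session


def delete_per_row(session: Session, model, ids: list) -> None:
    # A single bulk DELETE (sa.delete(model).where(model.id.in_(ids))) skips the
    # ORM-level relationship cascades, so dependent rows the ORM would normally
    # clean up can be left behind. Deleting row by row lets session.delete()
    # apply those cascades for each object, which is what the diff relies on.
    rows = session.execute(sa.select(model).where(model.id.in_(ids))).unique().scalars().all()
    for row in rows:
        session.delete(row)
    session.commit()
```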
@@ -1,21 +0,0 @@
import requests

from mealie.services.parser_services import crfpp

MODEL_URL = "https://github.com/mealie-recipes/nlp-model/releases/download/v1.0.0/model.crfmodel"


def main():
    """
    Install the model into the crfpp directory
    """

    r = requests.get(MODEL_URL, stream=True, allow_redirects=True)
    with open(crfpp.MODEL_PATH, "wb") as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)


if __name__ == "__main__":
    main()
@@ -1 +0,0 @@
from .processor import *
@@ -1,69 +0,0 @@
import re

from mealie.services.parser_services.parser_utils import convert_vulgar_fractions_to_regular_fractions

replace_abbreviations = {
    "cup": " cup ",
    "g": " gram ",
    "kg": " kilogram ",
    "lb": " pound ",
    "ml": " milliliter ",
    "oz": " ounce ",
    "pint": " pint ",
    "qt": " quart ",
    "tbsp": " tablespoon ",
    "tbs": " tablespoon ",  # Order matters! 'tbs' must come after 'tbsp' in case of duplicate matches
    "tsp": " teaspoon ",
}


def replace_common_abbreviations(string: str) -> str:
    for k, v in replace_abbreviations.items():
        regex = rf"(?<=\d)\s?({k}\bs?)"
        string = re.sub(regex, v, string)

    return string


def remove_periods(string: str) -> str:
    """Removes periods not surrounded by digits"""
    return re.sub(r"(?<!\d)\.(?!\d)", "", string)


def wrap_or_clause(string: str):
    """
    Attempts to wrap or clauses in ()

    Examples:
        '1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt, plus more'
        -> '1 teaspoon diamond crystal (or 1/2 teaspoon morton kosher salt), plus more'

    """
    # TODO: Needs more adequate testing to be sure this doesn't have side effects.
    split_by_or = string.split(" or ")

    split_by_comma = split_by_or[1].split(",")

    if len(split_by_comma) > 0:
        return f"{split_by_or[0]} (or {split_by_comma[0]}),{''.join(split_by_comma[1:])}".strip().removesuffix(",")

    return string


def pre_process_string(string: str) -> str:
    """
    Series of preprocessing functions to make best use of the CRF++ model. The ideal string looks something like...

    {qty} {unit} {food}, {additional}
    1 tbs. wine, expensive or other white wine, plus more

    """
    string = string.lower()
    string = convert_vulgar_fractions_to_regular_fractions(string)
    string = remove_periods(string)
    string = replace_common_abbreviations(string)

    if " or " in string:
        string = wrap_or_clause(string)

    return string
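For reference, the removed pre-processor above normalized a raw ingredient line before it reached CRF++. A small illustration (not imported from the deleted module), using the example already given in the `wrap_or_clause` docstring:

```python
# Illustration only: per the docstrings above, pre_process_string lowercased the
# text, converted vulgar fractions (½ -> 1/2), stripped periods not surrounded
# by digits, expanded unit abbreviations, and wrapped "or" clauses in parentheses.
raw = "1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt, plus more"
expected = "1 teaspoon diamond crystal (or 1/2 teaspoon morton kosher salt), plus more"
# pre_process_string(raw) == expected   # per the docstring example (modulo extra spaces)
```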
@@ -1,63 +0,0 @@
import os
import subprocess
import tempfile
from fractions import Fraction
from pathlib import Path
from typing import Annotated

from pydantic import BaseModel, Field, field_validator
from pydantic_core.core_schema import ValidationInfo

from mealie.schema._mealie.types import NoneFloat

from . import utils
from .pre_processor import pre_process_string

CWD = Path(__file__).parent
MODEL_PATH = os.getenv("CRF_MODEL_PATH", default=CWD / "model.crfmodel")


class CRFConfidence(BaseModel):
    average: float = 0.0
    comment: NoneFloat = None
    name: NoneFloat = None
    unit: NoneFloat = None
    qty: Annotated[NoneFloat, Field(validate_default=True)] = None


class CRFIngredient(BaseModel):
    input: str = ""
    name: str = ""
    other: str = ""
    qty: Annotated[str, Field(validate_default=True)] = ""
    comment: str = ""
    unit: str = ""
    confidence: CRFConfidence

    @field_validator("qty", mode="before")
    def validate_qty(cls, qty, info: ValidationInfo):
        if qty is not None and qty != "":
            return qty

        # Check if other contains a fraction
        try:
            if info.data["other"] is not None and info.data["other"].find("/") != -1:
                return str(round(float(Fraction(info.data["other"])), 3))
            else:
                return "0"
        except Exception:
            return ""


def _exec_crf_test(input_text):
    with tempfile.NamedTemporaryFile(mode="w") as input_file:
        input_file.write(utils.export_data(input_text))
        input_file.flush()
        return subprocess.check_output(["crf_test", "--verbose=1", "--model", MODEL_PATH, input_file.name]).decode(
            "utf-8"
        )


def convert_list_to_crf_model(list_of_ingrdeint_text: list[str]):
    crf_output = _exec_crf_test([pre_process_string(x) for x in list_of_ingrdeint_text])
    return [CRFIngredient(**ingredient) for ingredient in utils.import_data(crf_output.split("\n"))]
@@ -1,38 +0,0 @@
import re


def clumpFractions(s):
    """
    Replaces the whitespace between the integer and fractional part of a quantity
    with a dollar sign, so it's interpreted as a single token. The rest of the
    string is left alone.
    clumpFractions("aaa 1 2/3 bbb")
    # => "aaa 1$2/3 bbb"
    """

    return re.sub(r"(\d+)\s+(\d)/(\d)", r"\1$\2/\3", s)


def tokenize(s):
    """
    Tokenize on parenthesis, punctuation, spaces and American units followed by a slash.
    We sometimes give American units and metric units for baking recipes. For example:
        * 2 tablespoons/30 milliliters milk or cream
        * 2 1/2 cups/300 grams all-purpose flour
    The recipe database only allows for one unit, and we want to use the American one.
    But we must split the text on "cups/" etc. in order to pick it up.
    """

    # handle abbreviations like "100g" by treating them as "100 grams"
    s = re.sub(r"(\d+)g", r"\1 grams", s)
    s = re.sub(r"(\d+)oz", r"\1 ounces", s)
    s = re.sub(r"(\d+)ml", r"\1 milliliters", s, flags=re.IGNORECASE)

    # TODO: Replace american_units with list of units from database?
    american_units = ["cup", "tablespoon", "teaspoon", "pound", "ounce", "quart", "pint"]
    # The following removes slashes after American units and replaces them with a space.
    for unit in american_units:
        s = s.replace(unit + "/", unit + " ")
        s = s.replace(unit + "s/", unit + "s ")

    return [token.strip() for token in re.split(r"([,()\s]{1})", clumpFractions(s)) if token and token.strip()]
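A quick check (not part of the diff) of what `clumpFractions` did, using the regex and example from its docstring:

```python
import re

# The mixed-number regex from clumpFractions above: the space inside "1 2/3"
# becomes "$" so the quantity survives tokenization as a single token.
print(re.sub(r"(\d+)\s+(\d)/(\d)", r"\1$\2/\3", "aaa 1 2/3 bbb"))  # -> aaa 1$2/3 bbb
```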
@@ -1,266 +0,0 @@
import re
from statistics import mean

from . import tokenizer


def joinLine(columns):
    return "\t".join(columns)


def unclump(s):
    """
    Replaces $'s with spaces. The reverse of clumpFractions.
    """
    return re.sub(r"\$", " ", s)


def getFeatures(token, index, tokens):
    """
    Returns a list of features for a given token.
    """
    length = len(tokens)

    return [
        f"I{index}",
        f"L{lengthGroup(length)}",
        f"{'Yes' if isCapitalized(token) else 'No'}CAP",
        f"{'Yes' if insideParenthesis(token, tokens) else 'No'}PAREN",
    ]


def singularize(word):
    """
    A poor replacement for the pattern.en singularize function, but ok for now.
    """

    units = {
        "cups": "cup",
        "tablespoons": "tablespoon",
        "teaspoons": "teaspoon",
        "pounds": "pound",
        "ounces": "ounce",
        "cloves": "clove",
        "sprigs": "sprig",
        "pinches": "pinch",
        "bunches": "bunch",
        "slices": "slice",
        "grams": "gram",
        "heads": "head",
        "quarts": "quart",
        "stalks": "stalk",
        "pints": "pint",
        "pieces": "piece",
        "sticks": "stick",
        "dashes": "dash",
        "fillets": "fillet",
        "cans": "can",
        "ears": "ear",
        "packages": "package",
        "strips": "strip",
        "bulbs": "bulb",
        "bottles": "bottle",
    }

    if word in units.keys():
        return units[word]
    else:
        return word


def isCapitalized(token):
    """
    Returns true if a given token starts with a capital letter.
    """
    return re.match(r"^[A-Z]", token) is not None


def lengthGroup(actualLength):
    """
    Buckets the length of the ingredient into 6 buckets.
    """
    for n in [4, 8, 12, 16, 20]:
        if actualLength < n:
            return str(n)

    return "X"


def insideParenthesis(token, tokens):
    """
    Returns true if the word is inside parenthesis in the phrase.
    """
    if token in ["(", ")"]:
        return True
    else:
        line = " ".join(tokens)
        return (
            re.match(r".*\(.*" + re.escape(token) + r".*\).*", line) is not None  # - invalid escape sequence
        )


def displayIngredient(ingredient):
    """
    Format a list of (tag, [tokens]) tuples as an HTML string for display.

    displayIngredient([("qty", ["1"]), ("name", ["cat", "pie"])])
    # => <span class='qty'>1</span> <span class='name'>cat pie</span>
    """

    return "".join(["<span class='{}'>{}</span>".format(tag, " ".join(tokens)) for tag, tokens in ingredient])


# HACK: fix this
def smartJoin(words):
    """
    Joins list of words with spaces, but is smart about not adding spaces
    before commas.
    """

    input = " ".join(words)

    # replace " , " with ", "
    input = input.replace(" , ", ", ")

    # replace " ( " with " ("
    input = input.replace("( ", "(")

    # replace " ) " with ") "
    input = input.replace(" )", ")")

    return input


def import_data(lines):
    """
    This thing takes the output of CRF++ and turns it into an actual
    data structure.
    """
    data = [{}]
    display = [[]]
    prevTag = None

    confidence_all = [{}]

    #
    # iterate lines in the data file, which looks like:
    #
    # # 0.511035
    # 1/2 I1 L12 NoCAP X B-QTY/0.982850
    # teaspoon I2 L12 NoCAP X B-UNIT/0.982200
    # fresh I3 L12 NoCAP X B-COMMENT/0.716364
    # thyme I4 L12 NoCAP X B-NAME/0.816803
    # leaves I5 L12 NoCAP X I-NAME/0.960524
    # , I6 L12 NoCAP X B-COMMENT/0.772231
    # finely I7 L12 NoCAP X I-COMMENT/0.825956
    # chopped I8 L12 NoCAP X I-COMMENT/0.893379
    #
    # # 0.505999
    # Black I1 L8 YesCAP X B-NAME/0.765461
    # pepper I2 L8 NoCAP X I-NAME/0.756614
    # , I3 L8 NoCAP X OTHER/0.798040
    # to I4 L8 NoCAP X B-COMMENT/0.683089
    # taste I5 L8 NoCAP X I-COMMENT/0.848617
    #
    # i.e. the output of crf_test -v 1
    #
    for line in lines:
        # blank line starts a new ingredient
        if line in ("", "\n"):
            data.append({})
            display.append([])
            prevTag = None

            confidence_all.append({})

        # ignore comments
        elif line[0] == "#":
            pass

        # otherwise it's a token
        # e.g.: potato \t I2 \t L5 \t NoCAP \t B-NAME/0.978253
        else:
            columns = re.split("\t", line.strip())
            token = columns[0].strip()

            # unclump fractions
            token = unclump(token)

            # turn B-NAME/123 back into "name"
            tag, confidence = re.split(r"/", columns[-1], maxsplit=1)
            tag = re.sub(r"^[BI]\-", "", tag).lower()  # - invalid escape sequence

            # ====================
            # Confidence Getter
            if prevTag != tag:
                if confidence_all[-1].get(tag):
                    confidence_all[-1][tag].append(confidence)
                else:
                    confidence_all[-1][tag] = [confidence]
            else:
                if confidence_all[-1].get(tag):
                    confidence_all[-1][tag].append(confidence)
                else:
                    confidence_all[-1][tag] = [confidence]
            # ---- DISPLAY ----
            # build a structure which groups each token by its tag, so we can
            # rebuild the original display name later.

            if prevTag != tag:
                display[-1].append((tag, [token]))
                prevTag = tag

            else:
                display[-1][-1][1].append(token)
                # ^- token
                # ^---- tag
                # ^-------- ingredient

            # ---- DATA ----
            # build a dict grouping tokens by their tag

            # initialize this attribute if this is the first token of its kind
            if tag not in data[-1]:
                data[-1][tag] = []

            # HACK: If this token is a unit, singularize it so Scoop accepts it.
            if tag == "unit":
                token = singularize(token)

            data[-1][tag].append(token)

    # reassemble the output into a list of dicts.
    output = [{k: smartJoin(tokens) for k, tokens in ingredient.items()} for ingredient in data if len(ingredient)]

    # Preclean Confidence
    for i, c in enumerate(confidence_all):
        avg_of_all = []
        for k, v in c.items():
            v = [float(x) for x in v]
            avg = round(mean(v), 2)
            avg_of_all.append(avg)
            confidence_all[i][k] = avg

        if avg_of_all:
            confidence_all[i]["average"] = round(mean(avg_of_all), 2)

    # Add the raw ingredient phrase
    for i, _ in enumerate(output):
        output[i]["input"] = smartJoin([" ".join(tokens) for _, tokens in display[i]])
        output[i]["confidence"] = confidence_all[i]

    return output


def export_data(lines):
    """Parse "raw" ingredient lines into CRF-ready output"""
    output = []
    for line in lines:
        line_clean = re.sub("<[^<]+?>", "", line)
        tokens = tokenizer.tokenize(line_clean)

        for i, token in enumerate(tokens):
            features = getFeatures(token, i + 1, tokens)
            output.append(joinLine([token, *features]))
        output.append("")
    return "\n".join(output)
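For reference, the `import_data` helper above turned annotated crf_test output back into plain dicts. For the first example block in its comment, the result looked roughly like this (values rounded as in the code; illustration only, not part of the diff):

```python
# Shape of import_data's return value for
# "1/2 teaspoon fresh thyme leaves, finely chopped" -- one dict per ingredient,
# keyed by tag, plus the reassembled "input" and per-tag confidence averages.
example_output = [
    {
        "qty": "1/2",
        "unit": "teaspoon",
        "name": "thyme leaves",
        "comment": "fresh, finely chopped",
        "input": "1/2 teaspoon fresh thyme leaves, finely chopped",
        "confidence": {"qty": 0.98, "unit": 0.98, "comment": 0.8, "name": 0.89, "average": 0.91},
    }
]
```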
@@ -1,12 +1,12 @@
from fractions import Fraction

from ingredient_parser import parse_ingredient
from ingredient_parser.dataclasses import CompositeIngredientAmount, IngredientAmount
from ingredient_parser.dataclasses import ParsedIngredient as IngredientParserParsedIngredient
from pydantic import UUID4
from sqlalchemy.orm import Session

from mealie.core.root_logger import get_logger
from mealie.schema.recipe import RecipeIngredient
from mealie.schema.recipe.recipe_ingredient import (
    MAX_INGREDIENT_DENOMINATOR,
    CreateIngredientFood,
    CreateIngredientUnit,
    IngredientConfidence,
@@ -14,8 +14,9 @@ from mealie.schema.recipe.recipe_ingredient import (
    RegisteredParser,
)

from . import brute, crfpp, openai
from . import brute, openai
from ._base import ABCIngredientParser
from .parser_utils import extract_quantity_from_string

logger = get_logger(__name__)

@@ -47,50 +48,110 @@ class BruteForceParser(ABCIngredientParser):

class NLPParser(ABCIngredientParser):
    """
    Class for CRFPP ingredient parsers.
    Class for Ingredient Parser library
    """

    def _crf_to_ingredient(self, crf_model: crfpp.CRFIngredient) -> ParsedIngredient:
        ingredient = None
    @staticmethod
    def _extract_amount(ingredient: IngredientParserParsedIngredient) -> IngredientAmount:
        if not (ingredient_amounts := ingredient.amount):
            return IngredientAmount(quantity=0, quantity_max=0, unit="", text="", confidence=0, starting_index=-1)

        try:
            ingredient = RecipeIngredient(
                title="",
                note=crf_model.comment,
                unit=CreateIngredientUnit(name=crf_model.unit),
                food=CreateIngredientFood(name=crf_model.name),
                disable_amount=False,
                quantity=float(
                    sum(Fraction(s).limit_denominator(MAX_INGREDIENT_DENOMINATOR) for s in crf_model.qty.split())
                ),
            )
        except Exception as e:
            logger.error(f"Failed to parse ingredient: {crf_model}: {e}")
            # TODO: Capture some sort of state for the user to see that an exception occurred
            ingredient = RecipeIngredient(
                title="",
                note=crf_model.input,
            )
        ingredient_amount = ingredient_amounts[0]
        if isinstance(ingredient_amount, CompositeIngredientAmount):
            ingredient_amount = ingredient_amount.amounts[0]

        return ingredient_amount

    @staticmethod
    def _extract_quantity(ingredient_amount: IngredientAmount) -> tuple[float, float]:
        confidence = ingredient_amount.confidence

        if isinstance(ingredient_amount.quantity, str):
            return extract_quantity_from_string(ingredient_amount.quantity)[0], confidence
        else:
            try:
                return float(ingredient_amount.quantity), confidence
            except ValueError:
                return 0, 0

    @staticmethod
    def _extract_unit(ingredient_amount: IngredientAmount) -> tuple[str, float]:
        confidence = ingredient_amount.confidence
        unit = str(ingredient_amount.unit) if ingredient_amount.unit else ""
        return unit, confidence

    @staticmethod
    def _extract_food(ingredient: IngredientParserParsedIngredient) -> tuple[str, float]:
        confidence = ingredient.name.confidence if ingredient.name else 0
        food = str(ingredient.name.text) if ingredient.name else ""
        return food, confidence

    @staticmethod
    def _extract_note(ingredient: IngredientParserParsedIngredient) -> tuple[str, float]:
        confidences: list[float] = []
        note_parts: list[str] = []
        if ingredient.size:
            note_parts.append(ingredient.size.text)
            confidences.append(ingredient.size.confidence)
        if ingredient.preparation:
            note_parts.append(ingredient.preparation.text)
            confidences.append(ingredient.preparation.confidence)
        if ingredient.comment:
            note_parts.append(ingredient.comment.text)
            confidences.append(ingredient.comment.confidence)

        # average confidence among all note parts
        confidence = sum(confidences) / len(confidences) if confidences else 0
        note = ", ".join(note_parts)
        note = note.replace("(", "").replace(")", "")

        return note, confidence

    def _convert_ingredient(self, ingredient: IngredientParserParsedIngredient) -> ParsedIngredient:
        ingredient_amount = self._extract_amount(ingredient)
        qty, qty_conf = self._extract_quantity(ingredient_amount)
        unit, unit_conf = self._extract_unit(ingredient_amount)
        food, food_conf = self._extract_food(ingredient)
        note, note_conf = self._extract_note(ingredient)

        # average confidence for components which were parsed
        confidences: list[float] = []
        if qty:
            confidences.append(qty_conf)
        if unit:
            confidences.append(unit_conf)
        if food:
            confidences.append(food_conf)
        if note:
            confidences.append(note_conf)

        parsed_ingredient = ParsedIngredient(
            input=crf_model.input,
            ingredient=ingredient,
            input=ingredient.sentence,
            confidence=IngredientConfidence(
                quantity=crf_model.confidence.qty,
                food=crf_model.confidence.name,
                **crf_model.confidence.model_dump(),
                average=(sum(confidences) / len(confidences)) if confidences else 0,
                quantity=qty_conf,
                unit=unit_conf,
                food=food_conf,
                comment=note_conf,
            ),
            ingredient=RecipeIngredient(
                title="",
                quantity=qty,
                unit=CreateIngredientUnit(name=unit) if unit else None,
                food=CreateIngredientFood(name=food) if food else None,
                disable_amount=False,
                note=note,
            ),
        )

        return self.find_ingredient_match(parsed_ingredient)

    async def parse(self, ingredients: list[str]) -> list[ParsedIngredient]:
        crf_models = crfpp.convert_list_to_crf_model(ingredients)
        return [self._crf_to_ingredient(crf_model) for crf_model in crf_models]

    async def parse_one(self, ingredient_string: str) -> ParsedIngredient:
        items = await self.parse([ingredient_string])
        return items[0]
        parsed_ingredient = parse_ingredient(ingredient_string)
        return self._convert_ingredient(parsed_ingredient)

    async def parse(self, ingredients: list[str]) -> list[ParsedIngredient]:
        return [await self.parse_one(ingredient) for ingredient in ingredients]


__registrar: dict[RegisteredParser, type[ABCIngredientParser]] = {
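Taken together, the new NLPParser above boils down to: call `parse_ingredient()`, take the first amount (unwrapping a `CompositeIngredientAmount`), read the food from `.name`, and fold size/preparation/comment into the note. A condensed, hedged restatement of that flow without the Mealie schema classes (illustration only):

```python
from ingredient_parser import parse_ingredient
from ingredient_parser.dataclasses import CompositeIngredientAmount


def summarize(sentence: str) -> dict:
    # Condensed restatement of NLPParser._convert_ingredient above, returning a
    # plain dict instead of Mealie's RecipeIngredient.
    parsed = parse_ingredient(sentence)

    amount = parsed.amount[0] if parsed.amount else None
    if isinstance(amount, CompositeIngredientAmount):
        amount = amount.amounts[0]

    note_parts = [part.text for part in (parsed.size, parsed.preparation, parsed.comment) if part]
    return {
        "input": parsed.sentence,
        "quantity": amount.quantity if amount else 0,
        "unit": str(amount.unit) if amount and amount.unit else "",
        "food": parsed.name.text if parsed.name else "",
        "note": ", ".join(note_parts).replace("(", "").replace(")", ""),
    }


print(summarize("2 1/2 cups all-purpose flour, sifted"))
```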