feat: ✨ Add brute strategy to ingredient processor (#744)

* fix UI column width
* words
* update parser to support diff strats
* add new model url
* make button more visible
* fix nutrition error
* feat(backend): ✨ add 'brute' strategy for parsing ingredients
* satisfy linter
* update UI for creation page
* feat(backend): ✨ log 422 errors in detail when not in PRODUCTION
* add strategy selector

Co-authored-by: Hayden <hay-kot@pm.me>
2026-02-05 07:23:12 -05:00 · 2021-10-16 16:06:13 -08:00
parent 60908e5a88
commit 3b920babe3
25 changed files with 961 additions and 131 deletions
--- a/mealie/services/parser_services/__init__.py
+++ b/mealie/services/parser_services/__init__.py
@@ -1 +1,2 @@
+from .ingredient_parser import *
 from .ingredient_parser_service import *
--- a/mealie/services/parser_services/_helpers/__init__.py
+++ b/mealie/services/parser_services/_helpers/__init__.py
@@ -0,0 +1 @@
+from .string_utils import *
--- a/mealie/services/parser_services/_helpers/string_utils.py
+++ b/mealie/services/parser_services/_helpers/string_utils.py
@@ -0,0 +1,23 @@
+import re
+
+compiled_match = re.compile(r"(.){1,6}\s\([^()]+\)\s")  # short leading text, then a "(...)" group
+compiled_search = re.compile(r"\([^()]+\)")  # the first non-nested "(...)" group
+
+
+def move_parens_to_end(ing_str: str) -> str:
+    """
+    Moves the first parenthesised group to the end of the string when it follows a leading
+    run of one to six characters and a space; otherwise the string is returned unchanged.
+    """
+    # The old `(.[^()])+` bodies consumed characters in pairs, so groups such as "(500ml)" never matched.
+    if compiled_match.match(ing_str):
+        found = compiled_search.search(ing_str)
+        start, end = found.span()
+        ing_str = ing_str[:start] + ing_str[end:] + " " + ing_str[start:end]
+
+    return ing_str
+
+
+def check_char(char, *eql) -> bool:
+    """Return True when `char` equals at least one of the additional positional arguments."""
+    return char in eql
--- a/mealie/services/parser_services/brute/__init__.py
+++ b/mealie/services/parser_services/brute/__init__.py
@@ -0,0 +1 @@
+from .process import parse
--- a/mealie/services/parser_services/brute/process.py
+++ b/mealie/services/parser_services/brute/process.py
@@ -0,0 +1,204 @@
+import string
+import unicodedata
+from typing import Tuple
+
+from pydantic import BaseModel
+
+from .._helpers import check_char, move_parens_to_end
+
+
+class BruteParsedIngredient(BaseModel):
+    food: str = ""
+    note: str = ""
+    amount: float = 0.0  # was "": pydantic does not validate defaults, so an omitted amount was a str
+    unit: str = ""
+
+    class Config:
+        anystr_strip_whitespace = True  # strip leading/trailing whitespace from every str field
+
+
+def parse_fraction(x):
+    """Convert "1/2"-style text or a single vulgar-fraction character to a float; raise ValueError otherwise."""
+    if len(x) == 1 and "fraction" in unicodedata.decomposition(x):
+        # numeric() covers every such character; slicing code points broke on "⅒" (1.0) and "⅟" (IndexError)
+        return unicodedata.numeric(x)
+    frac_split = x.split("/")
+    if len(frac_split) != 2:
+        raise ValueError(f"not a fraction: {x!r}")
+    try:
+        return int(frac_split[0]) / int(frac_split[1])
+    except ZeroDivisionError as err:
+        raise ValueError(f"zero denominator: {x!r}") from err
+
+
+def parse_amount(ing_str) -> Tuple[float, str, str]:
+    """
+    Split a leading token such as "2", "1/2", "1½" or "500g" into (amount, unit, note).
+    A comma is read as a decimal separator ("1,5" -> 1.5).
+
+    Raises ValueError when no leading number or fraction can be parsed.
+    """
+
+    def keep_looping(ing_str, end) -> bool:
+        """True while the character at `end` is a digit, or one of . , / directly followed by a digit."""
+        if end >= len(ing_str):
+            return False
+        if ing_str[end] in string.digits:
+            return True
+        # return an explicit bool; the original fell off the end and returned None
+        return check_char(ing_str[end], ".", ",", "/") and end + 1 < len(ing_str) and ing_str[end + 1] in string.digits
+
+    amount = 0
+    unit = ""
+    note = ""
+
+    did_check_frac = False
+    end = 0
+
+    while keep_looping(ing_str, end):
+        end += 1
+
+    if end > 0:
+        if "/" in ing_str[:end]:
+            amount = parse_fraction(ing_str[:end])
+        else:
+            amount = float(ing_str[:end].replace(",", "."))
+    else:
+        amount = parse_fraction(ing_str[:1])  # slice, so an empty token raises ValueError rather than IndexError
+        end += 1
+        did_check_frac = True
+    if end < len(ing_str):
+        if did_check_frac:
+            unit = ing_str[end:]
+        else:
+            try:
+                amount += parse_fraction(ing_str[end])
+
+                unit_end = end + 1
+                unit = ing_str[unit_end:]
+            except ValueError:
+                unit = ing_str[end:]
+
+    # No known unit starts with "(" or "-", so this is likely an alternative such as 1L (500ml) or a range like 2-3
+    if unit.startswith("(") or unit.startswith("-"):
+        unit = ""
+        note = ing_str
+
+    return amount, unit, note
+
+
+def parse_ingredient_with_comma(tokens) -> Tuple[str, str]:
+    """
+    Split `tokens` at the first token that ends with a comma.
+
+    Everything up to that token (minus the comma itself) is the ingredient and the
+    rest is the note; without such a token, all tokens form the ingredient.
+    """
+    comma_idx = next((i for i, token in enumerate(tokens) if token.endswith(",")), None)
+    if comma_idx is None:
+        return " ".join(tokens), ""
+
+    split_at = comma_idx + 1
+    head = " ".join(tokens[:split_at])
+    tail = " ".join(tokens[split_at:])
+    # drop the trailing comma from the ingredient part
+    return head[:-1], tail
+
+
+def parse_ingredient(tokens) -> Tuple[str, str]:
+    """
+    Split `tokens` into (ingredient, note), treating a trailing "( ... )" group as the note.
+
+    Raises ValueError when the very first token opens the bracket, i.e. everything is the note.
+    """
+    if not (tokens and tokens[-1].endswith(")")):
+        return parse_ingredient_with_comma(tokens)
+    # the matching opening bracket sits inside the last token itself, e.g. "flour(sifted)"
+    if (not tokens[-1].startswith("(")) and ("(" in tokens[-1]):
+        return parse_ingredient_with_comma(tokens)
+
+    # walk backwards to the token that opens the bracket; -1 means there is none
+    # (the original loop stopped at 0, which made the "no bracket" case unreachable)
+    start = len(tokens) - 1
+    while start >= 0 and not tokens[start].startswith("("):
+        start -= 1
+    if start == 0:
+        # the whole list is wrapped in brackets -> assume it is an error (e.g. the unit was guessed wrongly)
+        raise ValueError("ingredient consists only of a bracketed note")
+    if start < 0:
+        # no opening bracket anywhere -> just ignore the last bracket
+        return parse_ingredient_with_comma(tokens)
+    return " ".join(tokens[:start]), " ".join(tokens[start:])[1:-1]
+
+
+def parse(ing_str) -> BruteParsedIngredient:
+    """
+    Heuristically split a free-text ingredient line into amount, unit, food and note.
+    Blank input yields an empty BruteParsedIngredient instead of raising IndexError.
+    """
+    amount = 0
+    unit = ingredient = note = unit_note = ""
+    ing_str = move_parens_to_end(ing_str)
+
+    tokens = ing_str.split()
+
+    # Early return if the ingredient is at most a single token and therefore has no other properties
+    if len(tokens) <= 1:
+        ingredient = tokens[0] if tokens else ""
+        # TODO Refactor to expect BFP to be returned instead of Tuple
+        return BruteParsedIngredient(food=ingredient, note=note, amount=amount, unit=unit)
+
+    try:
+        # try to parse first argument as amount
+        amount, unit, unit_note = parse_amount(tokens[0])
+        # only try to parse second argument as amount if there are at least
+        # three arguments if it already has a unit there can't be
+        # a fraction for the amount
+        if len(tokens) > 2:
+            try:
+                if unit != "":
+                    # a unit is already found, no need to try the second argument for a fraction
+                    # probably not the best method to do it, but I didn't want to make an if check and paste the exact same thing in the else as already is in the except  # noqa: E501
+                    raise ValueError
+                # try to parse second argument as amount and add that, in case of '2 1/2' or '2 ½'
+                amount += parse_fraction(tokens[1])
+                # assume that units can't end with a comma
+                if len(tokens) > 3 and not tokens[2].endswith(","):
+                    # try to use third argument as unit and everything else as ingredient, use everything as ingredient if it fails  # noqa: E501
+                    try:
+                        ingredient, note = parse_ingredient(tokens[3:])
+                        unit = tokens[2]
+                    except ValueError:
+                        ingredient, note = parse_ingredient(tokens[2:])
+                else:
+                    ingredient, note = parse_ingredient(tokens[2:])
+            except ValueError:
+                # assume that units can't end with a comma
+                if not tokens[1].endswith(","):
+                    # try to use second argument as unit and everything else as ingredient, use everything as ingredient if it fails  # noqa: E501
+                    try:
+                        ingredient, note = parse_ingredient(tokens[2:])
+                        if unit == "":
+                            unit = tokens[1]
+                        else:
+                            note = tokens[1]
+                    except ValueError:
+                        ingredient, note = parse_ingredient(tokens[1:])
+                else:
+                    ingredient, note = parse_ingredient(tokens[1:])
+        else:
+            # only two arguments, first one is the amount
+            # which means this is the ingredient
+            ingredient = tokens[1]
+    except ValueError:
+        try:
+            # can't parse first argument as amount
+            # -> no unit -> parse everything as ingredient
+            ingredient, note = parse_ingredient(tokens)
+        except ValueError:
+            ingredient = " ".join(tokens[1:])
+
+    if unit_note not in note:
+        note += " " + unit_note
+
+    return BruteParsedIngredient(food=ingredient, note=note, amount=amount, unit=unit)
--- a/mealie/services/parser_services/crfpp/__init__.py
+++ b/mealie/services/parser_services/crfpp/__init__.py
@@ -0,0 +1 @@
+from .processor import *
--- a/mealie/services/parser_services/crfpp/pre_processor.py
+++ b/mealie/services/parser_services/crfpp/pre_processor.py
@@ -2,23 +2,25 @@ import re
 import unicodedata

 replace_abbreviations = {
-    "cup ": "cup ",
-    " g ": "gram ",
-    "kg ": "kilogram ",
-    "lb ": "pound ",
-    "ml ": "milliliter ",
-    "oz ": "ounce ",
-    "pint ": "pint ",
-    "qt ": "quart ",
-    "tbs ": "tablespoon ",
-    "tbsp ": "tablespoon ",
-    "tsp ": "teaspoon ",
+    "cup": " cup ",
+    "g": " gram ",
+    "kg": " kilogram ",
+    "lb": " pound ",
+    "ml": " milliliter ",
+    "oz": " ounce ",
+    "pint": " pint ",
+    "qt": " quart ",
+    "tbsp": " tablespoon ",
+    "tbs": " tablespoon ",  # Order matters! 'tbs' must come after 'tbsp' in case of duplicate matches
+    "tsp": " teaspoon ",
 }


 def replace_common_abbreviations(string: str) -> str:
+    """Expand unit abbreviations that follow a digit; the trailing \\b keeps "g" from matching inside "grams"."""
    for k, v in replace_abbreviations.items():
-        string = string.replace(k, v)
+        regex = rf"(?<=\d)\s?({k}s?)\b"
+        string = re.sub(regex, v, string)

    return string

@@ -81,17 +83,3 @@ def pre_process_string(string: str) -> str:
        string = wrap_or_clause(string)

    return string
-
-
-def main():
-    # TODO: Migrate to unittests
-    print("Starting...")
-    print(pre_process_string("1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt, plus more"))
-    print(pre_process_string("1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt"))
-    print(pre_process_string("¼ cup michiu tou or other rice wine"))
-    print(pre_process_string("1 tbs. wine, expensive or other white wine, plus more"))
-    print("Finished...")
-
-
-if __name__ == "__main__":
-    main()
--- a/mealie/services/parser_services/crfpp/processor.py
+++ b/mealie/services/parser_services/crfpp/processor.py
@@ -12,6 +12,14 @@ CWD = Path(__file__).parent
 MODEL_PATH = CWD / "model.crfmodel"


+class CRFConfidence(BaseModel):
+    average: float = 0.0  # defaults to 0.0 when not supplied
+    comment: float = None  # NOTE(review): annotated float but defaults to None, as do the fields below
+    name: float = None  # presumably relies on pydantic treating a None default as Optional[float] - confirm
+    unit: float = None
+    qty: float = None
+
+
 class CRFIngredient(BaseModel):
    input: str = ""
    name: str = ""
@@ -19,15 +27,19 @@ class CRFIngredient(BaseModel):
    qty: str = ""
    comment: str = ""
    unit: str = ""
+    confidence: CRFConfidence

    @validator("qty", always=True, pre=True)
    def validate_qty(qty, values):  # sourcery skip: merge-nested-ifs
        if qty is None or qty == "":
            # Check if other contains a fraction
-            if values["other"] is not None and values["other"].find("/") != -1:
-                return float(Fraction(values["other"])).__round__(1)
-            else:
-                return 1
+            try:
+                if values["other"] is not None and values["other"].find("/") != -1:
+                    return float(Fraction(values["other"])).__round__(1)
+                else:
+                    return 1
+            except Exception:
+                pass

        return qty

--- a/mealie/services/parser_services/crfpp/utils.py
+++ b/mealie/services/parser_services/crfpp/utils.py
@@ -1,4 +1,5 @@
 import re
+from statistics import mean

 from . import tokenizer

@@ -179,6 +180,9 @@ def import_data(lines):
    data = [{}]
    display = [[]]
    prevTag = None
+
+    confidence_all = [{}]
+
    #
    # iterate lines in the data file, which looks like:
    #
@@ -208,6 +212,8 @@ def import_data(lines):
            display.append([])
            prevTag = None

+            confidence_all.append({})
+
        # ignore comments
        elif line[0] == "#":
            pass
@@ -226,6 +232,18 @@ def import_data(lines):
            tag, confidence = re.split(r"/", columns[-1], 1)
            tag = re.sub("^[BI]\-", "", tag).lower()  # noqa: W605 - invalid dscape sequence

+            # ====================
+            # Confidence Getter
+            if prevTag != tag:
+                if confidence_all[-1].get(tag):
+                    confidence_all[-1][tag].append(confidence)
+                else:
+                    confidence_all[-1][tag] = [confidence]
+            else:
+                if confidence_all[-1].get(tag):
+                    confidence_all[-1][tag].append(confidence)
+                else:
+                    confidence_all[-1][tag] = [confidence]
            # ---- DISPLAY ----
            # build a structure which groups each token by its tag, so we can
            # rebuild the original display name later.
@@ -257,13 +275,23 @@ def import_data(lines):
    output = [
        dict([(k, smartJoin(tokens)) for k, tokens in ingredient.items()]) for ingredient in data if len(ingredient)
    ]
-    # Add the marked-up display data
-    for i, v in enumerate(output):
-        output[i]["display"] = displayIngredient(display[i])
+
+    # Preclean Confidence
+    for i, c in enumerate(confidence_all):
+        avg_of_all = []
+        for k, v in c.items():
+            v = [float(x) for x in v]
+            avg = round(mean(v), 2)
+            avg_of_all.append(avg)
+            confidence_all[i][k] = avg
+
+        if avg_of_all:
+            confidence_all[i]["average"] = round(mean(avg_of_all), 2)

    # Add the raw ingredient phrase
-    for i, v in enumerate(output):
-        output[i]["input"] = smartJoin([" ".join(tokens) for k, tokens in display[i]])
+    for i, _ in enumerate(output):
+        output[i]["input"] = smartJoin([" ".join(tokens) for _, tokens in display[i]])
+        output[i]["confidence"] = confidence_all[i]

    return output

--- a/mealie/services/parser_services/ingredient_parser.py
+++ b/mealie/services/parser_services/ingredient_parser.py
@@ -3,9 +3,15 @@ from fractions import Fraction

 from mealie.core.root_logger import get_logger
 from mealie.schema.recipe import RecipeIngredient
-from mealie.schema.recipe.recipe_ingredient import CreateIngredientFood, CreateIngredientUnit
+from mealie.schema.recipe.recipe_ingredient import (
+    CreateIngredientFood,
+    CreateIngredientUnit,
+    IngredientConfidence,
+    ParsedIngredient,
+    RegisteredParser,
+)

-from .crfpp.processor import CRFIngredient, convert_list_to_crf_model
+from . import brute, crfpp

 logger = get_logger(__name__)

@@ -15,12 +21,41 @@ class ABCIngredientParser(ABC):
    Abstract class for ingredient parsers.
    """

+    def parse_one(self, ingredient_string: str) -> ParsedIngredient:
+        pass
+
    @abstractmethod
-    def parse(self, ingredients: list[str]) -> list[RecipeIngredient]:
+    def parse(self, ingredients: list[str]) -> list[ParsedIngredient]:
        ...


-class CRFPPIngredientParser(ABCIngredientParser):
+class BruteForceParser(ABCIngredientParser):
+    """
+    Ingredient parser that delegates to the rule-based ``brute.parse`` function.
+    """
+
+    def __init__(self) -> None:
+        pass
+
+    def parse_one(self, ingredient: str) -> ParsedIngredient:
+        """Parse one ingredient string and wrap the result in a ParsedIngredient."""
+        parsed = brute.parse(ingredient)
+        recipe_ingredient = RecipeIngredient(
+            unit=CreateIngredientUnit(name=parsed.unit),
+            food=CreateIngredientFood(name=parsed.food),
+            disable_amount=False,
+            quantity=parsed.amount,
+            note=parsed.note,
+        )
+        return ParsedIngredient(input=ingredient, ingredient=recipe_ingredient)
+
+    def parse(self, ingredients: list[str]) -> list[ParsedIngredient]:
+        """Parse every string in `ingredients`, preserving order."""
+        parse_one = self.parse_one
+        return [parse_one(item) for item in ingredients]
+
+
+class NLPParser(ABCIngredientParser):
    """
    Class for CRFPP ingredient parsers.
    """
@@ -28,7 +63,7 @@ class CRFPPIngredientParser(ABCIngredientParser):
    def __init__(self) -> None:
        pass

-    def _crf_to_ingredient(self, crf_model: CRFIngredient) -> RecipeIngredient:
+    def _crf_to_ingredient(self, crf_model: crfpp.CRFIngredient) -> ParsedIngredient:
        ingredient = None

        try:
@@ -41,15 +76,37 @@ class CRFPPIngredientParser(ABCIngredientParser):
                quantity=float(sum(Fraction(s) for s in crf_model.qty.split())),
            )
        except Exception as e:
+            logger.error(f"Failed to parse ingredient: {crf_model}: {e}")
            # TODO: Capture some sort of state for the user to see that an exception occured
-            logger.exception(e)
            ingredient = RecipeIngredient(
                title="",
                note=crf_model.input,
            )

-        return ingredient
+        return ParsedIngredient(
+            input=crf_model.input,
+            ingredient=ingredient,
+            confidence=IngredientConfidence(
+                quantity=crf_model.confidence.qty,
+                food=crf_model.confidence.name,
+                **crf_model.confidence.dict(),
+            ),
+        )

-    def parse(self, ingredients: list[str]) -> list[RecipeIngredient]:
-        crf_models = convert_list_to_crf_model(ingredients)
+    def parse(self, ingredients: list[str]) -> list[ParsedIngredient]:
+        crf_models = crfpp.convert_list_to_crf_model(ingredients)
        return [self._crf_to_ingredient(crf_model) for crf_model in crf_models]
+
+
+__registrar = {  # maps each RegisteredParser value to the parser class that implements it
+    RegisteredParser.nlp: NLPParser,
+    RegisteredParser.brute: BruteForceParser,
+}
+
+
+def get_parser(parser: RegisteredParser) -> ABCIngredientParser:
+    """
+    Return a new ingredient parser for the given RegisteredParser value; unknown values fall back to NLPParser.
+    """
+    parser_cls = __registrar.get(parser, NLPParser)
+    return parser_cls()
--- a/mealie/services/parser_services/ingredient_parser_service.py
+++ b/mealie/services/parser_services/ingredient_parser_service.py
@@ -1,14 +1,19 @@
 from mealie.schema.recipe import RecipeIngredient
 from mealie.services._base_http_service.http_services import UserHttpService

-from .ingredient_parser import ABCIngredientParser, CRFPPIngredientParser
+from .ingredient_parser import ABCIngredientParser, RegisteredParser, get_parser


 class IngredientParserService(UserHttpService):
-    def __init__(self, parser: ABCIngredientParser = None, *args, **kwargs) -> None:
-        self.parser: ABCIngredientParser = parser() if parser else CRFPPIngredientParser()
+    parser: ABCIngredientParser
+
+    def __init__(self, parser: RegisteredParser = RegisteredParser.nlp, *args, **kwargs) -> None:
+        self.set_parser(parser)
        super().__init__(*args, **kwargs)

+    def set_parser(self, parser: RegisteredParser) -> None:
+        self.parser = get_parser(parser)
+
    def populate_item(self) -> None:
        """Satisfy abstract method"""
        pass