Feature/CRF++ and server side locales (#731)

* add universal toast plugin * add server side locales * integrate CRF++ into CI/CD Pipeline * docs(docs): 📝 add recipe parser docs * feat(backend): ✨ Continued work on ingredient parsers * add new model dest * feat(frontend): ✨ New ingredient parser page * formatting Co-authored-by: Hayden <hay-kot@pm.me>
2025-12-15 06:45:23 -05:00 · 2021-10-09 13:08:23 -08:00
parent c16f07950f
commit 60908e5a88
43 changed files with 610 additions and 186 deletions
--- a/mealie/services/parser_services/init.py
+++ b/mealie/services/parser_services/init.py
@@ -0,0 +1 @@
+from .ingredient_parser_service import *
--- a/mealie/services/parser_services/crfpp/init.py
+++ b/mealie/services/parser_services/crfpp/init.py
--- a/mealie/services/parser_services/crfpp/pre_processor.py
+++ b/mealie/services/parser_services/crfpp/pre_processor.py
@@ -0,0 +1,97 @@
+import re
+import unicodedata
+
+# Maps an abbreviated (or already canonical) unit to the singular unit name used
+# by the rest of the pre-processing. Keys and values keep their original padding
+# for backwards compatibility; replace_common_abbreviations strips it before use.
+replace_abbreviations = {
+    "cup ": "cup ",
+    " g ": "gram ",
+    "kg ": "kilogram ",
+    "lb ": "pound ",
+    "ml ": "milliliter ",
+    "oz ": "ounce ",
+    "pint ": "pint ",
+    "qt ": "quart ",
+    "tbs ": "tablespoon ",
+    "tbsp ": "tablespoon ",
+    "tsp ": "teaspoon ",
+}
+
+
+def replace_common_abbreviations(string: str) -> str:
+    """
+    Expands abbreviated units into their full singular name.
+
+    Examples:
+    '2 tbsp sugar' -> '2 tablespoon sugar'
+    '100 g flour' -> '100 gram flour'
+
+    An abbreviation is only replaced when it stands as a whole word, so words that
+    merely contain one (e.g. 'bulb' contains 'lb') are left untouched and the
+    whitespace around a match is preserved.
+    """
+    for abbreviation, replacement in replace_abbreviations.items():
+        # \b anchors the match on word boundaries instead of relying on the padding
+        # spaces in the table, which also lets an abbreviation match at the end of the string
+        pattern = rf"\b{re.escape(abbreviation.strip())}\b"
+        string = re.sub(pattern, replacement.strip(), string)
+
+    return string
+
+
+def remove_periods(string: str) -> str:
+    """Removes every period that has a digit on neither side (e.g. 'tsp.' -> 'tsp', '1.5' is kept)"""
+    lone_period = re.compile(r"(?<!\d)\.(?!\d)")
+    return lone_period.sub("", string)
+
+
+def replace_fraction_unicode(string: str):
+    """
+    Replaces every unicode vulgar fraction with its ascii form, preceded by a space.
+
+    Examples:
+    '1½ cups' -> '1 1/2 cups'
+    '½ cup or ¼ cup' -> ' 1/2 cup or 1/4 cup'
+
+    Double spaces are collapsed once after at least one fraction was replaced; a
+    string without any vulgar fraction is returned unchanged.
+    """
+    # TODO: Needs unit tests before it can be trusted for production
+    pieces = []
+    replaced = False
+
+    for c in string:
+        # the default argument makes unnamed characters yield "" instead of raising ValueError
+        if unicodedata.name(c, "").startswith("VULGAR FRACTION"):
+            # NFKC decomposes e.g. '½' into '1', U+2044 FRACTION SLASH, '2'
+            numerator, _, denominator = unicodedata.normalize("NFKC", c).partition("\u2044")
+            pieces.append(f" {numerator}/{denominator}")
+            replaced = True
+        else:
+            pieces.append(c)
+
+    if not replaced:
+        return string
+
+    return "".join(pieces).replace("  ", " ")
+
+
+def wrap_or_clause(string: str):
+    """
+    Attempts to wrap or clauses in ()
+
+    Everything between the first ' or ' and the next comma (or the end of the string)
+    is wrapped; the text after that comma is kept as is. A string without ' or ' is
+    returned unchanged.
+
+    Examples:
+    '1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt, plus more' -> '1 teaspoon diamond crystal (or 1/2 teaspoon morton kosher salt), plus more'
+
+    """
+    # TODO: Needs more adequate testing to be sure this doesn't have side effects.
+    before, separator, after = string.partition(" or ")
+
+    if not separator:
+        return string
+
+    # partition keeps the comma and everything behind it intact, so later commas
+    # and further ' or ' occurrences are not lost
+    alternative, comma, remainder = after.partition(",")
+
+    return f"{before} (or {alternative}){comma}{remainder}".strip().removesuffix(",")
+
+
+def pre_process_string(string: str) -> str:
+    """
+    Runs the pre-processing steps that shape an ingredient line for the CRF++ model.
+    The ideal string looks something like...
+
+    {qty} {unit} {food}, {additional}
+    1 tbs. wine, expensive or other white wine, plus more
+
+    The string is lower-cased, unicode fractions are spelled out, stray periods are
+    removed and unit abbreviations are expanded, in that order. Finally, an ' or '
+    clause is wrapped in parentheses when present.
+    """
+    normalized = string.lower()
+
+    for step in (replace_fraction_unicode, remove_periods, replace_common_abbreviations):
+        normalized = step(normalized)
+
+    return wrap_or_clause(normalized) if " or " in normalized else normalized
+
+
+def main():
+    """Prints the pre-processed form of a few sample ingredient lines."""
+    # TODO: Migrate to unittests
+    samples = (
+        "1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt, plus more",
+        "1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt",
+        "¼ cup michiu tou or other rice wine",
+        "1 tbs. wine, expensive or other white wine, plus more",
+    )
+
+    print("Starting...")
+    for sample in samples:
+        print(pre_process_string(sample))
+    print("Finished...")
+
+
+if __name__ == "__main__":
+    main()
--- a/mealie/services/parser_services/crfpp/processor.py
+++ b/mealie/services/parser_services/crfpp/processor.py
@@ -0,0 +1,46 @@
+import subprocess
+import tempfile
+from fractions import Fraction
+from pathlib import Path
+
+from pydantic import BaseModel, validator
+
+from . import utils
+from .pre_processor import pre_process_string
+
+# Directory that contains this module; the model file is resolved relative to it.
+CWD = Path(__file__).parent
+# Model file handed to the `crf_test` binary through its --model option.
+MODEL_PATH = CWD / "model.crfmodel"
+
+
+class CRFIngredient(BaseModel):
+    """One ingredient as returned by the CRF++ import step; each field holds the joined text of one tag."""
+
+    input: str = ""
+    name: str = ""
+    other: str = ""
+    qty: str = ""
+    comment: str = ""
+    unit: str = ""
+
+    @validator("qty", always=True, pre=True)
+    def validate_qty(qty, values):  # sourcery skip: merge-nested-ifs
+        """
+        Supplies a quantity when none was tagged.
+
+        If `other` holds a fraction such as '1/2', that fraction (rounded to one
+        decimal) is used; in every other case the quantity defaults to 1. A
+        quantity that is already present is returned untouched.
+        """
+        if qty is None or qty == "":
+            # `other` is missing from values when its own validation failed
+            other = values.get("other")
+
+            if other is not None and "/" in other:
+                try:
+                    return round(float(Fraction(other)), 1)
+                except (ValueError, ZeroDivisionError):
+                    # `other` merely contains a slash (e.g. 'and/or') or has a zero
+                    # denominator, so fall back to the default below
+                    pass
+
+            return 1
+
+        return qty
+
+
+def _exec_crf_test(input_text):
+    """
+    Runs the `crf_test` binary on the given ingredient lines and returns its output as text.
+
+    The lines are converted with utils.export_data and written to a temporary
+    file that is removed again when the function returns.
+
+    Raises:
+        subprocess.CalledProcessError: crf_test exited with a non-zero status
+        FileNotFoundError: the crf_test binary is not on the PATH
+    """
+    # The file is written as UTF-8 explicitly so it matches the UTF-8 decoding of
+    # the output below, independent of the default encoding of the environment.
+    with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8") as input_file:
+        input_file.write(utils.export_data(input_text))
+        # flush so crf_test sees the complete content when it opens the file by name
+        input_file.flush()
+        crf_output = subprocess.check_output(["crf_test", "--verbose=1", "--model", MODEL_PATH, input_file.name])
+
+    return crf_output.decode("utf-8")
+
+
+def convert_list_to_crf_model(list_of_ingrdeint_text: list[str]):
+    """Pre-processes the ingredient lines, runs them through crf_test and returns one CRFIngredient per parsed ingredient."""
+    pre_processed = list(map(pre_process_string, list_of_ingrdeint_text))
+    crf_lines = _exec_crf_test(pre_processed).split("\n")
+
+    models = []
+    for parsed in utils.import_data(crf_lines):
+        models.append(CRFIngredient(**parsed))
+
+    return models
--- a/mealie/services/parser_services/crfpp/tokenizer.py
+++ b/mealie/services/parser_services/crfpp/tokenizer.py
@@ -0,0 +1,38 @@
+import re
+
+
+def clumpFractions(s):
+    """
+    Replaces the whitespace between the integer and fractional part of a quantity
+    with a dollar sign, so it's interpreted as a single token. The rest of the
+    string is left alone.
+        clumpFractions("aaa 1 2/3 bbb")
+        # => "aaa 1$2/3 bbb"
+        clumpFractions("1 11/16 inch")
+        # => "1$11/16 inch"
+    """
+
+    # \d+ on both sides of the slash so multi-digit numerators are clumped as well
+    return re.sub(r"(\d+)\s+(\d+)/(\d+)", r"\1$\2/\3", s)
+
+
+def tokenize(s):
+    """
+    Tokenize on parenthesis, punctuation, spaces and American units followed by a slash.
+    We sometimes give American units and metric units for baking recipes. For example:
+        * 2 tablespoons/30 mililiters milk or cream
+        * 2 1/2 cups/300 grams all-purpose flour
+    The recipe database only allows for one unit, and we want to use the American one.
+    But we must split the text on "cups/" etc. in order to pick it up.
+
+    Returns the list of non-empty tokens; commas and parentheses are tokens of their own.
+    """
+
+    # handle abbreviation like "100g" by treating it as "100 grams"
+    # the trailing \b keeps already spelled out units intact ("100grams" must not
+    # become "100 gramsrams")
+    s = re.sub(r"(\d+)g\b", r"\1 grams", s)
+    s = re.sub(r"(\d+)oz\b", r"\1 ounces", s)
+    s = re.sub(r"(\d+)ml\b", r"\1 milliliters", s, flags=re.IGNORECASE)
+
+    # TODO: Replace american_units with list of units from database?
+    american_units = ["cup", "tablespoon", "teaspoon", "pound", "ounce", "quart", "pint"]
+    # The following removes slashes following American units and replaces it with a space.
+    for unit in american_units:
+        s = s.replace(unit + "/", unit + " ")
+        s = s.replace(unit + "s/", unit + "s ")
+
+    # the capture group keeps the separators in the result; whitespace-only pieces are dropped
+    return [token.strip() for token in re.split(r"([,()\s]{1})", clumpFractions(s)) if token and token.strip()]
--- a/mealie/services/parser_services/crfpp/utils.py
+++ b/mealie/services/parser_services/crfpp/utils.py
@@ -0,0 +1,282 @@
+import re
+
+from . import tokenizer
+
+
+def joinLine(columns):
+    """Joins the given columns into one tab separated line."""
+    separator = "\t"
+    return separator.join(columns)
+
+
+def cleanUnicodeFractions(s):
+    """
+    Replace unicode fractions with ascii representation, preceded by a
+    space.
+
+    "1⅞" => "1 7/8"
+    """
+
+    # Code points above U+00FF need the four digit "\u" escape: "\x" only consumes
+    # two hex digits, so "\x215b" would be the three characters "!5b".
+    fractions = {
+        "\u215b": "1/8",
+        "\u215c": "3/8",
+        "\u215d": "5/8",
+        "\u215e": "7/8",
+        "\u2159": "1/6",
+        "\u215a": "5/6",
+        "\u2155": "1/5",
+        "\u2156": "2/5",
+        "\u2157": "3/5",
+        "\u2158": "4/5",
+        "\xbc": "1/4",
+        "\xbe": "3/4",
+        "\u2153": "1/3",
+        "\u2154": "2/3",
+        "\xbd": "1/2",
+    }
+
+    for f_unicode, f_ascii in fractions.items():
+        s = s.replace(f_unicode, " " + f_ascii)
+
+    return s
+
+
+def unclump(s):
+    """
+    Turns every $ back into a space. The reverse of clumpFractions.
+    """
+    return s.replace("$", " ")
+
+
+def normalizeToken(s):
+    """
+    Normalizes a token, which currently only means singularizing known unit names.
+
+    ToDo: FIX THIS. We used to use the pattern.en package to singularize words, but
+    in the name of simple deployments, we took it out. We should fix this at some
+    point.
+    """
+    normalized = singularize(s)
+    return normalized
+
+
+def getFeatures(token, index, tokens):
+    """
+    Builds the feature columns for one token: its index ("I..."), the length bucket
+    of the whole phrase ("L..."), and whether it is capitalized / inside parenthesis.
+    """
+    index_feature = f"I{index}"
+    length_feature = f"L{lengthGroup(len(tokens))}"
+    cap_feature = "YesCAP" if isCapitalized(token) else "NoCAP"
+    paren_feature = "YesPAREN" if insideParenthesis(token, tokens) else "NoPAREN"
+
+    return [index_feature, length_feature, cap_feature, paren_feature]
+
+
+def singularize(word):
+    """
+    Maps a known plural unit to its singular form; any other word is returned as is.
+    A poor replacement for the pattern.en singularize function, but ok for now.
+    """
+
+    plural_to_singular = {
+        "cups": "cup",
+        "tablespoons": "tablespoon",
+        "teaspoons": "teaspoon",
+        "pounds": "pound",
+        "ounces": "ounce",
+        "cloves": "clove",
+        "sprigs": "sprig",
+        "pinches": "pinch",
+        "bunches": "bunch",
+        "slices": "slice",
+        "grams": "gram",
+        "heads": "head",
+        "quarts": "quart",
+        "stalks": "stalk",
+        "pints": "pint",
+        "pieces": "piece",
+        "sticks": "stick",
+        "dashes": "dash",
+        "fillets": "fillet",
+        "cans": "can",
+        "ears": "ear",
+        "packages": "package",
+        "strips": "strip",
+        "bulbs": "bulb",
+        "bottles": "bottle",
+    }
+
+    return plural_to_singular.get(word, word)
+
+
+def isCapitalized(token):
+    """
+    Tells whether the token starts with an ascii capital letter (A-Z).
+    """
+    return bool(re.match(r"^[A-Z]", token))
+
+
+def lengthGroup(actualLength):
+    """
+    Buckets the length of the ingredient into 6 buckets: the first of
+    4, 8, 12, 16, 20 that is greater than the length, or "X" for anything longer.
+    """
+    upper_bounds = (4, 8, 12, 16, 20)
+    return next((str(bound) for bound in upper_bounds if actualLength < bound), "X")
+
+
+def insideParenthesis(token, tokens):
+    """
+    Returns true if the word is inside parenthesis in the phrase.
+
+    A parenthesis token itself always counts as inside. For any other token the
+    phrase is searched for the token text somewhere between an opening and a
+    later closing parenthesis (a substring match, not a whole-token match).
+    """
+    if token in ["(", ")"]:
+        return True
+
+    line = " ".join(tokens)
+    # raw strings keep "\(" and "\)" as regex escapes without relying on Python
+    # passing unknown string escapes through
+    pattern = r".*\(.*" + re.escape(token) + r".*\).*"
+    return re.match(pattern, line) is not None
+
+
+def displayIngredient(ingredient):
+    """
+    Format a list of (tag, [tokens]) tuples as an HTML string for display.
+    Each tuple becomes one span and the spans are concatenated without a separator.
+
+        displayIngredient([("qty", ["1"]), ("name", ["cat", "pie"])])
+        # => <span class='qty'>1</span><span class='name'>cat pie</span>
+    """
+
+    spans = (f"<span class='{tag}'>{' '.join(tokens)}</span>" for tag, tokens in ingredient)
+    return "".join(spans)
+
+
+# HACK: fix this
+def smartJoin(words):
+    """
+    Joins list of words with spaces, then removes the space before a comma,
+    after an opening parenthesis and before a closing parenthesis.
+    """
+
+    joined = " ".join(words)
+
+    # " , " -> ", "   then   "( " -> "("   then   " )" -> ")"
+    return joined.replace(" , ", ", ").replace("( ", "(").replace(" )", ")")
+
+
+def import_data(lines):
+    """
+    This thing takes the output of CRF++ and turns it into an actual
+    data structure.
+
+    Returns a list with one dict per ingredient. Each dict maps the lower-cased
+    tags found for that ingredient (e.g. "qty", "unit", "name", "comment") to
+    their joined tokens and additionally carries "display" (HTML markup of the
+    tagged tokens) and "input" (the reassembled ingredient phrase).
+    """
+    data = [{}]
+    display = [[]]
+    prevTag = None
+    #
+    # iterate lines in the data file, which looks like:
+    #
+    #   # 0.511035
+    #   1/2       I1  L12  NoCAP  X  B-QTY/0.982850
+    #   teaspoon  I2  L12  NoCAP  X  B-UNIT/0.982200
+    #   fresh     I3  L12  NoCAP  X  B-COMMENT/0.716364
+    #   thyme     I4  L12  NoCAP  X  B-NAME/0.816803
+    #   leaves    I5  L12  NoCAP  X  I-NAME/0.960524
+    #   ,         I6  L12  NoCAP  X  B-COMMENT/0.772231
+    #   finely    I7  L12  NoCAP  X  I-COMMENT/0.825956
+    #   chopped   I8  L12  NoCAP  X  I-COMMENT/0.893379
+    #
+    #   # 0.505999
+    #   Black   I1  L8  YesCAP  X  B-NAME/0.765461
+    #   pepper  I2  L8  NoCAP   X  I-NAME/0.756614
+    #   ,       I3  L8  NoCAP   X  OTHER/0.798040
+    #   to      I4  L8  NoCAP   X  B-COMMENT/0.683089
+    #   taste   I5  L8  NoCAP   X  I-COMMENT/0.848617
+    #
+    # i.e. the output of crf_test -v 1
+    #
+    for line in lines:
+        # blank (or whitespace only) line starts a new ingredient
+        if not line.strip():
+            data.append({})
+            display.append([])
+            prevTag = None
+
+        # ignore comments; a token that itself starts with "#" sits on a line with
+        # tab separated columns and must not be mistaken for a comment
+        elif line[0] == "#" and "\t" not in line:
+            pass
+
+        # otherwise it's a token
+        # e.g.: potato \t I2 \t L5 \t NoCAP \t B-NAME/0.978253
+        else:
+
+            columns = re.split("\t", line.strip())
+            token = columns[0].strip()
+
+            # unclump fractions
+            token = unclump(token)
+
+            # turn B-NAME/123 back into "name"; the confidence behind the slash is not used
+            tag = columns[-1].partition("/")[0]
+            tag = re.sub(r"^[BI]\-", "", tag).lower()
+
+            # ---- DISPLAY ----
+            # build a structure which groups each token by its tag, so we can
+            # rebuild the original display name later.
+
+            if prevTag != tag:
+                display[-1].append((tag, [token]))
+                prevTag = tag
+
+            else:
+                display[-1][-1][1].append(token)
+                #               ^- token
+                #            ^---- tag
+                #        ^-------- ingredient
+
+            # ---- DATA ----
+            # build a dict grouping tokens by their tag
+
+            # initialize this attribute if this is the first token of its kind
+            if tag not in data[-1]:
+                data[-1][tag] = []
+
+            # HACK: If this token is a unit, singularize it so Scoop accepts it.
+            if tag == "unit":
+                token = singularize(token)
+
+            data[-1][tag].append(token)
+
+    # reassemble the output into a list of dicts.
+    # data and display are walked in lockstep so that skipping an empty ingredient
+    # (e.g. after consecutive blank lines) can never pair an ingredient with the
+    # display tokens of another one.
+    output = []
+    for ingredient, ingredient_display in zip(data, display):
+        if not ingredient:
+            continue
+
+        entry = {k: smartJoin(tokens) for k, tokens in ingredient.items()}
+        # Add the marked-up display data
+        entry["display"] = displayIngredient(ingredient_display)
+        # Add the raw ingredient phrase
+        entry["input"] = smartJoin([" ".join(tokens) for _, tokens in ingredient_display])
+        output.append(entry)
+
+    return output
+
+
+def export_data(lines):
+    """
+    Parse "raw" ingredient lines into CRF-ready output: one tab separated row of
+    token + features per token, with an empty row closing every ingredient.
+    """
+    rows = []
+    for raw_line in lines:
+        # drop anything that looks like an HTML tag before tokenizing
+        tokens = tokenizer.tokenize(re.sub("<[^<]+?>", "", raw_line))
+
+        rows.extend(
+            joinLine([token] + getFeatures(token, position, tokens)) for position, token in enumerate(tokens, start=1)
+        )
+        rows.append("")
+
+    return "\n".join(rows)
--- a/mealie/services/parser_services/ingredient_parser.py
+++ b/mealie/services/parser_services/ingredient_parser.py
@@ -0,0 +1,55 @@
+from abc import ABC, abstractmethod
+from fractions import Fraction
+
+from mealie.core.root_logger import get_logger
+from mealie.schema.recipe import RecipeIngredient
+from mealie.schema.recipe.recipe_ingredient import CreateIngredientFood, CreateIngredientUnit
+
+from .crfpp.processor import CRFIngredient, convert_list_to_crf_model
+
+logger = get_logger(__name__)
+
+
+class ABCIngredientParser(ABC):
+    """
+    Interface every ingredient parser has to implement.
+    """
+
+    @abstractmethod
+    def parse(self, ingredients: list[str]) -> list[RecipeIngredient]:
+        """Turns raw ingredient strings into RecipeIngredient objects."""
+
+
+class CRFPPIngredientParser(ABCIngredientParser):
+    """
+    Ingredient parser backed by the CRF++ model.
+    """
+
+    def __init__(self) -> None:
+        pass
+
+    def _crf_to_ingredient(self, crf_model: CRFIngredient) -> RecipeIngredient:
+        """
+        Converts one CRF result into a RecipeIngredient. When the conversion fails for
+        any reason, the error is logged and the original input is kept as the note.
+        """
+        try:
+            # "1 1/2" -> 1.5: every whitespace separated part is read as a fraction
+            quantity = float(sum(Fraction(part) for part in crf_model.qty.split()))
+
+            return RecipeIngredient(
+                title="",
+                note=crf_model.comment,
+                unit=CreateIngredientUnit(name=crf_model.unit),
+                food=CreateIngredientFood(name=crf_model.name),
+                disable_amount=False,
+                quantity=quantity,
+            )
+        except Exception as e:
+            # TODO: Capture some sort of state for the user to see that an exception occurred
+            logger.exception(e)
+            return RecipeIngredient(
+                title="",
+                note=crf_model.input,
+            )
+
+    def parse(self, ingredients: list[str]) -> list[RecipeIngredient]:
+        """Runs the ingredient lines through CRF++ and converts every result."""
+        return list(map(self._crf_to_ingredient, convert_list_to_crf_model(ingredients)))
--- a/mealie/services/parser_services/ingredient_parser_service.py
+++ b/mealie/services/parser_services/ingredient_parser_service.py
@@ -0,0 +1,28 @@
+from mealie.schema.recipe import RecipeIngredient
+from mealie.services._base_http_service.http_services import UserHttpService
+
+from .ingredient_parser import ABCIngredientParser, CRFPPIngredientParser
+
+
+class IngredientParserService(UserHttpService):
+    """Service that parses ingredient strings with a pluggable parser (CRF++ unless another one is given)."""
+
+    def __init__(self, parser: ABCIngredientParser = None, *args, **kwargs) -> None:
+        # `parser` is called without arguments, so a parser class (or factory) is expected here
+        parser_factory = parser or CRFPPIngredientParser
+        self.parser: ABCIngredientParser = parser_factory()
+        super().__init__(*args, **kwargs)
+
+    def populate_item(self) -> None:
+        """Satisfy abstract method"""
+
+    def parse_ingredient(self, ingredient: str) -> RecipeIngredient:
+        """Parses a single ingredient line; yields None when the parser returned nothing."""
+        parsed = self.parser.parse([ingredient])
+
+        # TODO: Raise Exception
+        return parsed[0] if parsed else None
+
+    def parse_ingredients(self, ingredients: list[str]) -> list[RecipeIngredient]:
+        """Parses several ingredient lines; yields None when the parser returned nothing."""
+        parsed = self.parser.parse(ingredients)
+
+        # TODO: Raise Exception
+        return parsed or None