update NLP for ingredients

2026-06-12 20:10:14 -04:00 · 2021-08-29 17:10:51 -08:00
parent 086098899d
commit 2e6352cfbd
7 changed files with 164 additions and 22 deletions
--- a/mealie/services/backups/imports.py
+++ b/mealie/services/backups/imports.py
@@ -22,7 +22,7 @@ from mealie.schema.admin import (
 )
 from mealie.schema.events import EventNotificationIn
 from mealie.schema.recipe import CommentOut, Recipe
-from mealie.schema.user import UpdateGroup, PrivateUser
+from mealie.schema.user import PrivateUser, UpdateGroup
 from mealie.services.image import minify


--- a/mealie/services/scraper/ingredient_nlp/pre_processor.py
+++ b/mealie/services/scraper/ingredient_nlp/pre_processor.py
@@ -0,0 +1,97 @@
+import re
+import unicodedata
+
+replace_abbreviations = {
+    "cup ": "cup ",
+    "g ": "gram ",
+    "kg ": "kilogram ",
+    "lb ": "pound ",
+    "ml ": "milliliter ",
+    "oz ": "ounce ",
+    "pint ": "pint ",
+    "qt ": "quart ",
+    "tbs ": "tablespoon ",
+    "tbsp ": "tablespoon ",
+    "tsp ": "teaspoon ",
+}
+
+
+def replace_common_abbreviations(string: str) -> str:
+    for k, v in replace_abbreviations.items():
+        string = string.replace(k, v)
+
+    return string
+
+
+def remove_periods(string: str) -> str:
+    """Removes periods that are neither preceded nor followed by a digit"""
+    return re.sub(r"(?<!\d)\.(?!\d)", "", string)
+
+
+def replace_fraction_unicode(string: str):
+    # TODO: Not confident this works well enough for production; needs some testing and/or refactoring
+    # TODO: Breaks on multiple unicode fractions (only the first fraction character found is replaced)
+    for c in string:
+        try:
+            name = unicodedata.name(c)
+        except ValueError:
+            continue
+        if name.startswith("VULGAR FRACTION"):
+            normalized = unicodedata.normalize("NFKC", c)
+            numerator, _slash, denominator = normalized.partition("⁄")
+            text = f" {numerator}/{denominator}"
+            return string.replace(c, text).replace("  ", " ")
+
+    return string
+
+
+def wrap_or_clause(string: str):
+    """
+    Attempts to wrap "or" clauses in parentheses
+
+    Examples:
+    '1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt, plus more' -> '1 teaspoon diamond crystal (or 1/2 teaspoon morton kosher salt), plus more'
+
+    """
+    # TODO: Needs more adequate testing to be sure this doesn't have side effects.
+    split_by_or = string.split(" or ")
+
+    split_by_comma = split_by_or[1].split(",")
+
+    if len(split_by_comma) > 0:
+        return f"{split_by_or[0]} (or {split_by_comma[0]}),{''.join(split_by_comma[1:])}".strip().removesuffix(",")
+
+    return string
+
+
+def pre_process_string(string: str) -> str:
+    """
+    Series of preprocessing functions to make best use of the CRF++ model. The ideal string looks something like...
+
+    {qty} {unit} {food}, {additional}
+    1 tbs. wine, expensive or other white wine, plus more
+
+    """
+    string = string.lower()
+    string = replace_fraction_unicode(string)
+    string = remove_periods(string)
+    string = replace_common_abbreviations(string)
+
+    if " or " in string:
+        string = wrap_or_clause(string)
+
+    return string
+
+
+def main():
+    # TODO: Migrate to unittests
+    print("Starting...")
+    print(pre_process_string("1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt, plus more"))
+    print(pre_process_string("1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt"))
+    print(pre_process_string("¼ cup michiu tou or other rice wine"))
+    print(pre_process_string("1 tbs. wine, expensive or other white wine, plus more"))
+    print("Finished...")
+
+
+if __name__ == "__main__":
+    main()
--- a/mealie/services/scraper/ingredient_nlp/processor.py
+++ b/mealie/services/scraper/ingredient_nlp/processor.py
@@ -1,17 +1,17 @@
 import subprocess
 import tempfile
-import unicodedata
 from fractions import Fraction
 from pathlib import Path
 from typing import Optional

-from pydantic import BaseModel
+from pydantic import BaseModel, validator

 from mealie.core.config import settings
 from mealie.schema.recipe import RecipeIngredient
 from mealie.schema.recipe.recipe_ingredient import CreateIngredientFood, CreateIngredientUnit

 from . import utils
+from .pre_processor import pre_process_string

 CWD = Path(__file__).parent
 MODEL_PATH = CWD / "model.crfmodel"
@@ -33,6 +33,17 @@ class CRFIngredient(BaseModel):
    comment: Optional[str] = ""
    unit: Optional[str] = ""

+    @validator("qty", always=True, pre=True)
+    def validate_qty(qty, values):  # sourcery skip: merge-nested-ifs
+        if qty is None or qty == "":
+            # Check if other contains a fraction
+            if values["other"] is not None and values["other"].find("/") != -1:
+                return float(Fraction(values["other"])).__round__(1)
+            else:
+                return 1
+
+        return qty
+

 def _exec_crf_test(input_text):
    with tempfile.NamedTemporaryFile(mode="w") as input_file:
@@ -43,24 +54,8 @@ def _exec_crf_test(input_text):
        )


-def fraction_finder(string: str):
-    # TODO: I'm not confident this works well enough for production needs some testing and/or refacorting
-    for c in string:
-        try:
-            name = unicodedata.name(c)
-        except ValueError:
-            continue
-        if name.startswith("VULGAR FRACTION"):
-            normalized = unicodedata.normalize("NFKC", c)
-            numerator, _slash, denominator = normalized.partition("⁄")
-            text = f"{numerator}/{denominator}"
-            return string.replace(c, text)
-
-    return string
-
-
 def convert_list_to_crf_model(list_of_ingrdeint_text: list[str]):
-    crf_output = _exec_crf_test([fraction_finder(x) for x in list_of_ingrdeint_text])
+    crf_output = _exec_crf_test([pre_process_string(x) for x in list_of_ingrdeint_text])

    crf_models = [CRFIngredient(**ingredient) for ingredient in utils.import_data(crf_output.split("\n"))]

@@ -89,4 +84,4 @@ if __name__ == "__main__":
    ingredients = convert_crf_models_to_ingredients(crf_models)

    for ingredient in ingredients:
-        print(ingredient)
+        print(ingredient.input)
--- a/mealie/services/scraper/ingredient_nlp/tokenizer.py
+++ b/mealie/services/scraper/ingredient_nlp/tokenizer.py
@@ -28,6 +28,7 @@ def tokenize(s):
    s = re.sub(r"(\d+)oz", r"\1 ounces", s)
    s = re.sub(r"(\d+)ml", r"\1 milliliters", s, flags=re.IGNORECASE)

+    # TODO: Replace american_units with list of units from database?
    american_units = ["cup", "tablespoon", "teaspoon", "pound", "ounce", "quart", "pint"]
    # The following removes slashes following American units and replaces it with a space.
    for unit in american_units:
--- a/mealie/services/scraper/ingredient_nlp/unicode_fraction_dict.py
+++ b/mealie/services/scraper/ingredient_nlp/unicode_fraction_dict.py
--- a/mealie/services/scraper/ingredient_nlp/utils.py
+++ b/mealie/services/scraper/ingredient_nlp/utils.py
@@ -47,7 +47,7 @@ def unclump(s):

 def normalizeToken(s):
    """
-    ToDo: FIX THIS. We used to use the pattern.en package to singularize words, but
+    TODO: FIX THIS. We used to use the pattern.en package to singularize words, but
    in the name of simple deployments, we took it out. We should fix this at some
    point.
    """
@@ -222,6 +222,20 @@ def import_data(lines):
            tag, confidence = re.split(r"/", columns[-1], 1)
            tag = re.sub(r"^[BI]\-", "", tag).lower()

+            # TODO: Integrate Confidence into API Response
+            print("Confidence", confidence)
+
+            # new token
+            if prevTag != tag or token == "n/a":
+                display[-1].append((tag, [token]))
+                data[-1][tag] = []
+                prevTag = tag
+
+            # continuation
+            else:
+                display[-1][-1][1].append(token)
+                data[-1][tag].append(token)
+
            # ---- DISPLAY ----
            # build a structure which groups each token by its tag, so we can
            # rebuild the original display name later.
--- a/tests/unit_tests/test_nlp_parser.py
+++ b/tests/unit_tests/test_nlp_parser.py
@@ -0,0 +1,35 @@
+from dataclasses import dataclass
+from fractions import Fraction
+
+from mealie.services.scraper.ingredient_nlp.processor import CRFIngredient, convert_list_to_crf_model
+
+
+@dataclass
+class TestIngredient:
+    input: str
+    quantity: float
+
+
+test_ingredients = [
+    TestIngredient("½ cup all-purpose flour", 0.5),
+    TestIngredient("1 ½ teaspoons ground black pepper", 1.5),
+    TestIngredient("⅔ cup unsweetened flaked coconut", 0.7),
+    TestIngredient("⅓ cup panko bread crumbs", 0.3),
+]
+
+
+def test_nlp_parser():
+    models: list[CRFIngredient] = convert_list_to_crf_model([x.input for x in test_ingredients])
+
+    # Iterate over models and test_ingredients together
+    print()
+    for model, test_ingredient in zip(models, test_ingredients):
+        print("Testing:", test_ingredient.input, end="")
+
+        assert float(sum(Fraction(s) for s in model.qty.split())) == test_ingredient.quantity
+
+        print(" ✅ Passed")
+
+
+if __name__ == "__main__":
+    test_nlp_parser()