mirror of
				https://github.com/mealie-recipes/mealie.git
				synced 2025-10-31 10:13:32 -04:00 
			
		
		
		
	update NLP for ingredients
This commit is contained in:
		| @@ -22,7 +22,7 @@ from mealie.schema.admin import ( | |||||||
| ) | ) | ||||||
| from mealie.schema.events import EventNotificationIn | from mealie.schema.events import EventNotificationIn | ||||||
| from mealie.schema.recipe import CommentOut, Recipe | from mealie.schema.recipe import CommentOut, Recipe | ||||||
| from mealie.schema.user import UpdateGroup, PrivateUser | from mealie.schema.user import PrivateUser, UpdateGroup | ||||||
| from mealie.services.image import minify | from mealie.services.image import minify | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										97
									
								
								mealie/services/scraper/ingredient_nlp/pre_processor.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										97
									
								
								mealie/services/scraper/ingredient_nlp/pre_processor.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,97 @@ | |||||||
|  | import re | ||||||
|  | import unicodedata | ||||||
|  |  | ||||||
# Maps an abbreviated unit (with its trailing space) to the spelled-out unit.
# The trailing space is part of the key, so an abbreviation only matches when
# it is followed by a space.
replace_abbreviations = {
    "cup ": "cup ",
    "g ": "gram ",
    "kg ": "kilogram ",
    "lb ": "pound ",
    "ml ": "milliliter ",
    "oz ": "ounce ",
    "pint ": "pint ",
    "qt ": "quart ",
    "tbs ": "tablespoon ",
    "tbsp ": "tablespoon ",
    "tsp ": "teaspoon ",
}

# A single alternation over every abbreviation, longest first, so that "kg "
# is matched as a whole instead of being rewritten as "k" + "g ". The
# look-behind rejects a match that would start in the middle of a word
# ("egg " must not become "eggram "); a preceding digit is still allowed so
# "100g " keeps being expanded.
_ABBREVIATION_PATTERN = re.compile(
    r"(?<![^\W\d_])(?:"
    + "|".join(re.escape(abbr) for abbr in sorted(replace_abbreviations, key=len, reverse=True))
    + ")"
)


def replace_common_abbreviations(string: str) -> str:
    """Expand the unit abbreviations listed in ``replace_abbreviations``.

    Args:
        string: The ingredient text to expand.

    Returns:
        The text with every abbreviation that is followed by a space, and is
        not the tail of a longer word, replaced by its spelled-out form.
    """
    return _ABBREVIATION_PATTERN.sub(lambda match: replace_abbreviations[match.group(0)], string)
|  |  | ||||||
|  |  | ||||||
def remove_periods(string: str) -> str:
    """Remove periods that are not surrounded by digits.

    A period is dropped only when the character before it is not a digit and
    the character after it is not a digit, so ``tsp.`` loses its dot while
    ``1.5`` is left alone.
    """
    lone_period = re.compile(r"(?<!\d)\.(?!\d)")
    return lone_period.sub("", string)
|  |  | ||||||
|  |  | ||||||
def replace_fraction_unicode(string: str) -> str:
    """Rewrite unicode vulgar fractions (``½``, ``¼`` ...) as ``numerator/denominator``.

    Every vulgar-fraction character in the string is converted, not just the
    first one found. Each replacement is preceded by a space so that ``1½``
    becomes ``1 1/2``; doubled spaces created that way are collapsed.

    Args:
        string: The ingredient text to normalize.

    Returns:
        The text with all vulgar fractions spelled out in ASCII. A string
        without any vulgar fraction is returned unchanged.
    """
    # TODO: Needs more testing before it can be fully trusted in production.
    pieces = []
    found_fraction = False

    for char in string:
        # Characters without a unicode name yield "" instead of raising ValueError.
        if unicodedata.name(char, "").startswith("VULGAR FRACTION"):
            # NFKC turns e.g. "½" into "1", FRACTION SLASH (U+2044), "2".
            numerator, _slash, denominator = unicodedata.normalize("NFKC", char).partition("\u2044")
            pieces.append(f" {numerator}/{denominator}")
            found_fraction = True
        else:
            pieces.append(char)

    if not found_fraction:
        return string

    return "".join(pieces).replace("  ", " ")
|  |  | ||||||
|  |  | ||||||
def wrap_or_clause(string: str) -> str:
    """
    Attempts to wrap "or" clauses in ().

    The text between the first " or " and the next comma (or the end of the
    string) is wrapped as "(or ...)". Anything after that comma is kept as is.
    A string without " or " is returned unchanged.

    Examples:
    '1 teaspoon diamond crystal or 1/2 teaspoon morton kosher salt, plus more'
        -> '1 teaspoon diamond crystal (or 1/2 teaspoon morton kosher salt), plus more'

    """
    # TODO: Needs more adequate testing to be sure this doesn't have side effects.
    # partition() splits on the first " or " only, so later "or"s stay in the text.
    before, separator, after = string.partition(" or ")
    if not separator:
        return string

    # Only the first comma ends the alternative; later commas are kept as written.
    alternative, comma, remainder = after.partition(",")

    return f"{before} (or {alternative}){comma}{remainder}".strip().removesuffix(",")
|  |  | ||||||
|  |  | ||||||
def pre_process_string(string: str) -> str:
    """
    Runs the ingredient text through each preprocessing step in turn so it is in the best shape for the CRF++ model.
    The ideal string looks something like...

    {qty} {unit} {food}, {additional}
    1 tbs. wine, expensive or other white wine, plus more

    Steps, in order: lowercase, unicode fractions to ASCII, drop stray periods, expand unit abbreviations, and
    finally wrap an " or " alternative in parentheses when one is present.
    """
    cleaned = string.lower()

    for step in (replace_fraction_unicode, remove_periods, replace_common_abbreviations):
        cleaned = step(cleaned)

    return wrap_or_clause(cleaned) if " or " in cleaned else cleaned
|  |  | ||||||
|  |  | ||||||
def main():
    """Print the preprocessed form of a few sample ingredient strings."""
    # TODO: Migrate to unittests
    samples = (
        "1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt, plus more",
        "1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt",
        "¼ cup michiu tou or other rice wine",
        "1 tbs. wine, expensive or other white wine, plus more",
    )

    print("Starting...")
    for sample in samples:
        print(pre_process_string(sample))
    print("Finished...")


if __name__ == "__main__":
    main()
| @@ -1,17 +1,17 @@ | |||||||
| import subprocess | import subprocess | ||||||
| import tempfile | import tempfile | ||||||
| import unicodedata |  | ||||||
| from fractions import Fraction | from fractions import Fraction | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| from typing import Optional | from typing import Optional | ||||||
|  |  | ||||||
| from pydantic import BaseModel | from pydantic import BaseModel, validator | ||||||
|  |  | ||||||
| from mealie.core.config import settings | from mealie.core.config import settings | ||||||
| from mealie.schema.recipe import RecipeIngredient | from mealie.schema.recipe import RecipeIngredient | ||||||
| from mealie.schema.recipe.recipe_ingredient import CreateIngredientFood, CreateIngredientUnit | from mealie.schema.recipe.recipe_ingredient import CreateIngredientFood, CreateIngredientUnit | ||||||
|  |  | ||||||
| from . import utils | from . import utils | ||||||
|  | from .pre_processor import pre_process_string | ||||||
|  |  | ||||||
| CWD = Path(__file__).parent | CWD = Path(__file__).parent | ||||||
| MODEL_PATH = CWD / "model.crfmodel" | MODEL_PATH = CWD / "model.crfmodel" | ||||||
| @@ -33,6 +33,17 @@ class CRFIngredient(BaseModel): | |||||||
|     comment: Optional[str] = "" |     comment: Optional[str] = "" | ||||||
|     unit: Optional[str] = "" |     unit: Optional[str] = "" | ||||||
|  |  | ||||||
|  |     @validator("qty", always=True, pre=True) | ||||||
|  |     def validate_qty(qty, values):  # sourcery skip: merge-nested-ifs | ||||||
|  |         if qty is None or qty == "": | ||||||
|  |             # Check if other contains a fraction | ||||||
|  |             if values["other"] is not None and values["other"].find("/") != -1: | ||||||
|  |                 return float(Fraction(values["other"])).__round__(1) | ||||||
|  |             else: | ||||||
|  |                 return 1 | ||||||
|  |  | ||||||
|  |         return qty | ||||||
|  |  | ||||||
|  |  | ||||||
| def _exec_crf_test(input_text): | def _exec_crf_test(input_text): | ||||||
|     with tempfile.NamedTemporaryFile(mode="w") as input_file: |     with tempfile.NamedTemporaryFile(mode="w") as input_file: | ||||||
| @@ -43,24 +54,8 @@ def _exec_crf_test(input_text): | |||||||
|         ) |         ) | ||||||
|  |  | ||||||
|  |  | ||||||
| def fraction_finder(string: str): |  | ||||||
|     # TODO: I'm not confident this works well enough for production needs some testing and/or refacorting |  | ||||||
|     for c in string: |  | ||||||
|         try: |  | ||||||
|             name = unicodedata.name(c) |  | ||||||
|         except ValueError: |  | ||||||
|             continue |  | ||||||
|         if name.startswith("VULGAR FRACTION"): |  | ||||||
|             normalized = unicodedata.normalize("NFKC", c) |  | ||||||
|             numerator, _slash, denominator = normalized.partition("⁄") |  | ||||||
|             text = f"{numerator}/{denominator}" |  | ||||||
|             return string.replace(c, text) |  | ||||||
|  |  | ||||||
|     return string |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def convert_list_to_crf_model(list_of_ingrdeint_text: list[str]): | def convert_list_to_crf_model(list_of_ingrdeint_text: list[str]): | ||||||
|     crf_output = _exec_crf_test([fraction_finder(x) for x in list_of_ingrdeint_text]) |     crf_output = _exec_crf_test([pre_process_string(x) for x in list_of_ingrdeint_text]) | ||||||
|  |  | ||||||
|     crf_models = [CRFIngredient(**ingredient) for ingredient in utils.import_data(crf_output.split("\n"))] |     crf_models = [CRFIngredient(**ingredient) for ingredient in utils.import_data(crf_output.split("\n"))] | ||||||
|  |  | ||||||
| @@ -89,4 +84,4 @@ if __name__ == "__main__": | |||||||
|     ingredients = convert_crf_models_to_ingredients(crf_models) |     ingredients = convert_crf_models_to_ingredients(crf_models) | ||||||
|  |  | ||||||
|     for ingredient in ingredients: |     for ingredient in ingredients: | ||||||
|         print(ingredient) |         print(ingredient.input) | ||||||
|   | |||||||
| @@ -28,6 +28,7 @@ def tokenize(s): | |||||||
|     s = re.sub(r"(\d+)oz", r"\1 ounces", s) |     s = re.sub(r"(\d+)oz", r"\1 ounces", s) | ||||||
|     s = re.sub(r"(\d+)ml", r"\1 milliliters", s, flags=re.IGNORECASE) |     s = re.sub(r"(\d+)ml", r"\1 milliliters", s, flags=re.IGNORECASE) | ||||||
|  |  | ||||||
|  |     # TODO: Replace american_units with list of units from database? | ||||||
|     american_units = ["cup", "tablespoon", "teaspoon", "pound", "ounce", "quart", "pint"] |     american_units = ["cup", "tablespoon", "teaspoon", "pound", "ounce", "quart", "pint"] | ||||||
|     # The following removes slashes following American units and replaces it with a space. |     # The following removes slashes following American units and replaces it with a space. | ||||||
|     for unit in american_units: |     for unit in american_units: | ||||||
|   | |||||||
| @@ -47,7 +47,7 @@ def unclump(s): | |||||||
|  |  | ||||||
| def normalizeToken(s): | def normalizeToken(s): | ||||||
|     """ |     """ | ||||||
|     ToDo: FIX THIS. We used to use the pattern.en package to singularize words, but |     TODO: FIX THIS. We used to use the pattern.en package to singularize words, but | ||||||
|     in the name of simple deployments, we took it out. We should fix this at some |     in the name of simple deployments, we took it out. We should fix this at some | ||||||
|     point. |     point. | ||||||
|     """ |     """ | ||||||
| @@ -222,6 +222,20 @@ def import_data(lines): | |||||||
|             tag, confidence = re.split(r"/", columns[-1], 1) |             tag, confidence = re.split(r"/", columns[-1], 1) | ||||||
|             tag = re.sub(r"^[BI]\-", "", tag).lower() |             tag = re.sub(r"^[BI]\-", "", tag).lower() | ||||||
|  |  | ||||||
|  |             # TODO: Integrate Confidence into API Response | ||||||
|  |             print("Confidence", confidence) | ||||||
|  |  | ||||||
|  |             # new token | ||||||
|  |             if prevTag != tag or token == "n/a": | ||||||
|  |                 display[-1].append((tag, [token])) | ||||||
|  |                 data[-1][tag] = [] | ||||||
|  |                 prevTag = tag | ||||||
|  |  | ||||||
|  |             # continuation | ||||||
|  |             else: | ||||||
|  |                 display[-1][-1][1].append(token) | ||||||
|  |                 data[-1][tag].append(token) | ||||||
|  |  | ||||||
|             # ---- DISPLAY ---- |             # ---- DISPLAY ---- | ||||||
|             # build a structure which groups each token by its tag, so we can |             # build a structure which groups each token by its tag, so we can | ||||||
|             # rebuild the original display name later. |             # rebuild the original display name later. | ||||||
|   | |||||||
							
								
								
									
										35
									
								
								tests/unit_tests/test_nlp_parser.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										35
									
								
								tests/unit_tests/test_nlp_parser.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,35 @@ | |||||||
|  | from dataclasses import dataclass | ||||||
|  | from fractions import Fraction | ||||||
|  |  | ||||||
|  | from mealie.services.scraper.ingredient_nlp.processor import CRFIngredient, convert_list_to_crf_model | ||||||
|  |  | ||||||
|  |  | ||||||
@dataclass
class TestIngredient:
    """A raw ingredient line paired with the quantity the parser should report for it."""

    # The name starts with "Test", so pytest would try to collect this class
    # and warn about its __init__; this marks it as a plain data holder.
    __test__ = False

    input: str  # raw ingredient text handed to the parser
    quantity: float  # quantity the parsed result is compared against
|  |  | ||||||
|  |  | ||||||
# Sample ingredient lines that all begin with a unicode vulgar fraction, each
# paired with the quantity it is compared against in test_nlp_parser.
# The expected values for ⅔ and ⅓ are written to one decimal place.
test_ingredients = [
    TestIngredient("½ cup all-purpose flour", 0.5),
    TestIngredient("1 ½ teaspoons ground black pepper", 1.5),
    TestIngredient("⅔ cup unsweetened flaked coconut", 0.7),
    TestIngredient("⅓ cup panko bread crumbs", 0.3),
]
|  |  | ||||||
|  |  | ||||||
def test_nlp_parser():
    """Parse every sample ingredient and compare the parsed quantity with the expected one."""
    models: list[CRFIngredient] = convert_list_to_crf_model([x.input for x in test_ingredients])

    # zip() stops at the shorter sequence, so check the lengths first;
    # otherwise a parser that drops ingredients would still pass.
    assert len(models) == len(test_ingredients)

    # Iterate over the models and the test ingredients together
    print()
    for model, test_ingredient in zip(models, test_ingredients):
        print("Testing:", test_ingredient.input, end="")

        # qty may hold several space-separated parts (e.g. "1 1/2"); add them up.
        assert float(sum(Fraction(s) for s in model.qty.split())) == test_ingredient.quantity

        print(" ✅ Passed")


if __name__ == "__main__":
    test_nlp_parser()
		Reference in New Issue
	
	Block a user