mirror of
				https://github.com/mealie-recipes/mealie.git
				synced 2025-10-31 18:23:18 -04:00 
			
		
		
		
	update NLP for ingredients
This commit is contained in:
		| @@ -22,7 +22,7 @@ from mealie.schema.admin import ( | ||||
| ) | ||||
| from mealie.schema.events import EventNotificationIn | ||||
| from mealie.schema.recipe import CommentOut, Recipe | ||||
| from mealie.schema.user import UpdateGroup, PrivateUser | ||||
| from mealie.schema.user import PrivateUser, UpdateGroup | ||||
| from mealie.services.image import minify | ||||
|  | ||||
|  | ||||
|   | ||||
							
								
								
									
										97
									
								
								mealie/services/scraper/ingredient_nlp/pre_processor.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										97
									
								
								mealie/services/scraper/ingredient_nlp/pre_processor.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,97 @@ | ||||
| import re | ||||
| import unicodedata | ||||
|  | ||||
# Maps an abbreviated unit (including its trailing space) to the spelled-out
# unit. The trailing space is part of the key so that only an abbreviation that
# is followed by another word is expanded.
replace_abbreviations = {
    "cup ": "cup ",
    "g ": "gram ",
    "kg ": "kilogram ",
    "lb ": "pound ",
    "ml ": "milliliter ",
    "oz ": "ounce ",
    "pint ": "pint ",
    "qt ": "quart ",
    "tbs ": "tablespoon ",
    "tbsp ": "tablespoon ",
    "tsp ": "teaspoon ",
}

# One alternation over every abbreviation key, longest key first. The negative
# lookbehind rejects a match that is glued to a preceding letter, so the "g "
# inside "egg " or "kg " is left alone, while a digit-prefixed form such as
# "100g " is still expanded.
_ABBREVIATION_PATTERN = re.compile(
    r"(?<![^\W\d_])(?:"
    + "|".join(re.escape(key) for key in sorted(replace_abbreviations, key=len, reverse=True))
    + ")"
)


def replace_common_abbreviations(string: str) -> str:
    """
    Expands the unit abbreviations listed in `replace_abbreviations`.

    An abbreviation is only expanded when it is not the tail of a longer word,
    e.g. 'egg yolk' is left untouched while '5 g salt' becomes '5 gram salt'.
    The whole string is processed in a single pass, so text produced by one
    expansion is never re-scanned by another.

    Args:
        string: the ingredient text to process

    Returns:
        The text with every matching abbreviation replaced by its full name.
    """
    return _ABBREVIATION_PATTERN.sub(lambda match: replace_abbreviations[match.group(0)], string)
|  | ||||
|  | ||||
def remove_periods(string: str) -> str:
    """
    Removes every period that has no digit directly before it and no digit directly after it.

    A period touching a digit on either side (as in '1.5', '2.' or '.5') is kept.
    """
    stray_period = re.compile(r"(?<!\d)\.(?!\d)")
    return stray_period.sub("", string)
|  | ||||
|  | ||||
def replace_fraction_unicode(string: str) -> str:
    """
    Replaces every unicode vulgar fraction character with ' numerator/denominator'.

    Each replacement is prefixed with a space so that a whole number written
    directly before the fraction stays separated from it; double spaces that
    this creates are collapsed afterwards. A string without any vulgar fraction
    is returned unchanged.

    Args:
        string: the ingredient text to process

    Returns:
        The text with all vulgar fractions spelled out in ASCII.
    """
    # TODO: Needs broader testing before this is relied on in production.
    pieces = []
    found_fraction = False

    for c in string:
        # Unnamed code points yield "" instead of raising ValueError.
        if unicodedata.name(c, "").startswith("VULGAR FRACTION"):
            # NFKC turns the single character into '<numerator>\u2044<denominator>'
            normalized = unicodedata.normalize("NFKC", c)
            numerator, _slash, denominator = normalized.partition("\u2044")
            pieces.append(f" {numerator}/{denominator}")
            found_fraction = True
        else:
            pieces.append(c)

    if not found_fraction:
        return string

    return "".join(pieces).replace("  ", " ")
|  | ||||
|  | ||||
def wrap_or_clause(string: str) -> str:
    """
    Attempts to wrap the first ' or ' clause in ()

    The clause runs from the first ' or ' up to the next comma (or to the end
    of the string when there is no comma). Everything after that comma is kept
    as-is, including any further commas. A string without ' or ' is returned
    unchanged.

    Examples:
    '1 teaspoon diamond crystal or 1/2 teaspoon morton kosher salt, plus more'
        -> '1 teaspoon diamond crystal (or 1/2 teaspoon morton kosher salt), plus more'
    '1/4 cup michiu tou or other rice wine' -> '1/4 cup michiu tou (or other rice wine)'

    """
    # TODO: Needs more adequate testing to be sure this doesn't have side effects.
    before, separator, after = string.partition(" or ")

    if not separator:
        return string

    # Only the first comma ends the clause; later commas belong to the remainder.
    alternative, comma, remainder = after.partition(",")

    return f"{before} (or {alternative}){comma}{remainder}".strip().removesuffix(",")
|  | ||||
|  | ||||
def pre_process_string(string: str) -> str:
    """
    Runs the ingredient text through the preprocessing steps used ahead of the CRF++ model.
    The text is lower-cased, unicode fractions are spelled out, stray periods are removed and
    common unit abbreviations are expanded. If the result contains ' or ', the or-clause is
    wrapped in parentheses. The ideal string looks something like...

    {qty} {unit} {food}, {additional}
    1 tbs. wine, expensive or other white wine, plus more

    """
    cleaned = string.lower()

    for step in (replace_fraction_unicode, remove_periods, replace_common_abbreviations):
        cleaned = step(cleaned)

    return wrap_or_clause(cleaned) if " or " in cleaned else cleaned
|  | ||||
|  | ||||
def main():
    """Prints the pre-processed form of a few sample ingredient lines."""
    # TODO: Migrate to unittests
    samples = (
        "1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt, plus more",
        "1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt",
        "¼ cup michiu tou or other rice wine",
        "1 tbs. wine, expensive or other white wine, plus more",
    )

    print("Starting...")
    for sample in samples:
        print(pre_process_string(sample))
    print("Finished...")


if __name__ == "__main__":
    main()
| @@ -1,17 +1,17 @@ | ||||
| import subprocess | ||||
| import tempfile | ||||
| import unicodedata | ||||
| from fractions import Fraction | ||||
| from pathlib import Path | ||||
| from typing import Optional | ||||
|  | ||||
| from pydantic import BaseModel | ||||
| from pydantic import BaseModel, validator | ||||
|  | ||||
| from mealie.core.config import settings | ||||
| from mealie.schema.recipe import RecipeIngredient | ||||
| from mealie.schema.recipe.recipe_ingredient import CreateIngredientFood, CreateIngredientUnit | ||||
|  | ||||
| from . import utils | ||||
| from .pre_processor import pre_process_string | ||||
|  | ||||
| CWD = Path(__file__).parent | ||||
| MODEL_PATH = CWD / "model.crfmodel" | ||||
| @@ -33,6 +33,17 @@ class CRFIngredient(BaseModel): | ||||
|     comment: Optional[str] = "" | ||||
|     unit: Optional[str] = "" | ||||
|  | ||||
|     @validator("qty", always=True, pre=True) | ||||
|     def validate_qty(qty, values):  # sourcery skip: merge-nested-ifs | ||||
|         if qty is None or qty == "": | ||||
|             # Check if other contains a fraction | ||||
|             if values["other"] is not None and values["other"].find("/") != -1: | ||||
|                 return float(Fraction(values["other"])).__round__(1) | ||||
|             else: | ||||
|                 return 1 | ||||
|  | ||||
|         return qty | ||||
|  | ||||
|  | ||||
| def _exec_crf_test(input_text): | ||||
|     with tempfile.NamedTemporaryFile(mode="w") as input_file: | ||||
| @@ -43,24 +54,8 @@ def _exec_crf_test(input_text): | ||||
|         ) | ||||
|  | ||||
|  | ||||
def fraction_finder(string: str) -> str:
    """
    Replaces every unicode vulgar fraction character with 'numerator/denominator'.

    A string without any vulgar fraction is returned unchanged.
    """
    # TODO: Needs broader testing before this is relied on in production.
    pieces = []

    for c in string:
        # Unnamed code points yield "" instead of raising ValueError.
        if unicodedata.name(c, "").startswith("VULGAR FRACTION"):
            # NFKC turns the single character into '<numerator>\u2044<denominator>'
            normalized = unicodedata.normalize("NFKC", c)
            numerator, _slash, denominator = normalized.partition("\u2044")
            pieces.append(f"{numerator}/{denominator}")
        else:
            pieces.append(c)

    return "".join(pieces)
|  | ||||
|  | ||||
| def convert_list_to_crf_model(list_of_ingrdeint_text: list[str]): | ||||
|     crf_output = _exec_crf_test([fraction_finder(x) for x in list_of_ingrdeint_text]) | ||||
|     crf_output = _exec_crf_test([pre_process_string(x) for x in list_of_ingrdeint_text]) | ||||
|  | ||||
|     crf_models = [CRFIngredient(**ingredient) for ingredient in utils.import_data(crf_output.split("\n"))] | ||||
|  | ||||
| @@ -89,4 +84,4 @@ if __name__ == "__main__": | ||||
|     ingredients = convert_crf_models_to_ingredients(crf_models) | ||||
|  | ||||
|     for ingredient in ingredients: | ||||
|         print(ingredient) | ||||
|         print(ingredient.input) | ||||
|   | ||||
| @@ -28,6 +28,7 @@ def tokenize(s): | ||||
|     s = re.sub(r"(\d+)oz", r"\1 ounces", s) | ||||
|     s = re.sub(r"(\d+)ml", r"\1 milliliters", s, flags=re.IGNORECASE) | ||||
|  | ||||
|     # TODO: Replace american_units with list of units from database? | ||||
|     american_units = ["cup", "tablespoon", "teaspoon", "pound", "ounce", "quart", "pint"] | ||||
|     # The following removes slashes following American units and replaces it with a space. | ||||
|     for unit in american_units: | ||||
|   | ||||
| @@ -47,7 +47,7 @@ def unclump(s): | ||||
|  | ||||
| def normalizeToken(s): | ||||
|     """ | ||||
|     ToDo: FIX THIS. We used to use the pattern.en package to singularize words, but | ||||
|     TODO: FIX THIS. We used to use the pattern.en package to singularize words, but | ||||
|     in the name of simple deployments, we took it out. We should fix this at some | ||||
|     point. | ||||
|     """ | ||||
| @@ -222,6 +222,20 @@ def import_data(lines): | ||||
|             tag, confidence = re.split(r"/", columns[-1], 1) | ||||
|             tag = re.sub(r"^[BI]\-", "", tag).lower() | ||||
|  | ||||
|             # TODO: Integrate Confidence into API Response | ||||
|             print("Confidence", confidence) | ||||
|  | ||||
|             # new token | ||||
|             if prevTag != tag or token == "n/a": | ||||
|                 display[-1].append((tag, [token])) | ||||
|                 data[-1][tag] = [] | ||||
|                 prevTag = tag | ||||
|  | ||||
|             # continuation | ||||
|             else: | ||||
|                 display[-1][-1][1].append(token) | ||||
|                 data[-1][tag].append(token) | ||||
|  | ||||
|             # ---- DISPLAY ---- | ||||
|             # build a structure which groups each token by its tag, so we can | ||||
|             # rebuild the original display name later. | ||||
|   | ||||
							
								
								
									
										35
									
								
								tests/unit_tests/test_nlp_parser.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										35
									
								
								tests/unit_tests/test_nlp_parser.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,35 @@ | ||||
| from dataclasses import dataclass | ||||
| from fractions import Fraction | ||||
|  | ||||
| from mealie.services.scraper.ingredient_nlp.processor import CRFIngredient, convert_list_to_crf_model | ||||
|  | ||||
|  | ||||
@dataclass
class TestIngredient:
    """A raw ingredient line paired with the quantity the parser is expected to report."""

    # The name starts with 'Test', so tell pytest this is a data holder and not
    # a test class; otherwise collection emits a warning about its constructor.
    # Un-annotated on purpose so it is a class attribute, not a dataclass field.
    __test__ = False

    input: str  # ingredient text handed to the parser
    quantity: float  # expected quantity, stated to one decimal place


test_ingredients = [
    TestIngredient("½ cup all-purpose flour", 0.5),
    TestIngredient("1 ½ teaspoons ground black pepper", 1.5),
    TestIngredient("⅔ cup unsweetened flaked coconut", 0.7),
    TestIngredient("⅓ cup panko bread crumbs", 0.3),
]
|  | ||||
|  | ||||
def test_nlp_parser():
    """Checks that the parsed quantity of every sample ingredient matches its expected value."""
    models: list[CRFIngredient] = convert_list_to_crf_model([x.input for x in test_ingredients])

    # zip() stops at the shorter sequence, so make sure the parser did not drop an ingredient.
    assert len(models) == len(test_ingredients)

    # Iterate over models and test_ingredients together
    print()
    for model, test_ingredient in zip(models, test_ingredients):
        print("Testing:", test_ingredient.input, end="")

        # NOTE(review): assumes model.qty is a space separated string of numbers/fractions - confirm.
        parsed_quantity = float(sum(Fraction(s) for s in model.qty.split()))

        # Expected values are stated to one decimal place (0.7 for 2/3), so round before comparing.
        assert round(parsed_quantity, 1) == test_ingredient.quantity

        print(" ✅ Passed")


if __name__ == "__main__":
    test_nlp_parser()
		Reference in New Issue
	
	Block a user