mirror of
				https://github.com/mealie-recipes/mealie.git
				synced 2025-10-31 10:13:32 -04:00 
			
		
		
		
	Better bruteforce parsing for units (#3066)
* try to match units when brute parsing and no amount is matched
* brute parser: better handle multiple word food items

  Also checks the case when a food might have been split in a unit + ingredient
* fix formatting
* add test cases for ingredient parsing that don't start with an amount
* parametrized tests and added ingredient data fixture
* fixed group_id ref in tests
* fixed test inputs
* add extra tests for units as third token

---------

Co-authored-by: Michael Genson <71845777+michael-genson@users.noreply.github.com>
This commit is contained in:
		| @@ -132,7 +132,7 @@ def parse_ingredient(tokens) -> tuple[str, str]: | ||||
|     return ingredient, note | ||||
|  | ||||
|  | ||||
| def parse(ing_str) -> BruteParsedIngredient: | ||||
| def parse(ing_str, parser) -> BruteParsedIngredient: | ||||
|     amount = 0.0 | ||||
|     unit = "" | ||||
|     ingredient = "" | ||||
| @@ -192,12 +192,20 @@ def parse(ing_str) -> BruteParsedIngredient: | ||||
|             # which means this is the ingredient | ||||
|             ingredient = tokens[1] | ||||
|     except ValueError: | ||||
|         try: | ||||
|             # can't parse first argument as amount | ||||
|             # -> no unit -> parse everything as ingredient | ||||
|             ingredient, note = parse_ingredient(tokens) | ||||
|         except ValueError: | ||||
|             ingredient = " ".join(tokens[1:]) | ||||
|         # can't parse first argument as amount | ||||
|         # try to parse as unit and ingredient (e.g. "a tblsp salt"), with unit in first three tokens | ||||
|         # won't work for units that have spaces | ||||
|         for index, token in enumerate(tokens[:3]): | ||||
|             if parser.find_unit_match(token): | ||||
|                 unit = token | ||||
|                 ingredient, note = parse_ingredient(tokens[index + 1 :]) | ||||
|                 break | ||||
|         if not unit: | ||||
|             try: | ||||
|                 # no unit -> parse everything as ingredient | ||||
|                 ingredient, note = parse_ingredient(tokens) | ||||
|             except ValueError: | ||||
|                 ingredient = " ".join(tokens[1:]) | ||||
|  | ||||
|     if unit_note not in note: | ||||
|         note += " " + unit_note | ||||
|   | ||||
| @@ -126,22 +126,24 @@ class ABCIngredientParser(ABC): | ||||
|  | ||||
|         return store_map[fuzz_result[0]] | ||||
|  | ||||
|     def find_food_match(self, food: IngredientFood | CreateIngredientFood) -> IngredientFood | None: | ||||
|     def find_food_match(self, food: IngredientFood | CreateIngredientFood | str) -> IngredientFood | None: | ||||
|         if isinstance(food, IngredientFood): | ||||
|             return food | ||||
|  | ||||
|         match_value = IngredientFoodModel.normalize(food.name) | ||||
|         food_name = food if isinstance(food, str) else food.name | ||||
|         match_value = IngredientFoodModel.normalize(food_name) | ||||
|         return self.find_match( | ||||
|             match_value, | ||||
|             store_map=self.foods_by_alias, | ||||
|             fuzzy_match_threshold=self.food_fuzzy_match_threshold, | ||||
|         ) | ||||
|  | ||||
|     def find_unit_match(self, unit: IngredientUnit | CreateIngredientUnit) -> IngredientUnit | None: | ||||
|     def find_unit_match(self, unit: IngredientUnit | CreateIngredientUnit | str) -> IngredientUnit | None: | ||||
|         if isinstance(unit, IngredientUnit): | ||||
|             return unit | ||||
|  | ||||
|         match_value = IngredientUnitModel.normalize(unit.name) | ||||
|         unit_name = unit if isinstance(unit, str) else unit.name | ||||
|         match_value = IngredientUnitModel.normalize(unit_name) | ||||
|         return self.find_match( | ||||
|             match_value, | ||||
|             store_map=self.units_by_alias, | ||||
| @@ -155,6 +157,16 @@ class ABCIngredientParser(ABC): | ||||
|         if ingredient.ingredient.unit and (unit_match := self.find_unit_match(ingredient.ingredient.unit)): | ||||
|             ingredient.ingredient.unit = unit_match | ||||
|  | ||||
|         # Parser might have wrongly split a food into a unit and food. | ||||
|         if isinstance(ingredient.ingredient.food, CreateIngredientFood) and isinstance( | ||||
|             ingredient.ingredient.unit, CreateIngredientUnit | ||||
|         ): | ||||
|             if food_match := self.find_food_match( | ||||
|                 f"{ingredient.ingredient.unit.name} {ingredient.ingredient.food.name}" | ||||
|             ): | ||||
|                 ingredient.ingredient.food = food_match | ||||
|                 ingredient.ingredient.unit = None | ||||
|  | ||||
|         return ingredient | ||||
|  | ||||
|  | ||||
| @@ -164,7 +176,7 @@ class BruteForceParser(ABCIngredientParser): | ||||
|     """ | ||||
|  | ||||
|     def parse_one(self, ingredient: str) -> ParsedIngredient: | ||||
|         bfi = brute.parse(ingredient) | ||||
|         bfi = brute.parse(ingredient, self) | ||||
|  | ||||
|         parsed_ingredient = ParsedIngredient( | ||||
|             input=ingredient, | ||||
|   | ||||
| @@ -135,7 +135,7 @@ test_ingredients = [ | ||||
|  | ||||
|  | ||||
| @pytest.mark.skipif(not crf_exists(), reason="CRF++ not installed") | ||||
| def test_nlp_parser(): | ||||
| def test_nlp_parser() -> None: | ||||
|     models: list[CRFIngredient] = convert_list_to_crf_model([x.input for x in test_ingredients]) | ||||
|  | ||||
|     # Iterate over models and test_ingredients to gather | ||||
| @@ -147,37 +147,102 @@ def test_nlp_parser(): | ||||
|         assert model.unit == test_ingredient.unit | ||||
|  | ||||
|  | ||||
| def test_brute_parser(unique_user: TestUser): | ||||
|     # input: (quantity, unit, food, comments) | ||||
|     expectations = { | ||||
|         # Dutch | ||||
|         "1 theelepel koffie": (1, "theelepel", "koffie", ""), | ||||
|         "3 theelepels koffie": (3, "theelepels", "koffie", ""), | ||||
|         "1 eetlepel tarwe": (1, "eetlepel", "tarwe", ""), | ||||
|         "20 eetlepels bloem": (20, "eetlepels", "bloem", ""), | ||||
|         "1 mespunt kaneel": (1, "mespunt", "kaneel", ""), | ||||
|         "1 snuf(je) zout": (1, "snuf(je)", "zout", ""), | ||||
|         "2 tbsp minced cilantro, leaves and stems": (2, "tbsp", "minced cilantro", "leaves and stems"), | ||||
|         "1 large yellow onion, coarsely chopped": (1, "large", "yellow onion", "coarsely chopped"), | ||||
|         "1 1/2 tsp garam masala": (1.5, "tsp", "garam masala", ""), | ||||
|         "2 cups mango chunks, (2 large mangoes) (fresh or frozen)": ( | ||||
| @pytest.mark.parametrize( | ||||
|     "input, quantity, unit, food, comment", | ||||
|     [ | ||||
|         pytest.param("1 theelepel koffie", 1, "theelepel", "koffie", "", id="1 theelepel koffie"), | ||||
|         pytest.param("3 theelepels koffie", 3, "theelepels", "koffie", "", id="3 theelepels koffie"), | ||||
|         pytest.param("1 eetlepel tarwe", 1, "eetlepel", "tarwe", "", id="1 eetlepel tarwe"), | ||||
|         pytest.param("20 eetlepels bloem", 20, "eetlepels", "bloem", "", id="20 eetlepels bloem"), | ||||
|         pytest.param("1 mespunt kaneel", 1, "mespunt", "kaneel", "", id="1 mespunt kaneel"), | ||||
|         pytest.param("1 snuf(je) zout", 1, "snuf(je)", "zout", "", id="1 snuf(je) zout"), | ||||
|         pytest.param( | ||||
|             "2 tbsp minced cilantro, leaves and stems", | ||||
|             2, | ||||
|             "cups", | ||||
|             "tbsp", | ||||
|             "minced cilantro", | ||||
|             "leaves and stems", | ||||
|             id="2 tbsp minced cilantro, leaves and stems", | ||||
|         ), | ||||
|         pytest.param( | ||||
|             "1 large yellow onion, coarsely chopped", | ||||
|             1, | ||||
|             "large", | ||||
|             "yellow onion", | ||||
|             "coarsely chopped", | ||||
|             id="1 large yellow onion, coarsely chopped", | ||||
|         ), | ||||
|         pytest.param("1 1/2 tsp garam masala", 1.5, "tsp", "garam masala", "", id="1 1/2 tsp garam masala"), | ||||
|         pytest.param( | ||||
|             "2 cups mango chunks, (2 large mangoes) (fresh or frozen)", | ||||
|             2, | ||||
|             "Cups", | ||||
|             "mango chunks, (2 large mangoes)", | ||||
|             "fresh or frozen", | ||||
|             id="2 cups mango chunks, (2 large mangoes) (fresh or frozen)", | ||||
|         ), | ||||
|     } | ||||
|  | ||||
|         pytest.param("stalk onion", 0, "Stalk", "onion", "", id="stalk onion"), | ||||
|         pytest.param("a stalk bell peppers", 0, "Stalk", "bell peppers", "", id="a stalk bell peppers"), | ||||
|         pytest.param("a tablespoon unknownFood", 0, "Tablespoon", "unknownFood", "", id="a tablespoon unknownFood"), | ||||
|         pytest.param( | ||||
|             "stalk bell peppers, cut in pieces", | ||||
|             0, | ||||
|             "Stalk", | ||||
|             "bell peppers", | ||||
|             "cut in pieces", | ||||
|             id="stalk bell peppers, cut in pieces", | ||||
|         ), | ||||
|         pytest.param( | ||||
|             "a stalk bell peppers, cut in pieces", | ||||
|             0, | ||||
|             "Stalk", | ||||
|             "bell peppers", | ||||
|             "cut in pieces", | ||||
|             id="stalk bell peppers, cut in pieces", | ||||
|         ), | ||||
|         pytest.param("red pepper flakes", 0, "", "red pepper flakes", "", id="red pepper flakes"), | ||||
|         pytest.param("1 red pepper flakes", 1, "", "red pepper flakes", "", id="1 red pepper flakes"), | ||||
|         pytest.param("1 bell peppers", 1, "", "bell peppers", "", id="1 bell peppers"), | ||||
|         pytest.param("1 stalk bell peppers", 1, "Stalk", "bell peppers", "", id="1 big stalk bell peppers"), | ||||
|         pytest.param("a big stalk bell peppers", 0, "Stalk", "bell peppers", "", id="a big stalk bell peppers"), | ||||
|         pytest.param( | ||||
|             "1 bell peppers, cut in pieces", 1, "", "bell peppers", "cut in pieces", id="1 bell peppers, cut in pieces" | ||||
|         ), | ||||
|         pytest.param( | ||||
|             "bell peppers, cut in pieces", 0, "", "bell peppers", "cut in pieces", id="bell peppers, cut in pieces" | ||||
|         ), | ||||
|     ], | ||||
| ) | ||||
| def test_brute_parser( | ||||
|     unique_local_group_id: UUID4, | ||||
|     parsed_ingredient_data: tuple[list[IngredientFood], list[IngredientUnit]],  # required so database is populated | ||||
|     input: str, | ||||
|     quantity: int | float, | ||||
|     unit: str, | ||||
|     food: str, | ||||
|     comment: str, | ||||
| ): | ||||
|     with session_context() as session: | ||||
|         parser = get_parser(RegisteredParser.brute, unique_user.group_id, session) | ||||
|         parser = get_parser(RegisteredParser.brute, unique_local_group_id, session) | ||||
|         parsed = parser.parse_one(input) | ||||
|         ing = parsed.ingredient | ||||
|  | ||||
|         for key, val in expectations.items(): | ||||
|             parsed = parser.parse_one(key) | ||||
|  | ||||
|             assert parsed.ingredient.quantity == val[0] | ||||
|             assert parsed.ingredient.unit.name == val[1] | ||||
|             assert parsed.ingredient.food.name == val[2] | ||||
|             assert parsed.ingredient.note in {val[3], None} | ||||
|         if ing.quantity: | ||||
|             assert ing.quantity == quantity | ||||
|         else: | ||||
|             assert not quantity | ||||
|         if ing.unit: | ||||
|             assert ing.unit.name == unit | ||||
|         else: | ||||
|             assert not unit | ||||
|         if ing.food: | ||||
|             assert ing.food.name == food | ||||
|         else: | ||||
|             assert not food | ||||
|         if ing.note: | ||||
|             assert ing.note == comment | ||||
|         else: | ||||
|             assert not comment | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|   | ||||
		Reference in New Issue
	
	Block a user