feat: Improved Ingredient Matching (#2535)

* added normalization to foods and units

* changed search to reference new normalized fields

* fix tests

* added parsed food matching to backend

* prevent pagination from ordering when searching

* added extra fuzzy matching to sqlite ing matching

* added tests

* only apply search ordering when order_by is null

* enabled post-search fuzzy matching for postgres

* fixed postgres fuzzy search test

* idk why this is failing

* 🤦

* simplified frontend ing matching
and restored automatic unit creation

* tightened food fuzzy threshold

* change to rapidfuzz

* sped up fuzzy matching with process

* fixed units not matching by abbreviation

* fast return for exact matches

* replace db searching with pure fuzz

* added fuzzy normalization

* tightened unit fuzzy matching thresh

* cleaned up comments/var names

* ran matching logic through the dryer

* oops

* simplified order by application logic
This commit is contained in:
Michael Genson
2023-09-15 12:19:34 -05:00
committed by GitHub
parent 084ad4228b
commit 2dfbe9f08d
17 changed files with 738 additions and 97 deletions

View File

@@ -1,20 +1,32 @@
from abc import ABC, abstractmethod
from fractions import Fraction
from typing import TypeVar
from pydantic import UUID4, BaseModel
from rapidfuzz import fuzz, process
from sqlalchemy.orm import Session
from mealie.core.root_logger import get_logger
from mealie.db.models.recipe.ingredient import IngredientFoodModel, IngredientUnitModel
from mealie.repos.all_repositories import get_repositories
from mealie.repos.repository_factory import AllRepositories
from mealie.schema.recipe import RecipeIngredient
from mealie.schema.recipe.recipe_ingredient import (
MAX_INGREDIENT_DENOMINATOR,
CreateIngredientFood,
CreateIngredientUnit,
IngredientConfidence,
IngredientFood,
IngredientUnit,
ParsedIngredient,
RegisteredParser,
)
from mealie.schema.response.pagination import PaginationQuery
from . import brute, crfpp
logger = get_logger(__name__)
T = TypeVar("T", bound=BaseModel)
class ABCIngredientParser(ABC):
@@ -22,6 +34,53 @@ class ABCIngredientParser(ABC):
Abstract class for ingredient parsers.
"""
def __init__(self, group_id: UUID4, session: Session) -> None:
    """Bind the parser to a group and a database session.

    The food and unit lookup tables start out unset (``None``); they are
    filled on first access by ``foods_by_normalized_name`` and
    ``units_by_normalized_name_or_abbreviation``.
    """
    # Lookup caches, populated lazily by the matching properties.
    self._foods_by_name: dict[str, IngredientFood] | None = None
    self._units_by_name: dict[str, IngredientUnit] | None = None

    self.group_id = group_id
    self.session = session
@property
def _repos(self) -> AllRepositories:
    """Return the repositories obtained from ``get_repositories`` for this parser's session.

    ``get_repositories`` is called on every access; nothing is cached here.
    """
    repositories = get_repositories(self.session)
    return repositories
@property
def foods_by_normalized_name(self) -> dict[str, IngredientFood]:
    """Map each normalized food name to its food for this parser's group.

    The table is built on first access and reused afterwards. Foods with an
    empty name are left out; if two foods normalize to the same key, the one
    seen later replaces the earlier one.
    """
    if self._foods_by_name is not None:
        return self._foods_by_name

    group_foods_repo = self._repos.ingredient_foods.by_group(self.group_id)
    # per_page=-1 presumably requests every row in a single page -- TODO confirm
    everything = PaginationQuery(page=1, per_page=-1)

    lookup = {}
    for food in group_foods_repo.page_all(everything).items:
        if food.name:
            lookup[IngredientFoodModel.normalize(food.name)] = food

    self._foods_by_name = lookup
    return self._foods_by_name
@property
def units_by_normalized_name_or_abbreviation(self) -> dict[str, IngredientUnit]:
    """Map normalized unit names and abbreviations to units for this parser's group.

    The table is built on first access and reused afterwards. Name keys are
    inserted first and abbreviation keys second, so when a normalized
    abbreviation equals a normalized name the abbreviation's unit wins.
    Empty names and empty abbreviations are skipped.
    """
    if self._units_by_name is not None:
        return self._units_by_name

    group_units_repo = self._repos.ingredient_units.by_group(self.group_id)
    # per_page=-1 presumably requests every row in a single page -- TODO confirm
    everything = PaginationQuery(page=1, per_page=-1)
    group_units = group_units_repo.page_all(everything).items

    lookup = {}
    for unit in group_units:
        if unit.name:
            lookup[IngredientUnitModel.normalize(unit.name)] = unit
    # Second pass: abbreviations override any colliding name key.
    for unit in group_units:
        if unit.abbreviation:
            lookup[IngredientUnitModel.normalize(unit.abbreviation)] = unit

    self._units_by_name = lookup
    return self._units_by_name
@property
def food_fuzzy_match_threshold(self) -> int:
    """Lowest fuzzy-match score at which a parsed food is accepted as a known food."""
    minimum_score = 85
    return minimum_score
@property
def unit_fuzzy_match_threshold(self) -> int:
    """Lowest fuzzy-match score at which a parsed unit is accepted as a known unit."""
    minimum_score = 70
    return minimum_score
@abstractmethod
def parse_one(self, ingredient_string: str) -> ParsedIngredient:
    """Parse a single ingredient string into a ``ParsedIngredient``.

    Abstract: there is no implementation here; concrete parsers supply one.
    """
@@ -30,19 +89,64 @@ class ABCIngredientParser(ABC):
def parse(self, ingredients: list[str]) -> list[ParsedIngredient]:
    """Parse a list of ingredient strings into a list of ``ParsedIngredient``.

    There is no implementation here; concrete parsers supply one.
    """
@classmethod
def find_match(cls, match_value: str, *, store_map: dict[str, T], fuzzy_match_threshold: int = 0) -> T | None:
    """Return the entry of ``store_map`` whose key best matches ``match_value``.

    An exact key hit is returned immediately. Otherwise every key is scored
    against ``match_value`` with ``fuzz.ratio`` and the best-scoring entry is
    returned, provided its score is at least ``fuzzy_match_threshold``.

    Args:
        match_value: The (already normalized) text to look up.
        store_map: Candidate entries keyed by their normalized text.
        fuzzy_match_threshold: Minimum score a fuzzy candidate must reach.
            With the default of 0 the best-scoring candidate is always accepted.

    Returns:
        The matching entry, or ``None`` when ``store_map`` offers no candidate
        that reaches the threshold.
    """
    # Literal matches need no scoring.
    if match_value in store_map:
        return store_map[match_value]

    # Pass the keys explicitly: rapidfuzz treats a mapping given as `choices`
    # as {result: text to compare}, which would score the stored objects
    # instead of their names. `score_cutoff` makes rapidfuzz drop candidates
    # below the threshold itself, so a non-None result is already acceptable.
    best = process.extractOne(
        match_value,
        store_map.keys(),
        scorer=fuzz.ratio,
        score_cutoff=fuzzy_match_threshold,
    )
    if best is None:
        return None

    matched_key, _score, _index = best
    return store_map[matched_key]
def find_food_match(self, food: IngredientFood | CreateIngredientFood) -> IngredientFood | None:
    """Resolve ``food`` to a known food of this group, if one matches.

    An ``IngredientFood`` instance is returned unchanged. Anything else is
    looked up by its normalized name in ``foods_by_normalized_name`` using
    ``food_fuzzy_match_threshold``; ``None`` is returned when nothing matches.
    """
    if not isinstance(food, IngredientFood):
        return self.find_match(
            IngredientFoodModel.normalize(food.name),
            store_map=self.foods_by_normalized_name,
            fuzzy_match_threshold=self.food_fuzzy_match_threshold,
        )

    return food
def find_unit_match(self, unit: IngredientUnit | CreateIngredientUnit) -> IngredientUnit | None:
    """Resolve ``unit`` to a known unit of this group, if one matches.

    An ``IngredientUnit`` instance is returned unchanged. Anything else is
    looked up by its normalized name in
    ``units_by_normalized_name_or_abbreviation`` using
    ``unit_fuzzy_match_threshold``; ``None`` is returned when nothing matches.
    """
    if not isinstance(unit, IngredientUnit):
        return self.find_match(
            IngredientUnitModel.normalize(unit.name),
            store_map=self.units_by_normalized_name_or_abbreviation,
            fuzzy_match_threshold=self.unit_fuzzy_match_threshold,
        )

    return unit
def find_ingredient_match(self, ingredient: ParsedIngredient) -> ParsedIngredient:
    """Swap the parsed food and unit for matching known ones, in place.

    The food and unit are each replaced only when they are set and a match is
    found; otherwise they are left as parsed. The same ``ingredient`` object
    is returned.
    """
    parsed = ingredient.ingredient

    if parsed.food:
        matched_food = self.find_food_match(parsed.food)
        if matched_food:
            parsed.food = matched_food

    if parsed.unit:
        matched_unit = self.find_unit_match(parsed.unit)
        if matched_unit:
            parsed.unit = matched_unit

    return ingredient
class BruteForceParser(ABCIngredientParser):
"""
Brute force ingredient parser.
"""
def __init__(self) -> None:
pass
def parse_one(self, ingredient: str) -> ParsedIngredient:
bfi = brute.parse(ingredient)
return ParsedIngredient(
parsed_ingredient = ParsedIngredient(
input=ingredient,
ingredient=RecipeIngredient(
unit=CreateIngredientUnit(name=bfi.unit),
@@ -53,6 +157,8 @@ class BruteForceParser(ABCIngredientParser):
),
)
return self.find_ingredient_match(parsed_ingredient)
def parse(self, ingredients: list[str]) -> list[ParsedIngredient]:
    """Run ``parse_one`` over every ingredient string, keeping input order."""
    return list(map(self.parse_one, ingredients))
@@ -62,9 +168,6 @@ class NLPParser(ABCIngredientParser):
Class for CRFPP ingredient parsers.
"""
def __init__(self) -> None:
pass
def _crf_to_ingredient(self, crf_model: crfpp.CRFIngredient) -> ParsedIngredient:
ingredient = None
@@ -87,7 +190,7 @@ class NLPParser(ABCIngredientParser):
note=crf_model.input,
)
return ParsedIngredient(
parsed_ingredient = ParsedIngredient(
input=crf_model.input,
ingredient=ingredient,
confidence=IngredientConfidence(
@@ -97,6 +200,8 @@ class NLPParser(ABCIngredientParser):
),
)
return self.find_ingredient_match(parsed_ingredient)
def parse(self, ingredients: list[str]) -> list[ParsedIngredient]:
    """Parse a batch of ingredient strings.

    The whole list is passed to ``crfpp.convert_list_to_crf_model`` and each
    resulting model is converted with ``_crf_to_ingredient``.
    """
    parsed = []
    for crf_model in crfpp.convert_list_to_crf_model(ingredients):
        parsed.append(self._crf_to_ingredient(crf_model))
    return parsed
@@ -112,9 +217,9 @@ __registrar = {
}
def get_parser(parser: RegisteredParser) -> ABCIngredientParser:
def get_parser(parser: RegisteredParser, group_id: UUID4, session: Session) -> ABCIngredientParser:
"""
get_parser returns an ingredient parser based on the string enum value
passed in.
"""
return __registrar.get(parser, NLPParser)()
return __registrar.get(parser, NLPParser)(group_id, session)