feat: Add brute strategy to ingredient processor (#744)

* fix UI column width

* words

* update parser to support diff strats

* add new model url

* make button more visible

* fix nutrition error

* feat(backend):  add 'brute' strategy for parsing ingredients

* satisfy linter

* update UI for creation page

* feat(backend):  log 422 errors in detail when not in PRODUCTION

* add strategy selector

Co-authored-by: Hayden <hay-kot@pm.me>
This commit is contained in:
Hayden
2021-10-16 16:06:13 -08:00
committed by GitHub
parent 60908e5a88
commit 3b920babe3
25 changed files with 961 additions and 131 deletions

View File

@@ -1 +1,2 @@
from .ingredient_parser import *
from .ingredient_parser_service import *

View File

@@ -0,0 +1 @@
from .string_utils import *

View File

@@ -0,0 +1,23 @@
import re
# Recognises an ingredient string whose first few characters are followed by a
# parenthesised remark, e.g. "1 cup (sliced) onion": 1-6 leading characters,
# whitespace, "( ... )", whitespace.  Intended to be used with ``.match`` so
# that it is anchored at the start of the string.
# The bracket content is "one or more non-bracket characters"; the previous
# ``(.[^()])+`` form consumed characters in pairs and therefore silently
# rejected remarks with an odd number of characters such as "(14 oz)".
compiled_match = re.compile(r".{1,6}\s\([^()]+\)\s")

# Locates the first non-nested "( ... )" group anywhere in the string.
compiled_search = re.compile(r"\([^()]+\)")


def move_parens_to_end(ing_str) -> str:
    """
    Move an early parenthesised remark to the end of the string.

    The string is only rewritten when it starts with 1-6 characters followed
    by whitespace and a parenthesised group (see ``compiled_match``). In that
    case the first non-nested "( ... )" group is cut out and appended to the
    end, separated by a single space. The whitespace that surrounded the group
    is left untouched, so the result may contain a double space.

    Strings without such an early group (including the empty string, strings
    whose only brackets appear later on, and nested brackets) are returned
    unchanged.
    """
    if compiled_match.match(ing_str) is None:
        return ing_str

    found = compiled_search.search(ing_str)
    if found is None:
        # Cannot happen while both patterns share the same bracket syntax;
        # guarded so a future pattern change cannot cause an AttributeError.
        return ing_str

    start, end = found.span()
    return ing_str[:start] + ing_str[end:] + " " + ing_str[start:end]
def check_char(char, *eql) -> bool:
    """Return True when ``char`` equals at least one of the extra positional arguments."""
    for candidate in eql:
        if char == candidate:
            return True
    return False

View File

@@ -0,0 +1 @@
from .process import parse

View File

@@ -0,0 +1,204 @@
import string
import unicodedata
from typing import Tuple
from pydantic import BaseModel
from .._helpers import check_char, move_parens_to_end
class BruteParsedIngredient(BaseModel):
    """Result of the brute-force parser for a single ingredient line."""

    # Name of the food.
    food: str = ""
    # Free-text remainder of the line (remarks, bracketed text, ...).
    note: str = ""
    # Parsed quantity. The default used to be the empty string, which does not
    # match the declared type; 0.0 mirrors the value parse() starts out with.
    amount: float = 0.0
    # Unit of measure as written in the input; empty when none was found.
    unit: str = ""

    class Config:
        anystr_strip_whitespace = True
def parse_fraction(x):
    """
    Convert a textual fraction into a float.

    Accepts either a single Unicode vulgar-fraction character (for example
    "\u00bd") or two integers separated by "/" (for example "3/4").

    Raises:
        ValueError: if ``x`` is not such a fraction, if the numerator or
            denominator is not an integer, or if the denominator is zero.
    """
    if len(x) == 1:
        decomposition = unicodedata.decomposition(x)
        if "fraction" in decomposition:
            # The decomposition looks like "<fraction> 0031 2044 0032": a tag
            # followed by hexadecimal code points, where U+2044 is the fraction
            # slash. Rebuilding the text ("1/2") instead of indexing fixed
            # positions also handles multi-digit denominators (U+2152, 1/10)
            # and turns a missing denominator (U+215F) into a ValueError
            # rather than an IndexError.
            code_points = decomposition.split()[1:]
            x = "".join(chr(int(cp, 16)) for cp in code_points).replace("\u2044", "/")

    parts = x.split("/")
    if len(parts) != 2:
        raise ValueError(f"not a fraction: {x!r}")

    try:
        # int() itself raises ValueError for non-numeric parts
        return int(parts[0]) / int(parts[1])
    except ZeroDivisionError:
        raise ValueError(f"fraction with zero denominator: {x!r}") from None
def parse_amount(ing_str) -> Tuple[float, str, str]:
    """
    Split the leading quantity off a single token.

    Returns a tuple ``(amount, unit, note)``:

    * ``amount`` - the leading number. A fraction character directly after an
      ASCII number is added to it (e.g. a "2" followed by a half glyph).
    * ``unit`` - whatever follows the number inside the same token.
    * ``note`` - the whole token when the text after the number starts with
      "(" or "-" (in which case ``unit`` is reset to ""), otherwise "".

    A ValueError raised by ``float`` or ``parse_fraction`` propagates when the
    token does not start with a number or a fraction.
    """

    def keep_looping(ing_str, end) -> bool:
        """
        Tell whether the character at ``end`` still belongs to the number:
        either a digit, or one of ". , /" that is directly followed by a digit
        (e.g. 1/2, 1.3, 1,500). False once the end of the string is reached.
        """
        if end >= len(ing_str):
            return False
        current = ing_str[end]
        if current in string.digits:
            return True
        digit_follows = end + 1 < len(ing_str) and ing_str[end + 1] in string.digits
        return check_char(current, ".", ",", "/") and digit_follows

    note = ""

    # length of the ASCII number at the start of the token
    number_end = 0
    while keep_looping(ing_str, number_end):
        number_end += 1

    number_text = ing_str[:number_end]
    if number_text:
        if "/" in number_text:
            amount = parse_fraction(number_text)
        else:
            # a comma is read as a decimal separator
            amount = float(number_text.replace(",", "."))

        unit = ing_str[number_end:]
        if unit:
            # the character right after the number may be a fraction glyph
            try:
                amount += parse_fraction(unit[0])
            except ValueError:
                pass
            else:
                unit = unit[1:]
    else:
        # no ASCII number up front -> the first character has to be a fraction
        amount = parse_fraction(ing_str[0])
        unit = ing_str[1:]

    # i dont know any unit that starts with ( or - so its likely an alternative like 1L (500ml) Water or 2-3
    if unit.startswith(("(", "-")):
        unit, note = "", ing_str

    return amount, unit, note
def parse_ingredient_with_comma(tokens) -> Tuple[str, str]:
    """
    Split tokens into ``(ingredient, note)`` at the first token ending in a comma.

    Everything up to and including that token (minus the trailing comma) is
    the ingredient, everything after it is the note. Without such a token the
    whole list is the ingredient and the note is empty.
    """
    # index of the first token that ends in a comma, if any
    comma_at = next((idx for idx, tok in enumerate(tokens) if tok.endswith(",")), None)

    if comma_at is None:
        # no token ending in a comma found -> use everything as ingredient
        return " ".join(tokens), ""

    split_at = comma_at + 1
    head = " ".join(tokens[:split_at])
    tail = " ".join(tokens[split_at:])
    # drop the comma that terminates the ingredient part
    return head[:-1], tail
def parse_ingredient(tokens) -> Tuple[str, str]:
    """
    Split tokens into ``(ingredient, note)``.

    When the last token closes a bracket, the bracketed tail becomes the note
    (without the surrounding brackets) and everything before it the
    ingredient. In all other cases the split is delegated to
    ``parse_ingredient_with_comma``.

    Raises:
        ValueError: if the bracket opens on the very first token, i.e. the
            whole list is wrapped in brackets. Callers use this as a signal
            that their guess about which token is the unit was wrong.
    """
    if not tokens:
        # nothing to split; avoids an IndexError on tokens[-1]
        return "", ""

    last = tokens[-1]
    if not last.endswith(")"):
        return parse_ingredient_with_comma(tokens)

    # Check if the matching opening bracket is in the same token
    if not last.startswith("(") and "(" in last:
        return parse_ingredient_with_comma(tokens)

    # last argument ends with closing bracket -> look for opening bracket.
    # The scan is allowed to run past index 0 so that "not found" (-1) can be
    # told apart from "found on the first token" (0); previously both ended at
    # 0 and a missing opening bracket was reported as a ValueError.
    start = len(tokens) - 1
    while start >= 0 and not tokens[start].startswith("("):
        start -= 1

    if start < 0:
        # no opening bracket anywhere -> just ignore the last bracket
        return parse_ingredient_with_comma(tokens)

    if start == 0:
        # the whole list is wrapped in brackets -> assume it is an error (e.g. assumed first argument was the unit) # noqa: E501
        raise ValueError("ingredient is entirely wrapped in brackets")

    # opening bracket found -> split in ingredient and note, remove brackets from note # noqa: E501
    note = " ".join(tokens[start:])[1:-1]
    ingredient = " ".join(tokens[:start])
    return ingredient, note
def parse(ing_str) -> BruteParsedIngredient:
    """
    Parse one free-text ingredient line into amount, unit, food and note.

    The line is normalised with ``move_parens_to_end`` and split on
    whitespace. The first token is tried as the amount (optionally with a
    unit glued to it), the second as an additional fraction or as the unit,
    and the remaining tokens are handed to ``parse_ingredient``. Whenever one
    of these guesses fails with a ValueError the next, less specific
    interpretation is tried, down to using the tokens as the food name.

    An empty or whitespace-only line yields a result with all defaults
    instead of raising.
    """
    amount = 0
    unit = ""
    ingredient = ""
    note = ""
    unit_note = ""

    ing_str = move_parens_to_end(ing_str)
    tokens = ing_str.split()

    # Early return if the ingredient is a single token and therefore has no
    # other properties. Zero tokens (empty input) is handled here as well;
    # indexing tokens[0] further down would otherwise raise an IndexError
    # that none of the ValueError handlers catches.
    if len(tokens) <= 1:
        ingredient = tokens[0] if tokens else ""
        # TODO Refactor to expect BFP to be returned instead of Tuple
        return BruteParsedIngredient(food=ingredient, note=note, amount=amount, unit=unit)

    try:
        # try to parse first argument as amount
        amount, unit, unit_note = parse_amount(tokens[0])

        # only try to parse second argument as amount if there are at least
        # three arguments if it already has a unit there can't be
        # a fraction for the amount
        if len(tokens) > 2:
            try:
                if unit != "":
                    # a unit is already found, no need to try the second argument for a fraction.
                    # Raising here deliberately jumps to the handler below, which holds the
                    # "second token is the unit / part of the food" logic.
                    raise ValueError

                # try to parse second argument as amount and add that, in case of '2 1/2' or '2 ½'
                amount += parse_fraction(tokens[1])

                # assume that units can't end with a comma
                if len(tokens) > 3 and not tokens[2].endswith(","):
                    # try to use third argument as unit and everything else as ingredient, use everything as ingredient if it fails # noqa: E501
                    try:
                        ingredient, note = parse_ingredient(tokens[3:])
                        unit = tokens[2]
                    except ValueError:
                        ingredient, note = parse_ingredient(tokens[2:])
                else:
                    ingredient, note = parse_ingredient(tokens[2:])
            except ValueError:
                # assume that units can't end with a comma
                if not tokens[1].endswith(","):
                    # try to use second argument as unit and everything else as ingredient, use everything as ingredient if it fails # noqa: E501
                    try:
                        ingredient, note = parse_ingredient(tokens[2:])
                        if unit == "":
                            unit = tokens[1]
                        else:
                            # the unit came glued to the amount, so the second token is kept as the note
                            note = tokens[1]
                    except ValueError:
                        ingredient, note = parse_ingredient(tokens[1:])
                else:
                    ingredient, note = parse_ingredient(tokens[1:])
        else:
            # only two arguments, first one is the amount
            # which means this is the ingredient
            ingredient = tokens[1]
    except ValueError:
        try:
            # can't parse first argument as amount
            # -> no unit -> parse everything as ingredient
            ingredient, note = parse_ingredient(tokens)
        except ValueError:
            ingredient = " ".join(tokens[1:])

    # keep the raw amount token around when parse_amount flagged it as a note
    if unit_note not in note:
        note += " " + unit_note

    return BruteParsedIngredient(food=ingredient, note=note, amount=amount, unit=unit)

View File

@@ -0,0 +1 @@
from .processor import *

View File

@@ -2,23 +2,25 @@ import re
import unicodedata
replace_abbreviations = {
"cup ": "cup ",
" g ": "gram ",
"kg ": "kilogram ",
"lb ": "pound ",
"ml ": "milliliter ",
"oz ": "ounce ",
"pint ": "pint ",
"qt ": "quart ",
"tbs ": "tablespoon ",
"tbsp ": "tablespoon ",
"tsp ": "teaspoon ",
"cup": " cup ",
"g": " gram ",
"kg": " kilogram ",
"lb": " pound ",
"ml": " milliliter ",
"oz": " ounce ",
"pint": " pint ",
"qt": " quart ",
"tbsp": " tablespoon ",
"tbs": " tablespoon ", # Order Matters!, 'tsb' must come after 'tbsp' incase of duplicate matches
"tsp": " teaspoon ",
}
def replace_common_abbreviations(string: str) -> str:
for k, v in replace_abbreviations.items():
string = string.replace(k, v)
regex = rf"(?<=\d)\s?({k}s?)"
string = re.sub(regex, v, string)
return string
@@ -81,17 +83,3 @@ def pre_process_string(string: str) -> str:
string = wrap_or_clause(string)
return string
def main():
# TODO: Migrate to unittests
print("Starting...")
print(pre_process_string("1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt, plus more"))
print(pre_process_string("1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt"))
print(pre_process_string("¼ cup michiu tou or other rice wine"))
print(pre_process_string("1 tbs. wine, expensive or other white wine, plus more"))
print("Finished...")
if __name__ == "__main__":
main()

View File

@@ -12,6 +12,14 @@ CWD = Path(__file__).parent
MODEL_PATH = CWD / "model.crfmodel"
class CRFConfidence(BaseModel):
    # Confidence scores carried by CRFIngredient.confidence.
    # `average` defaults to 0.0; the per-label scores default to None.
    # NOTE(review): the per-label fields are annotated `float` but default to
    # None - presumably pydantic treats them as optional; confirm.
    average: float = 0.0
    comment: float = None
    name: float = None
    unit: float = None
    qty: float = None
class CRFIngredient(BaseModel):
input: str = ""
name: str = ""
@@ -19,15 +27,19 @@ class CRFIngredient(BaseModel):
qty: str = ""
comment: str = ""
unit: str = ""
confidence: CRFConfidence
@validator("qty", always=True, pre=True)
def validate_qty(qty, values): # sourcery skip: merge-nested-ifs
if qty is None or qty == "":
# Check if other contains a fraction
if values["other"] is not None and values["other"].find("/") != -1:
return float(Fraction(values["other"])).__round__(1)
else:
return 1
try:
if values["other"] is not None and values["other"].find("/") != -1:
return float(Fraction(values["other"])).__round__(1)
else:
return 1
except Exception:
pass
return qty

View File

@@ -1,4 +1,5 @@
import re
from statistics import mean
from . import tokenizer
@@ -179,6 +180,9 @@ def import_data(lines):
data = [{}]
display = [[]]
prevTag = None
confidence_all = [{}]
#
# iterate lines in the data file, which looks like:
#
@@ -208,6 +212,8 @@ def import_data(lines):
display.append([])
prevTag = None
confidence_all.append({})
# ignore comments
elif line[0] == "#":
pass
@@ -226,6 +232,18 @@ def import_data(lines):
tag, confidence = re.split(r"/", columns[-1], 1)
tag = re.sub("^[BI]\-", "", tag).lower() # noqa: W605 - invalid dscape sequence
# ====================
# Confidence Getter
if prevTag != tag:
if confidence_all[-1].get(tag):
confidence_all[-1][tag].append(confidence)
else:
confidence_all[-1][tag] = [confidence]
else:
if confidence_all[-1].get(tag):
confidence_all[-1][tag].append(confidence)
else:
confidence_all[-1][tag] = [confidence]
# ---- DISPLAY ----
# build a structure which groups each token by its tag, so we can
# rebuild the original display name later.
@@ -257,13 +275,23 @@ def import_data(lines):
output = [
dict([(k, smartJoin(tokens)) for k, tokens in ingredient.items()]) for ingredient in data if len(ingredient)
]
# Add the marked-up display data
for i, v in enumerate(output):
output[i]["display"] = displayIngredient(display[i])
# Preclean Confidence
for i, c in enumerate(confidence_all):
avg_of_all = []
for k, v in c.items():
v = [float(x) for x in v]
avg = round(mean(v), 2)
avg_of_all.append(avg)
confidence_all[i][k] = avg
if avg_of_all:
confidence_all[i]["average"] = round(mean(avg_of_all), 2)
# Add the raw ingredient phrase
for i, v in enumerate(output):
output[i]["input"] = smartJoin([" ".join(tokens) for k, tokens in display[i]])
for i, _ in enumerate(output):
output[i]["input"] = smartJoin([" ".join(tokens) for _, tokens in display[i]])
output[i]["confidence"] = confidence_all[i]
return output

View File

@@ -3,9 +3,15 @@ from fractions import Fraction
from mealie.core.root_logger import get_logger
from mealie.schema.recipe import RecipeIngredient
from mealie.schema.recipe.recipe_ingredient import CreateIngredientFood, CreateIngredientUnit
from mealie.schema.recipe.recipe_ingredient import (
CreateIngredientFood,
CreateIngredientUnit,
IngredientConfidence,
ParsedIngredient,
RegisteredParser,
)
from .crfpp.processor import CRFIngredient, convert_list_to_crf_model
from . import brute, crfpp
logger = get_logger(__name__)
@@ -15,12 +21,41 @@ class ABCIngredientParser(ABC):
Abstract class for ingredient parsers.
"""
def parse_one(self, ingredient_string: str) -> ParsedIngredient:
pass
@abstractmethod
def parse(self, ingredients: list[str]) -> list[RecipeIngredient]:
def parse(self, ingredients: list[str]) -> list[ParsedIngredient]:
...
class CRFPPIngredientParser(ABCIngredientParser):
class BruteForceParser(ABCIngredientParser):
    """
    Ingredient parser that delegates to the ``brute`` strategy.
    """

    def __init__(self) -> None:
        pass

    def parse_one(self, ingredient: str) -> ParsedIngredient:
        """Run ``brute.parse`` on one line and wrap the result in a ParsedIngredient."""
        parsed = brute.parse(ingredient)

        recipe_ingredient = RecipeIngredient(
            unit=CreateIngredientUnit(name=parsed.unit),
            food=CreateIngredientFood(name=parsed.food),
            disable_amount=False,
            quantity=parsed.amount,
            note=parsed.note,
        )
        return ParsedIngredient(input=ingredient, ingredient=recipe_ingredient)

    def parse(self, ingredients: list[str]) -> list[ParsedIngredient]:
        """Parse every line in ``ingredients``, preserving order."""
        results = []
        for line in ingredients:
            results.append(self.parse_one(line))
        return results
class NLPParser(ABCIngredientParser):
"""
Class for CRFPP ingredient parsers.
"""
@@ -28,7 +63,7 @@ class CRFPPIngredientParser(ABCIngredientParser):
def __init__(self) -> None:
pass
def _crf_to_ingredient(self, crf_model: CRFIngredient) -> RecipeIngredient:
def _crf_to_ingredient(self, crf_model: crfpp.CRFIngredient) -> ParsedIngredient:
ingredient = None
try:
@@ -41,15 +76,37 @@ class CRFPPIngredientParser(ABCIngredientParser):
quantity=float(sum(Fraction(s) for s in crf_model.qty.split())),
)
except Exception as e:
logger.error(f"Failed to parse ingredient: {crf_model}: {e}")
# TODO: Capture some sort of state for the user to see that an exception occurred
logger.exception(e)
ingredient = RecipeIngredient(
title="",
note=crf_model.input,
)
return ingredient
return ParsedIngredient(
input=crf_model.input,
ingredient=ingredient,
confidence=IngredientConfidence(
quantity=crf_model.confidence.qty,
food=crf_model.confidence.name,
**crf_model.confidence.dict(),
),
)
def parse(self, ingredients: list[str]) -> list[RecipeIngredient]:
crf_models = convert_list_to_crf_model(ingredients)
def parse(self, ingredients: list[str]) -> list[ParsedIngredient]:
crf_models = crfpp.convert_list_to_crf_model(ingredients)
return [self._crf_to_ingredient(crf_model) for crf_model in crf_models]
# Maps a RegisteredParser member to the parser class that implements it.
# get_parser looks the class up here and instantiates it.
__registrar = {
    RegisteredParser.nlp: NLPParser,
    RegisteredParser.brute: BruteForceParser,
}
def get_parser(parser: RegisteredParser) -> ABCIngredientParser:
    """
    Return a new ingredient parser instance for the given enum value.

    Values that have no entry in the registrar fall back to NLPParser.
    """
    parser_cls = __registrar.get(parser, NLPParser)
    return parser_cls()

View File

@@ -1,14 +1,19 @@
from mealie.schema.recipe import RecipeIngredient
from mealie.services._base_http_service.http_services import UserHttpService
from .ingredient_parser import ABCIngredientParser, CRFPPIngredientParser
from .ingredient_parser import ABCIngredientParser, RegisteredParser, get_parser
class IngredientParserService(UserHttpService):
def __init__(self, parser: ABCIngredientParser = None, *args, **kwargs) -> None:
self.parser: ABCIngredientParser = parser() if parser else CRFPPIngredientParser()
parser: ABCIngredientParser
def __init__(self, parser: RegisteredParser = RegisteredParser.nlp, *args, **kwargs) -> None:
self.set_parser(parser)
super().__init__(*args, **kwargs)
def set_parser(self, parser: RegisteredParser) -> None:
self.parser = get_parser(parser)
def populate_item(self) -> None:
"""Satisfy abstract method"""
pass