mirror of
https://github.com/mealie-recipes/mealie.git
synced 2025-12-12 05:15:18 -05:00
feat: ✨ Add brute strategy to ingredient processor (#744)
* fix UI column width * words * update parser to support diff strats * add new model url * make button more visible * fix nutrition error * feat(backend): ✨ add 'brute' strategy for parsing ingredients * satisfy linter * update UI for creation page * feat(backend): ✨ log 422 errors in detail when not in PRODUCTION * add strategy selector Co-authored-by: Hayden <hay-kot@pm.me>
This commit is contained in:
@@ -1 +1,2 @@
|
||||
from .ingredient_parser import *
|
||||
from .ingredient_parser_service import *
|
||||
|
||||
1
mealie/services/parser_services/_helpers/__init__.py
Normal file
1
mealie/services/parser_services/_helpers/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from .string_utils import *
|
||||
23
mealie/services/parser_services/_helpers/string_utils.py
Normal file
23
mealie/services/parser_services/_helpers/string_utils.py
Normal file
@@ -0,0 +1,23 @@
|
||||
import re
|
||||
|
||||
# Anchored at the start of the string (used with ``.match``): a 1-6 character
# prefix (typically the amount), whitespace, one non-nested parenthetical and
# trailing whitespace. The previous pattern consumed the parenthetical's
# content two characters at a time, so it only matched content of even length
# ("(50ml)" matched, "(500ml)" did not).
compiled_match = re.compile(r".{1,6}\s\([^()]+\)\s")
# The first non-empty, non-nested parenthetical anywhere in the string.
compiled_search = re.compile(r"\([^()]+\)")


def move_parens_to_end(ing_str) -> str:
    """
    Moves a parenthetical that directly follows a short leading prefix to the
    end of the string, e.g. "1 (500ml) water" -> "1  water (500ml)".

    Only the first parenthetical is moved, and only when the string starts
    with 1-6 characters, whitespace, the parenthetical and more whitespace.
    The whitespace around the removed text is left in place, so the result
    can contain a double space. In every other case the string is returned
    unchanged.
    """
    if compiled_match.match(ing_str) is None:
        return ing_str

    found = compiled_search.search(ing_str)
    if found is None:
        # Cannot happen while compiled_match implies compiled_search; kept so a
        # future pattern change degrades to a no-op instead of AttributeError.
        return ing_str

    start, end = found.span()
    return ing_str[:start] + ing_str[end:] + " " + ing_str[start:end]
|
||||
|
||||
|
||||
def check_char(char, *eql) -> bool:
    """Return True when ``char`` equals at least one of the extra positional arguments."""
    for candidate in eql:
        if char == candidate:
            return True
    return False
|
||||
1
mealie/services/parser_services/brute/__init__.py
Normal file
1
mealie/services/parser_services/brute/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from .process import parse
|
||||
204
mealie/services/parser_services/brute/process.py
Normal file
204
mealie/services/parser_services/brute/process.py
Normal file
@@ -0,0 +1,204 @@
|
||||
import string
|
||||
import unicodedata
|
||||
from typing import Tuple
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from .._helpers import check_char, move_parens_to_end
|
||||
|
||||
|
||||
class BruteParsedIngredient(BaseModel):
    """Result of brute-force parsing a single ingredient string."""

    food: str = ""
    note: str = ""
    # Was `float = ""`: pydantic does not validate field defaults unless
    # `validate_all` is enabled, so an instance built without an amount carried
    # a str in a float field. 0.0 matches the initial amount used by `parse`.
    amount: float = 0.0
    unit: str = ""

    class Config:
        # Strip leading/trailing whitespace from string field values.
        anystr_strip_whitespace = True
|
||||
|
||||
|
||||
def parse_fraction(x):
    """
    Convert a fraction to a float.

    Accepts either a single Unicode vulgar-fraction character (e.g. "½") or an
    ASCII "numerator/denominator" string with integer parts (e.g. "3/4").

    Raises:
        ValueError: if ``x`` is neither of those forms, if a part is not an
            integer, or if the denominator is zero.
    """
    if len(x) == 1 and "fraction" in unicodedata.decomposition(x):
        # Ask Unicode for the character's numeric value instead of
        # hand-decoding the decomposition string. The old decoding read one
        # code point per side, so "⅒" (1/10) came back as 1.0 and "⅟" raised
        # IndexError rather than the ValueError callers handle.
        return unicodedata.numeric(x)

    parts = x.split("/")
    if len(parts) != 2:
        raise ValueError(f"not a fraction: {x!r}")

    try:
        # int() raises ValueError by itself for non-integer parts.
        return int(parts[0]) / int(parts[1])
    except ZeroDivisionError as err:
        raise ValueError(f"fraction with zero denominator: {x!r}") from err
|
||||
|
||||
|
||||
def parse_amount(ing_str) -> Tuple[float, str, str]:
    """
    Split a token such as "2", "1/2", "1.5", "½", "2½" or "500g" into its
    leading amount and whatever follows it.

    A comma inside the number is treated as a decimal separator, so "1,5" is
    read as 1.5.

    Returns:
        (amount, unit, note). ``unit`` is the text after the amount. If that
        text starts with "(" or "-" it is not taken as a unit: ``unit`` is
        returned empty and ``note`` holds the whole input string.

    Raises:
        ValueError: if the token is empty or does not start with an amount.
    """

    def keep_looping(ing_str, end) -> bool:
        """
        Tell whether the character at ``end`` still belongs to the number:
        1. False once the end of the string is reached
        2. True if the character is a digit
        3. True if it is ".", "," or "/" directly followed by a digit
           (e.g. 1/2, 1.3, 1,500)
        """
        if end >= len(ing_str):
            return False

        if ing_str[end] in string.digits:
            return True

        # Return an explicit bool; this previously fell through and returned None.
        return (
            check_char(ing_str[end], ".", ",", "/")
            and end + 1 < len(ing_str)
            and ing_str[end + 1] in string.digits
        )

    if not ing_str:
        # Previously surfaced as IndexError from ing_str[0], which callers
        # (they catch ValueError) do not handle.
        raise ValueError("cannot parse an amount from an empty string")

    unit = ""
    note = ""

    # Find the end of the leading run of number-like characters.
    end = 0
    while keep_looping(ing_str, end):
        end += 1

    if end > 0:
        number = ing_str[:end]
        if "/" in number:
            amount = parse_fraction(number)
        else:
            amount = float(number.replace(",", "."))

        if end < len(ing_str):
            try:
                # A fraction glued onto the number, as in "2½".
                amount += parse_fraction(ing_str[end])
                unit = ing_str[end + 1 :]
            except ValueError:
                unit = ing_str[end:]
    else:
        # No leading digits: the first character must itself be a fraction
        # (e.g. "½"); parse_fraction raises ValueError otherwise.
        amount = parse_fraction(ing_str[0])
        unit = ing_str[1:]

    # No known unit starts with "(" or "-", so this is most likely an
    # alternative amount such as "1L (500ml) Water" or a range such as "2-3".
    if unit.startswith(("(", "-")):
        unit = ""
        note = ing_str

    return amount, unit, note
|
||||
|
||||
|
||||
def parse_ingredient_with_comma(tokens) -> Tuple[str, str]:
    """
    Split ``tokens`` at the first token that ends in a comma.

    Everything up to and including that token (minus the trailing comma)
    becomes the ingredient and the remaining tokens become the note. When no
    token ends in a comma, all tokens form the ingredient and the note is "".
    """
    comma_idx = next((idx for idx, tok in enumerate(tokens) if tok.endswith(",")), None)

    if comma_idx is None:
        # No comma-terminated token: everything is the ingredient.
        return " ".join(tokens), ""

    head = " ".join(tokens[: comma_idx + 1])
    tail = " ".join(tokens[comma_idx + 1 :])
    # Drop the comma that terminates the ingredient part.
    return head[:-1], tail
|
||||
|
||||
|
||||
def parse_ingredient(tokens) -> Tuple[str, str]:
    """
    Split ``tokens`` into an ingredient name and a note.

    A trailing bracketed group ("flour (sifted)") becomes the note with its
    brackets removed. Otherwise the tokens are split at the first token ending
    in a comma via ``parse_ingredient_with_comma``.

    Returns:
        (ingredient, note)

    Raises:
        ValueError: if the bracketed group starts at the first token, i.e. the
            whole list is wrapped in brackets. Callers treat this as a sign
            that an earlier token was wrongly taken as the unit.
    """
    if not tokens:
        return "", ""

    last = tokens[-1]
    if not last.endswith(")"):
        return parse_ingredient_with_comma(tokens)

    # The matching opening bracket sits inside the last token itself.
    if not last.startswith("(") and "(" in last:
        return parse_ingredient_with_comma(tokens)

    # Walk backwards to the nearest token that opens a bracket. The index is
    # allowed to run to -1 so "not found" can be told apart from "found at 0";
    # the old loop stopped at 0, which made the "no opening bracket" branch
    # unreachable and raised ValueError for a stray closing bracket.
    start = len(tokens) - 1
    while start >= 0 and not tokens[start].startswith("("):
        start -= 1

    if start < 0:
        # No opening bracket anywhere -> just ignore the last bracket.
        return parse_ingredient_with_comma(tokens)

    if start == 0:
        # The whole list is wrapped in brackets -> assume it is an error
        # (e.g. the first argument was wrongly assumed to be the unit).
        raise ValueError("ingredient tokens are entirely wrapped in brackets")

    # Opening bracket found -> split into ingredient and note, and remove the
    # brackets from the note.
    note = " ".join(tokens[start:])[1:-1]
    ingredient = " ".join(tokens[:start])
    return ingredient, note
|
||||
|
||||
|
||||
def parse(ing_str) -> BruteParsedIngredient:
    """
    Parse one ingredient line into amount, unit, food and note.

    A parenthetical directly after the amount is first moved to the end
    (``move_parens_to_end``) and the line is split on whitespace. The first
    token is tried as the amount, the second as an additional fraction
    ("2 1/2") or else as the unit, and the remaining tokens are split into
    food and note by ``parse_ingredient``.

    Empty or whitespace-only input yields an ingredient with empty fields and
    amount 0 (this previously raised IndexError).
    """
    amount = 0
    unit = ""
    ingredient = ""
    note = ""
    unit_note = ""

    ing_str = move_parens_to_end(ing_str)

    tokens = ing_str.split()

    # Nothing to parse: return the empty result rather than indexing tokens[0].
    if not tokens:
        return BruteParsedIngredient(food=ingredient, note=note, amount=amount, unit=unit)

    # Early return if the ingredient is a single token and therefore has no
    # other properties.
    if len(tokens) == 1:
        ingredient = tokens[0]
        return BruteParsedIngredient(food=ingredient, note=note, amount=amount, unit=unit)

    try:
        # Try to parse the first argument as the amount.
        amount, unit, unit_note = parse_amount(tokens[0])
        # Only try to parse the second argument as an amount if there are at
        # least three arguments. If the first one already carried a unit there
        # can't be a fraction for the amount.
        if len(tokens) > 2:
            try:
                if unit != "":
                    # A unit was already found, so skip the fraction attempt.
                    # Raising here deliberately reuses the except branch below
                    # instead of duplicating it.
                    raise ValueError
                # Try to parse the second argument as an amount and add it,
                # in case of '2 1/2' or '2 ½'.
                amount += parse_fraction(tokens[1])
                # Assume that units can't end with a comma.
                if len(tokens) > 3 and not tokens[2].endswith(","):
                    # Try the third argument as the unit and everything after
                    # it as the ingredient; if that fails, use everything from
                    # the third argument on as the ingredient.
                    try:
                        ingredient, note = parse_ingredient(tokens[3:])
                        unit = tokens[2]
                    except ValueError:
                        ingredient, note = parse_ingredient(tokens[2:])
                else:
                    ingredient, note = parse_ingredient(tokens[2:])
            except ValueError:
                # Assume that units can't end with a comma.
                if not tokens[1].endswith(","):
                    # Try the second argument as the unit and everything after
                    # it as the ingredient; if that fails, use everything from
                    # the second argument on as the ingredient.
                    try:
                        ingredient, note = parse_ingredient(tokens[2:])
                        if unit == "":
                            unit = tokens[1]
                        else:
                            note = tokens[1]
                    except ValueError:
                        ingredient, note = parse_ingredient(tokens[1:])
                else:
                    ingredient, note = parse_ingredient(tokens[1:])
        else:
            # Only two arguments and the first one is the amount, which means
            # the second one is the ingredient.
            ingredient = tokens[1]
    except ValueError:
        try:
            # Can't parse the first argument as an amount
            # -> no unit -> parse everything as the ingredient.
            ingredient, note = parse_ingredient(tokens)
        except ValueError:
            ingredient = " ".join(tokens[1:])

    # Append the note produced by parse_amount unless the note already
    # contains it (an empty unit_note is contained in every string).
    if unit_note not in note:
        note += " " + unit_note

    return BruteParsedIngredient(food=ingredient, note=note, amount=amount, unit=unit)
|
||||
@@ -0,0 +1 @@
|
||||
from .processor import *
|
||||
|
||||
@@ -2,23 +2,25 @@ import re
|
||||
import unicodedata
|
||||
|
||||
replace_abbreviations = {
|
||||
"cup ": "cup ",
|
||||
" g ": "gram ",
|
||||
"kg ": "kilogram ",
|
||||
"lb ": "pound ",
|
||||
"ml ": "milliliter ",
|
||||
"oz ": "ounce ",
|
||||
"pint ": "pint ",
|
||||
"qt ": "quart ",
|
||||
"tbs ": "tablespoon ",
|
||||
"tbsp ": "tablespoon ",
|
||||
"tsp ": "teaspoon ",
|
||||
"cup": " cup ",
|
||||
"g": " gram ",
|
||||
"kg": " kilogram ",
|
||||
"lb": " pound ",
|
||||
"ml": " milliliter ",
|
||||
"oz": " ounce ",
|
||||
"pint": " pint ",
|
||||
"qt": " quart ",
|
||||
"tbsp": " tablespoon ",
|
||||
"tbs": " tablespoon ", # Order Matters!, 'tsb' must come after 'tbsp' incase of duplicate matches
|
||||
"tsp": " teaspoon ",
|
||||
}
|
||||
|
||||
|
||||
def replace_common_abbreviations(string: str) -> str:
|
||||
|
||||
for k, v in replace_abbreviations.items():
|
||||
string = string.replace(k, v)
|
||||
regex = rf"(?<=\d)\s?({k}s?)"
|
||||
string = re.sub(regex, v, string)
|
||||
|
||||
return string
|
||||
|
||||
@@ -81,17 +83,3 @@ def pre_process_string(string: str) -> str:
|
||||
string = wrap_or_clause(string)
|
||||
|
||||
return string
|
||||
|
||||
|
||||
def main():
|
||||
# TODO: Migrate to unittests
|
||||
print("Starting...")
|
||||
print(pre_process_string("1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt, plus more"))
|
||||
print(pre_process_string("1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt"))
|
||||
print(pre_process_string("¼ cup michiu tou or other rice wine"))
|
||||
print(pre_process_string("1 tbs. wine, expensive or other white wine, plus more"))
|
||||
print("Finished...")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -12,6 +12,14 @@ CWD = Path(__file__).parent
|
||||
MODEL_PATH = CWD / "model.crfmodel"
|
||||
|
||||
|
||||
class CRFConfidence(BaseModel):
    """Confidence values carried by a ``CRFIngredient`` in its ``confidence`` field."""

    # Defaults to 0.0 when no value is supplied.
    average: float = 0.0
    # One value per field below; each defaults to None when not supplied.
    # NOTE(review): presumably these mirror the comment/name/unit/qty fields
    # of CRFIngredient - confirm against the code that builds the model.
    comment: float = None
    name: float = None
    unit: float = None
    qty: float = None
|
||||
|
||||
|
||||
class CRFIngredient(BaseModel):
|
||||
input: str = ""
|
||||
name: str = ""
|
||||
@@ -19,15 +27,19 @@ class CRFIngredient(BaseModel):
|
||||
qty: str = ""
|
||||
comment: str = ""
|
||||
unit: str = ""
|
||||
confidence: CRFConfidence
|
||||
|
||||
@validator("qty", always=True, pre=True)
|
||||
def validate_qty(qty, values): # sourcery skip: merge-nested-ifs
|
||||
if qty is None or qty == "":
|
||||
# Check if other contains a fraction
|
||||
if values["other"] is not None and values["other"].find("/") != -1:
|
||||
return float(Fraction(values["other"])).__round__(1)
|
||||
else:
|
||||
return 1
|
||||
try:
|
||||
if values["other"] is not None and values["other"].find("/") != -1:
|
||||
return float(Fraction(values["other"])).__round__(1)
|
||||
else:
|
||||
return 1
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return qty
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import re
|
||||
from statistics import mean
|
||||
|
||||
from . import tokenizer
|
||||
|
||||
@@ -179,6 +180,9 @@ def import_data(lines):
|
||||
data = [{}]
|
||||
display = [[]]
|
||||
prevTag = None
|
||||
|
||||
confidence_all = [{}]
|
||||
|
||||
#
|
||||
# iterate lines in the data file, which looks like:
|
||||
#
|
||||
@@ -208,6 +212,8 @@ def import_data(lines):
|
||||
display.append([])
|
||||
prevTag = None
|
||||
|
||||
confidence_all.append({})
|
||||
|
||||
# ignore comments
|
||||
elif line[0] == "#":
|
||||
pass
|
||||
@@ -226,6 +232,18 @@ def import_data(lines):
|
||||
tag, confidence = re.split(r"/", columns[-1], 1)
|
||||
tag = re.sub("^[BI]\-", "", tag).lower() # noqa: W605 - invalid dscape sequence
|
||||
|
||||
# ====================
|
||||
# Confidence Getter
|
||||
if prevTag != tag:
|
||||
if confidence_all[-1].get(tag):
|
||||
confidence_all[-1][tag].append(confidence)
|
||||
else:
|
||||
confidence_all[-1][tag] = [confidence]
|
||||
else:
|
||||
if confidence_all[-1].get(tag):
|
||||
confidence_all[-1][tag].append(confidence)
|
||||
else:
|
||||
confidence_all[-1][tag] = [confidence]
|
||||
# ---- DISPLAY ----
|
||||
# build a structure which groups each token by its tag, so we can
|
||||
# rebuild the original display name later.
|
||||
@@ -257,13 +275,23 @@ def import_data(lines):
|
||||
output = [
|
||||
dict([(k, smartJoin(tokens)) for k, tokens in ingredient.items()]) for ingredient in data if len(ingredient)
|
||||
]
|
||||
# Add the marked-up display data
|
||||
for i, v in enumerate(output):
|
||||
output[i]["display"] = displayIngredient(display[i])
|
||||
|
||||
# Preclean Confidence
|
||||
for i, c in enumerate(confidence_all):
|
||||
avg_of_all = []
|
||||
for k, v in c.items():
|
||||
v = [float(x) for x in v]
|
||||
avg = round(mean(v), 2)
|
||||
avg_of_all.append(avg)
|
||||
confidence_all[i][k] = avg
|
||||
|
||||
if avg_of_all:
|
||||
confidence_all[i]["average"] = round(mean(avg_of_all), 2)
|
||||
|
||||
# Add the raw ingredient phrase
|
||||
for i, v in enumerate(output):
|
||||
output[i]["input"] = smartJoin([" ".join(tokens) for k, tokens in display[i]])
|
||||
for i, _ in enumerate(output):
|
||||
output[i]["input"] = smartJoin([" ".join(tokens) for _, tokens in display[i]])
|
||||
output[i]["confidence"] = confidence_all[i]
|
||||
|
||||
return output
|
||||
|
||||
|
||||
@@ -3,9 +3,15 @@ from fractions import Fraction
|
||||
|
||||
from mealie.core.root_logger import get_logger
|
||||
from mealie.schema.recipe import RecipeIngredient
|
||||
from mealie.schema.recipe.recipe_ingredient import CreateIngredientFood, CreateIngredientUnit
|
||||
from mealie.schema.recipe.recipe_ingredient import (
|
||||
CreateIngredientFood,
|
||||
CreateIngredientUnit,
|
||||
IngredientConfidence,
|
||||
ParsedIngredient,
|
||||
RegisteredParser,
|
||||
)
|
||||
|
||||
from .crfpp.processor import CRFIngredient, convert_list_to_crf_model
|
||||
from . import brute, crfpp
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
@@ -15,12 +21,41 @@ class ABCIngredientParser(ABC):
|
||||
Abstract class for ingredient parsers.
|
||||
"""
|
||||
|
||||
def parse_one(self, ingredient_string: str) -> ParsedIngredient:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def parse(self, ingredients: list[str]) -> list[RecipeIngredient]:
|
||||
def parse(self, ingredients: list[str]) -> list[ParsedIngredient]:
|
||||
...
|
||||
|
||||
|
||||
class CRFPPIngredientParser(ABCIngredientParser):
|
||||
class BruteForceParser(ABCIngredientParser):
    """
    Ingredient parser that delegates to ``brute.parse``.
    """

    def __init__(self) -> None:
        pass

    def parse_one(self, ingredient: str) -> ParsedIngredient:
        """Parse one ingredient string and wrap the result as a ParsedIngredient."""
        parsed = brute.parse(ingredient)

        recipe_ingredient = RecipeIngredient(
            unit=CreateIngredientUnit(name=parsed.unit),
            food=CreateIngredientFood(name=parsed.food),
            disable_amount=False,
            quantity=parsed.amount,
            note=parsed.note,
        )

        return ParsedIngredient(input=ingredient, ingredient=recipe_ingredient)

    def parse(self, ingredients: list[str]) -> list[ParsedIngredient]:
        """Parse every ingredient string, preserving the input order."""
        results = []
        for raw in ingredients:
            results.append(self.parse_one(raw))
        return results
|
||||
|
||||
|
||||
class NLPParser(ABCIngredientParser):
|
||||
"""
|
||||
Class for CRFPP ingredient parsers.
|
||||
"""
|
||||
@@ -28,7 +63,7 @@ class CRFPPIngredientParser(ABCIngredientParser):
|
||||
def __init__(self) -> None:
|
||||
pass
|
||||
|
||||
def _crf_to_ingredient(self, crf_model: CRFIngredient) -> RecipeIngredient:
|
||||
def _crf_to_ingredient(self, crf_model: crfpp.CRFIngredient) -> ParsedIngredient:
|
||||
ingredient = None
|
||||
|
||||
try:
|
||||
@@ -41,15 +76,37 @@ class CRFPPIngredientParser(ABCIngredientParser):
|
||||
quantity=float(sum(Fraction(s) for s in crf_model.qty.split())),
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to parse ingredient: {crf_model}: {e}")
|
||||
# TODO: Capture some sort of state for the user to see that an exception occured
|
||||
logger.exception(e)
|
||||
ingredient = RecipeIngredient(
|
||||
title="",
|
||||
note=crf_model.input,
|
||||
)
|
||||
|
||||
return ingredient
|
||||
return ParsedIngredient(
|
||||
input=crf_model.input,
|
||||
ingredient=ingredient,
|
||||
confidence=IngredientConfidence(
|
||||
quantity=crf_model.confidence.qty,
|
||||
food=crf_model.confidence.name,
|
||||
**crf_model.confidence.dict(),
|
||||
),
|
||||
)
|
||||
|
||||
def parse(self, ingredients: list[str]) -> list[RecipeIngredient]:
|
||||
crf_models = convert_list_to_crf_model(ingredients)
|
||||
def parse(self, ingredients: list[str]) -> list[ParsedIngredient]:
|
||||
crf_models = crfpp.convert_list_to_crf_model(ingredients)
|
||||
return [self._crf_to_ingredient(crf_model) for crf_model in crf_models]
|
||||
|
||||
|
||||
# Maps each RegisteredParser member to the parser class that get_parser
# instantiates for it.
__registrar = {
    RegisteredParser.nlp: NLPParser,
    RegisteredParser.brute: BruteForceParser,
}
|
||||
|
||||
|
||||
def get_parser(parser: RegisteredParser) -> ABCIngredientParser:
    """
    Return a new ingredient parser instance for the given enum value.

    Values that have no entry in the registrar fall back to NLPParser.
    """
    parser_cls = __registrar.get(parser, NLPParser)
    return parser_cls()
|
||||
|
||||
@@ -1,14 +1,19 @@
|
||||
from mealie.schema.recipe import RecipeIngredient
|
||||
from mealie.services._base_http_service.http_services import UserHttpService
|
||||
|
||||
from .ingredient_parser import ABCIngredientParser, CRFPPIngredientParser
|
||||
from .ingredient_parser import ABCIngredientParser, RegisteredParser, get_parser
|
||||
|
||||
|
||||
class IngredientParserService(UserHttpService):
|
||||
def __init__(self, parser: ABCIngredientParser = None, *args, **kwargs) -> None:
|
||||
self.parser: ABCIngredientParser = parser() if parser else CRFPPIngredientParser()
|
||||
parser: ABCIngredientParser
|
||||
|
||||
def __init__(self, parser: RegisteredParser = RegisteredParser.nlp, *args, **kwargs) -> None:
|
||||
self.set_parser(parser)
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def set_parser(self, parser: RegisteredParser) -> None:
|
||||
self.parser = get_parser(parser)
|
||||
|
||||
def populate_item(self) -> None:
|
||||
"""Satisfy abstract method"""
|
||||
pass
|
||||
|
||||
Reference in New Issue
Block a user