mirror of
https://github.com/mealie-recipes/mealie.git
synced 2025-12-15 06:45:23 -05:00
Feature/CRF++ and server side locales (#731)
* add universal toast plugin * add server side locales * integrate CRF++ into CI/CD Pipeline * docs(docs): 📝 add recipe parser docs * feat(backend): ✨ Continued work on ingredient parsers * add new model dest * feat(frontend): ✨ New ingredient parser page * formatting Co-authored-by: Hayden <hay-kot@pm.me>
This commit is contained in:
1
mealie/services/parser_services/__init__.py
Normal file
1
mealie/services/parser_services/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from .ingredient_parser_service import *
|
||||
0
mealie/services/parser_services/crfpp/__init__.py
Normal file
0
mealie/services/parser_services/crfpp/__init__.py
Normal file
97
mealie/services/parser_services/crfpp/pre_processor.py
Normal file
97
mealie/services/parser_services/crfpp/pre_processor.py
Normal file
@@ -0,0 +1,97 @@
|
||||
import re
|
||||
import unicodedata
|
||||
|
||||
# Maps unit abbreviations to the canonical unit names the CRF++ model was
# trained on. Keys are matched as whole words only (see
# replace_common_abbreviations below).
replace_abbreviations = {
    "cup": "cup",
    "g": "gram",
    "kg": "kilogram",
    "lb": "pound",
    "ml": "milliliter",
    "oz": "ounce",
    "pint": "pint",
    "qt": "quart",
    "tbs": "tablespoon",
    "tbsp": "tablespoon",
    "tsp": "teaspoon",
}


def replace_common_abbreviations(string: str) -> str:
    """
    Expand common unit abbreviations into the full unit names.

    Matches are anchored on word boundaries so abbreviations inside other
    words are left alone — the previous substring replace turned
    "bulb " into "bupound " because it contained "lb ".
    """
    for abbreviation, full_name in replace_abbreviations.items():
        string = re.sub(rf"(?<!\w){re.escape(abbreviation)}\b", full_name, string)

    return string
def remove_periods(string: str) -> str:
    """Remove periods that are not adjacent to a digit (keeps decimals like 1.5)."""
    pattern = r"(?<!\d)\.(?!\d)"
    return re.sub(pattern, "", string)
def replace_fraction_unicode(string: str) -> str:
    """
    Replace unicode vulgar-fraction characters (½, ¼, …) with an ascii
    "n/d" form, preceded by a space, then collapse any double spaces the
    substitution introduced.

    The previous version returned after the first fraction character it
    found, so a string with two different unicode fractions was only
    partially converted.
    """
    replaced = False
    for char in set(string):
        try:
            char_name = unicodedata.name(char)
        except ValueError:
            # unnamed characters cannot be vulgar fractions
            continue
        if char_name.startswith("VULGAR FRACTION"):
            # NFKC turns e.g. "½" into "1⁄2" (U+2044 fraction slash)
            normalized = unicodedata.normalize("NFKC", char)
            numerator, _, denominator = normalized.partition("⁄")  # _ = slash
            string = string.replace(char, f" {numerator}/{denominator}")
            replaced = True

    if replaced:
        return string.replace("  ", " ")

    return string
def wrap_or_clause(string: str) -> str:
    """
    Attempt to wrap an "or" clause in parentheses.

    Examples:
        '1 teaspoon diamond crystal or 1/2 teaspoon morton kosher salt, plus more'
        -> '1 teaspoon diamond crystal (or 1/2 teaspoon morton kosher salt), plus more'
    """
    # TODO: Needs more adequate testing to be sure this doesn't have side effects.
    before, sep, after = string.partition(" or ")
    if not sep:
        # no " or " in the string; nothing to wrap
        return string

    # Close the parenthesis at the first comma after the "or" clause and
    # keep everything past that comma intact. (The old split/''.join
    # version dropped the commas between later segments and silently
    # discarded text after a second " or ".)
    alternative, _, remainder = after.partition(",")

    return f"{before} (or {alternative}),{remainder}".strip().removesuffix(",")
def pre_process_string(string: str) -> str:
    """
    Run the full pre-processing pipeline so a raw ingredient line makes the
    best use of the CRF++ model. The ideal string looks something like...

    {qty} {unit} {food}, {additional}
    1 tbs. wine, expensive or other white wine, plus more
    """
    # lowercase, normalize unicode fractions, strip stray periods, then
    # expand unit abbreviations — in that order
    string = replace_common_abbreviations(remove_periods(replace_fraction_unicode(string.lower())))

    # wrap a trailing "or ..." alternative in parentheses when present
    return wrap_or_clause(string) if " or " in string else string
def main():
    """Ad-hoc smoke test for the pre-processing pipeline."""
    # TODO: Migrate to unittests
    samples = (
        "1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt, plus more",
        "1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt",
        "¼ cup michiu tou or other rice wine",
        "1 tbs. wine, expensive or other white wine, plus more",
    )

    print("Starting...")
    for sample in samples:
        print(pre_process_string(sample))
    print("Finished...")


if __name__ == "__main__":
    main()
46
mealie/services/parser_services/crfpp/processor.py
Normal file
46
mealie/services/parser_services/crfpp/processor.py
Normal file
@@ -0,0 +1,46 @@
|
||||
import subprocess
|
||||
import tempfile
|
||||
from fractions import Fraction
|
||||
from pathlib import Path
|
||||
|
||||
from pydantic import BaseModel, validator
|
||||
|
||||
from . import utils
|
||||
from .pre_processor import pre_process_string
|
||||
|
||||
# Directory this module lives in; the trained model ships alongside the code.
CWD = Path(__file__).parent
# Pre-trained CRF++ model file handed to the crf_test binary.
MODEL_PATH = CWD / "model.crfmodel"
class CRFIngredient(BaseModel):
    """A single ingredient line as tagged by the CRF++ model."""

    input: str = ""
    name: str = ""
    other: str = ""
    qty: str = ""
    comment: str = ""
    unit: str = ""

    @validator("qty", always=True, pre=True)
    def validate_qty(qty, values):  # sourcery skip: merge-nested-ifs
        """
        Default a missing quantity: use a fraction found in `other` when
        one is present, otherwise fall back to 1.
        """
        if qty is None or qty == "":
            # "other" is declared before "qty", so it has already been
            # validated and is available in `values` here.
            other = values.get("other")
            if other is not None and "/" in other:
                # use round() rather than calling the __round__ dunder directly
                return round(float(Fraction(other)), 1)
            return 1

        return qty
def _exec_crf_test(input_text):
    """
    Run the crf_test binary over the given ingredient lines and return its
    verbose output as a decoded string.
    """
    with tempfile.NamedTemporaryFile(mode="w") as crf_input:
        crf_input.write(utils.export_data(input_text))
        # make sure crf_test sees the full file before it is read
        crf_input.flush()

        command = ["crf_test", "--verbose=1", "--model", MODEL_PATH, crf_input.name]
        return subprocess.check_output(command).decode("utf-8")
def convert_list_to_crf_model(list_of_ingrdeint_text: list[str]):
    """
    Pre-process each raw ingredient string, run the batch through crf_test,
    and map the tagged output onto CRFIngredient models.
    """
    cleaned_lines = [pre_process_string(line) for line in list_of_ingrdeint_text]
    crf_output = _exec_crf_test(cleaned_lines)

    parsed_ingredients = utils.import_data(crf_output.split("\n"))
    return [CRFIngredient(**parsed) for parsed in parsed_ingredients]
38
mealie/services/parser_services/crfpp/tokenizer.py
Normal file
38
mealie/services/parser_services/crfpp/tokenizer.py
Normal file
@@ -0,0 +1,38 @@
|
||||
import re
|
||||
|
||||
|
||||
def clumpFractions(s):
    """
    Glue the whole and fractional parts of a mixed quantity together with a
    dollar sign so downstream tokenization treats them as one token. The
    rest of the string is untouched.

    clumpFractions("aaa 1 2/3 bbb")
    # => "aaa 1$2/3 bbb"
    """
    mixed_number = re.compile(r"(\d+)\s+(\d)/(\d)")
    return mixed_number.sub(r"\1$\2/\3", s)
def tokenize(s):
    """
    Tokenize on parenthesis, punctuation, spaces and American units followed by a slash.

    We sometimes give American units and metric units for baking recipes. For example:
        * 2 tablespoons/30 mililiters milk or cream
        * 2 1/2 cups/300 grams all-purpose flour
    The recipe database only allows for one unit, and we want to use the American one.
    But we must split the text on "cups/" etc. in order to pick it up.
    """
    # expand attached metric abbreviations, e.g. "100g" -> "100 grams"
    s = re.sub(r"(\d+)g", r"\1 grams", s)
    s = re.sub(r"(\d+)oz", r"\1 ounces", s)
    s = re.sub(r"(\d+)ml", r"\1 milliliters", s, flags=re.IGNORECASE)

    # TODO: Replace american_units with list of units from database?
    american_units = ["cup", "tablespoon", "teaspoon", "pound", "ounce", "quart", "pint"]
    # Replace the slash that sometimes follows an American unit with a space.
    for unit in american_units:
        for suffix in ("", "s"):
            s = s.replace(unit + suffix + "/", unit + suffix + " ")

    pieces = re.split(r"([,()\s]{1})", clumpFractions(s))
    return [piece.strip() for piece in pieces if piece and piece.strip()]
||||
282
mealie/services/parser_services/crfpp/utils.py
Normal file
282
mealie/services/parser_services/crfpp/utils.py
Normal file
@@ -0,0 +1,282 @@
|
||||
import re
|
||||
|
||||
from . import tokenizer
|
||||
|
||||
|
||||
def joinLine(columns):
    """Join feature columns into a single tab-separated CRF++ line."""
    tab = "\t"
    return tab.join(columns)
def cleanUnicodeFractions(s):
    """
    Replace unicode vulgar-fraction characters with their ascii
    representation, preceded by a space.

    "1\u215e" => "1 7/8"
    """

    # NOTE(review): the original table used "\x215b"-style escapes, which
    # Python reads as "\x21" plus the literal text "5b" ("!5b"), so most
    # fractions were never matched. "\uXXXX" is the correct spelling.
    fractions = {
        "\u215b": "1/8",
        "\u215c": "3/8",
        "\u215d": "5/8",
        "\u215e": "7/8",
        "\u2159": "1/6",
        "\u215a": "5/6",
        "\u2155": "1/5",
        "\u2156": "2/5",
        "\u2157": "3/5",
        "\u2158": "4/5",
        "\u00bc": "1/4",
        "\u00be": "3/4",
        "\u2153": "1/3",
        "\u2154": "2/3",
        "\u00bd": "1/2",
    }

    for f_unicode, f_ascii in fractions.items():
        # every replacement is preceded by a space so "1½" becomes "1 1/2"
        s = s.replace(f_unicode, " " + f_ascii)

    return s
def unclump(s):
    """
    Replace the $'s inserted by clumpFractions with spaces — the inverse of
    that transformation.
    """
    return s.replace("$", " ")
def normalizeToken(s):
    """
    Normalize a token by singularizing it.

    ToDo: FIX THIS. The pattern.en package used to do the singularizing, but
    it was removed in the name of simple deployments; restore a proper
    implementation at some point.
    """
    return singularize(s)
def getFeatures(token, index, tokens):
    """
    Build the CRF++ feature column list for one token: its position, the
    bucketed phrase length, a capitalization flag, and a parenthesis flag.
    """
    phrase_length = len(tokens)
    cap_flag = ("Yes" if isCapitalized(token) else "No") + "CAP"
    paren_flag = ("Yes" if insideParenthesis(token, tokens) else "No") + "PAREN"

    return [
        "I%s" % index,
        "L%s" % lengthGroup(phrase_length),
        cap_flag,
        paren_flag,
    ]
def singularize(word):
    """
    A poor replacement for the pattern.en singularize function, but ok for
    now. Words not present in the table are returned unchanged.
    """
    units = {
        "cups": "cup",
        "tablespoons": "tablespoon",
        "teaspoons": "teaspoon",
        "pounds": "pound",
        "ounces": "ounce",
        "cloves": "clove",
        "sprigs": "sprig",
        "pinches": "pinch",
        "bunches": "bunch",
        "slices": "slice",
        "grams": "gram",
        "heads": "head",
        "quarts": "quart",
        "stalks": "stalk",
        "pints": "pint",
        "pieces": "piece",
        "sticks": "stick",
        "dashes": "dash",
        "fillets": "fillet",
        "cans": "can",
        "ears": "ear",
        "packages": "package",
        "strips": "strip",
        "bulbs": "bulb",
        "bottles": "bottle",
    }

    return units.get(word, word)
def isCapitalized(token):
    """Return True when the token starts with an uppercase ASCII letter."""
    return bool(re.match(r"^[A-Z]", token))
def lengthGroup(actualLength):
    """
    Bucket the ingredient phrase length into one of six buckets:
    "4", "8", "12", "16", "20" or "X" for anything longer.
    """
    for upper_bound in (4, 8, 12, 16, 20):
        if actualLength < upper_bound:
            return str(upper_bound)

    return "X"
def insideParenthesis(token, tokens):
    """
    Return True if the token appears inside parentheses in the phrase.

    Parenthesis tokens themselves always count as "inside".
    """
    if token in ("(", ")"):
        return True

    line = " ".join(tokens)
    # raw strings throughout — the original concatenated a non-raw ".*\).*",
    # an invalid escape sequence that was only noqa'd (W605)
    return re.match(r".*\(.*" + re.escape(token) + r".*\).*", line) is not None
def displayIngredient(ingredient):
    """
    Format a list of (tag, [tokens]) tuples as an HTML string for display.

    displayIngredient([("qty", ["1"]), ("name", ["cat", "pie"])])
    # => <span class='qty'>1</span><span class='name'>cat pie</span>
    """
    # NOTE: spans are joined with no separator — the original docstring
    # example showed a space between spans that the code never produced
    return "".join("<span class='%s'>%s</span>" % (tag, " ".join(tokens)) for tag, tokens in ingredient)
# HACK: fix this
def smartJoin(words):
    """
    Join a list of words with spaces, without introducing a space before
    commas or just inside parentheses.
    """
    # renamed from `input`, which shadowed the builtin
    joined = " ".join(words)

    # drop the space the join added before each comma
    joined = joined.replace(" , ", ", ")

    # drop the space after an opening parenthesis
    joined = joined.replace("( ", "(")

    # drop the space before a closing parenthesis
    joined = joined.replace(" )", ")")

    return joined
def import_data(lines):
    """
    Parse the verbose output of `crf_test` into a list of ingredient dicts.

    Each returned dict maps a tag ("qty", "unit", "name", "comment", ...) to
    the smart-joined tokens carrying that tag, plus two synthesized keys:
    "display" (HTML markup of the tagged phrase) and "input" (the raw
    phrase reassembled from the tokens). Ingredients are separated by blank
    lines in the crf_test output.
    """
    data = [{}]
    display = [[]]
    prevTag = None
    #
    # iterate lines in the data file, which looks like:
    #
    #   # 0.511035
    #   1/2       I1  L12  NoCAP  X  B-QTY/0.982850
    #   teaspoon  I2  L12  NoCAP  X  B-UNIT/0.982200
    #   fresh     I3  L12  NoCAP  X  B-COMMENT/0.716364
    #   thyme     I4  L12  NoCAP  X  B-NAME/0.816803
    #   leaves    I5  L12  NoCAP  X  I-NAME/0.960524
    #   ,         I6  L12  NoCAP  X  B-COMMENT/0.772231
    #   finely    I7  L12  NoCAP  X  I-COMMENT/0.825956
    #   chopped   I8  L12  NoCAP  X  I-COMMENT/0.893379
    #
    #   # 0.505999
    #   Black     I1  L8  YesCAP  X  B-NAME/0.765461
    #   pepper    I2  L8  NoCAP   X  I-NAME/0.756614
    #   ,         I3  L8  NoCAP   X  OTHER/0.798040
    #   to        I4  L8  NoCAP   X  B-COMMENT/0.683089
    #   taste     I5  L8  NoCAP   X  I-COMMENT/0.848617
    #
    # i.e. the output of crf_test -v 1
    #
    for line in lines:
        # blank line starts a new ingredient
        if line in ("", "\n"):
            data.append({})
            display.append([])
            prevTag = None

        # ignore comments (the "# 0.511035" sequence-confidence lines)
        elif line[0] == "#":
            pass

        # otherwise it's a token
        # e.g.: potato \t I2 \t L5 \t NoCAP \t B-NAME/0.978253
        else:

            columns = re.split("\t", line.strip())
            token = columns[0].strip()

            # unclump fractions (restore the spaces clumpFractions replaced with $)
            token = unclump(token)

            # turn B-NAME/123 back into "name"; the confidence after the
            # slash is split off and discarded
            tag, confidence = re.split(r"/", columns[-1], 1)
            tag = re.sub("^[BI]\-", "", tag).lower()  # noqa: W605 - invalid dscape sequence

            # ---- DISPLAY ----
            # build a structure which groups each token by its tag, so we can
            # rebuild the original display name later. Consecutive tokens with
            # the same tag share one (tag, [tokens]) entry.

            if prevTag != tag:
                display[-1].append((tag, [token]))
                prevTag = tag

            else:
                display[-1][-1][1].append(token)
                #          ^- token
                #       ^---- tag
                #   ^-------- ingredient

            # ---- DATA ----
            # build a dict grouping tokens by their tag

            # initialize this attribute if this is the first token of its kind
            if tag not in data[-1]:
                data[-1][tag] = []

            # HACK: If this token is a unit, singularize it so Scoop accepts it.
            if tag == "unit":
                token = singularize(token)

            data[-1][tag].append(token)

    # reassemble the output into a list of dicts, skipping empty ingredients
    output = [
        dict([(k, smartJoin(tokens)) for k, tokens in ingredient.items()]) for ingredient in data if len(ingredient)
    ]
    # Add the marked-up display data
    for i, v in enumerate(output):
        output[i]["display"] = displayIngredient(display[i])

    # Add the raw ingredient phrase
    for i, v in enumerate(output):
        output[i]["input"] = smartJoin([" ".join(tokens) for k, tokens in display[i]])

    return output
def export_data(lines):
    """ Parse "raw" ingredient lines into CRF-ready output """
    crf_lines = []
    for raw_line in lines:
        # strip any HTML-ish tags before tokenizing
        cleaned = re.sub("<[^<]+?>", "", raw_line)
        tokens = tokenizer.tokenize(cleaned)

        for position, token in enumerate(tokens, start=1):
            features = getFeatures(token, position, tokens)
            crf_lines.append(joinLine([token] + features))

        # blank line separates ingredients for crf_test
        crf_lines.append("")
    return "\n".join(crf_lines)
55
mealie/services/parser_services/ingredient_parser.py
Normal file
55
mealie/services/parser_services/ingredient_parser.py
Normal file
@@ -0,0 +1,55 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from fractions import Fraction
|
||||
|
||||
from mealie.core.root_logger import get_logger
|
||||
from mealie.schema.recipe import RecipeIngredient
|
||||
from mealie.schema.recipe.recipe_ingredient import CreateIngredientFood, CreateIngredientUnit
|
||||
|
||||
from .crfpp.processor import CRFIngredient, convert_list_to_crf_model
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class ABCIngredientParser(ABC):
    """
    Abstract base class that every ingredient parser implementation must
    satisfy.
    """

    @abstractmethod
    def parse(self, ingredients: list[str]) -> list[RecipeIngredient]:
        """Parse raw ingredient strings into RecipeIngredient objects."""
class CRFPPIngredientParser(ABCIngredientParser):
    """
    Ingredient parser backed by the CRF++ model.
    """

    def __init__(self) -> None:
        pass

    def _crf_to_ingredient(self, crf_model: CRFIngredient) -> RecipeIngredient:
        """
        Map one CRF result onto a RecipeIngredient; on any failure, fall
        back to an ingredient that carries the raw input text as its note.
        """
        try:
            # qty may be a mixed number like "1 1/2"; sum its parts
            quantity = float(sum(Fraction(part) for part in crf_model.qty.split()))
            return RecipeIngredient(
                title="",
                note=crf_model.comment,
                unit=CreateIngredientUnit(name=crf_model.unit),
                food=CreateIngredientFood(name=crf_model.name),
                disable_amount=False,
                quantity=quantity,
            )
        except Exception as e:
            # TODO: Capture some sort of state for the user to see that an exception occured
            logger.exception(e)
            return RecipeIngredient(
                title="",
                note=crf_model.input,
            )

    def parse(self, ingredients: list[str]) -> list[RecipeIngredient]:
        """Run every ingredient string through the CRF++ model."""
        crf_models = convert_list_to_crf_model(ingredients)
        return [self._crf_to_ingredient(model) for model in crf_models]
28
mealie/services/parser_services/ingredient_parser_service.py
Normal file
28
mealie/services/parser_services/ingredient_parser_service.py
Normal file
@@ -0,0 +1,28 @@
|
||||
from mealie.schema.recipe import RecipeIngredient
|
||||
from mealie.services._base_http_service.http_services import UserHttpService
|
||||
|
||||
from .ingredient_parser import ABCIngredientParser, CRFPPIngredientParser
|
||||
|
||||
|
||||
class IngredientParserService(UserHttpService):
    """HTTP service wrapper around a pluggable ingredient parser."""

    def __init__(self, parser: ABCIngredientParser = None, *args, **kwargs) -> None:
        # default to the CRF++ backed parser when no class is supplied
        if parser:
            self.parser: ABCIngredientParser = parser()
        else:
            self.parser = CRFPPIngredientParser()
        super().__init__(*args, **kwargs)

    def populate_item(self) -> None:
        """Satisfy abstract method"""
        pass

    def parse_ingredient(self, ingredient: str) -> RecipeIngredient:
        """Parse a single ingredient string."""
        results = self.parser.parse([ingredient])

        if results:
            return results[0]
        # TODO: Raise Exception

    def parse_ingredients(self, ingredients: list[str]) -> list[RecipeIngredient]:
        """Parse a batch of ingredient strings."""
        results = self.parser.parse(ingredients)

        if results:
            return results
        # TODO: Raise Exception
Reference in New Issue
Block a user