Feature/CRF++ and server side locales (#731)

* add universal toast plugin

* add server side locales

* integrate CRF++ into CI/CD Pipeline

* docs(docs): 📝 add recipe parser docs

* feat(backend):  Continued work on ingredient parsers

* add new model dest

* feat(frontend):  New ingredient parser page

* formatting

Co-authored-by: Hayden <hay-kot@pm.me>
This commit is contained in:
Hayden
2021-10-09 13:08:23 -08:00
committed by GitHub
parent c16f07950f
commit 60908e5a88
43 changed files with 610 additions and 186 deletions

View File

@@ -0,0 +1 @@
from .ingredient_parser_service import *

View File

@@ -0,0 +1,97 @@
import re
import unicodedata
# Unit abbreviations mapped to the full unit names they are expanded to.
# Whitespace inside a key is significant: a leading space means the
# abbreviation must directly follow whitespace; every key ends with a space
# because an abbreviation is only expanded when whitespace follows it.
replace_abbreviations = {
    "cup ": "cup ",
    " g ": "gram ",
    "kg ": "kilogram ",
    "lb ": "pound ",
    "ml ": "milliliter ",
    "oz ": "ounce ",
    "pint ": "pint ",
    "qt ": "quart ",
    "tbs ": "tablespoon ",
    "tbsp ": "tablespoon ",
    "tsp ": "teaspoon ",
}


def replace_common_abbreviations(string: str) -> str:
    """
    Expand the unit abbreviations listed in ``replace_abbreviations``.

    An abbreviation is only expanded when it is a token of its own: it must be
    followed by whitespace and must not be glued to a preceding letter, so
    words that merely end in an abbreviation ("bulb", "doz") are left alone.
    Keys written with a leading space (" g ") additionally require whitespace
    in front of them. The surrounding whitespace is never consumed, so
    "100 g flour" becomes "100 gram flour" rather than "100gram flour".

    Args:
        string: The ingredient text to expand.

    Returns:
        The text with every stand-alone abbreviation replaced by its full name.
    """
    for abbreviation, full_name in replace_abbreviations.items():
        if abbreviation[:1].isspace():
            # Key demands whitespace before the abbreviation.
            leading = r"(?<=\s)"
        else:
            # Digits may precede ("1tsp"), letters may not ("bulb").
            leading = r"(?<![^\W\d_])"
        pattern = leading + re.escape(abbreviation.strip()) + r"(?=\s)"
        replacement = full_name.strip()
        # A callable replacement keeps the text literal (no backslash handling).
        string = re.sub(pattern, lambda _match, text=replacement: text, string)
    return string
def remove_periods(string: str) -> str:
    """
    Remove periods that have no digit directly before or after them.

    A period touching a digit on either side (for example the decimal point
    in "1.5") is kept.
    """
    lone_period = re.compile(r"(?<!\d)\.(?!\d)")
    return lone_period.sub("", string)
def replace_fraction_unicode(string: str) -> str:
    """
    Replace unicode vulgar fractions with " numerator/denominator".

    Every character whose unicode name starts with "VULGAR FRACTION" is
    rewritten, not just the first one. Each replacement is preceded by a space
    so that "1½" becomes "1 1/2"; double spaces created this way are collapsed
    once at the end. A string without any vulgar fraction is returned
    unchanged.

    Args:
        string: The ingredient text to normalise.

    Returns:
        The text with vulgar fractions spelled out in ASCII.
    """
    pieces = []
    replaced = False
    for c in string:
        try:
            name = unicodedata.name(c)
        except ValueError:
            # Characters without a unicode name (e.g. control characters).
            pieces.append(c)
            continue

        if not name.startswith("VULGAR FRACTION"):
            pieces.append(c)
            continue

        # NFKC turns the single character into "<numerator>\u2044<denominator>",
        # where U+2044 is FRACTION SLASH.
        normalized = unicodedata.normalize("NFKC", c)
        numerator, _, denominator = normalized.partition("\u2044")
        pieces.append(f" {numerator}/{denominator}")
        replaced = True

    if not replaced:
        return string
    return "".join(pieces).replace("  ", " ")
def wrap_or_clause(string: str):
    """
    Attempts to wrap the first " or " clause in parentheses.

    The clause starts at the first " or " and runs up to the next comma (or to
    the end of the string when there is no comma). Everything after that comma
    is kept verbatim, including further commas. A string without " or " is
    returned unchanged, and a trailing comma on the result is dropped.

    Examples:
    '1 teaspoon diamond crystal or 1/2 teaspoon morton kosher salt, plus more'
    -> '1 teaspoon diamond crystal (or 1/2 teaspoon morton kosher salt), plus more'
    """
    # TODO: Needs more adequate testing to be sure this doesn't have side effects.
    before, separator, after = string.partition(" or ")
    if not separator:
        return string

    clause, comma, remainder = after.partition(",")
    return f"{before} (or {clause}){comma}{remainder}".strip().removesuffix(",")
def pre_process_string(string: str) -> str:
    """
    Normalise a raw ingredient line before it is handed to the CRF++ model.

    The text is lower-cased, then run through the unicode-fraction, period and
    abbreviation clean-ups in that order; finally, if it contains " or ", the
    or-clause is wrapped in parentheses. The ideal string looks something like...

    {qty} {unit} {food}, {additional}
    1 tbs. wine, expensive or other white wine, plus more
    """
    cleaned = string.lower()
    for step in (replace_fraction_unicode, remove_periods, replace_common_abbreviations):
        cleaned = step(cleaned)

    if " or " not in cleaned:
        return cleaned
    return wrap_or_clause(cleaned)
def main():
    """Print the pre-processed form of a few sample ingredient lines."""
    # TODO: Migrate to unittests
    samples = (
        "1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt, plus more",
        "1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt",
        "¼ cup michiu tou or other rice wine",
        "1 tbs. wine, expensive or other white wine, plus more",
    )

    print("Starting...")
    for sample in samples:
        print(pre_process_string(sample))
    print("Finished...")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,46 @@
import subprocess
import tempfile
from fractions import Fraction
from pathlib import Path
from pydantic import BaseModel, validator
from . import utils
from .pre_processor import pre_process_string
CWD = Path(__file__).parent
MODEL_PATH = CWD / "model.crfmodel"
class CRFIngredient(BaseModel):
    """
    One ingredient as tagged by the CRF++ model.

    Instances are built from the dictionaries returned by
    ``utils.import_data``; every field defaults to an empty string.
    """

    # The re-joined ingredient phrase ("input" key of import_data's output).
    input: str = ""
    name: str = ""
    other: str = ""
    qty: str = ""
    comment: str = ""
    unit: str = ""

    @validator("qty", always=True, pre=True)
    def validate_qty(qty, values):
        """
        Supply a quantity when the model did not tag one.

        A non-empty ``qty`` is returned untouched. Otherwise, if ``other``
        contains a "/" and parses as a fraction, that fraction rounded to one
        decimal place is used; in every other case the quantity defaults to 1.
        """
        if qty is not None and qty != "":
            return qty

        # ``other`` is missing from ``values`` when its own validation failed.
        other = values.get("other")
        if other is not None and "/" in other:
            try:
                return round(float(Fraction(other)), 1)
            except (ValueError, ZeroDivisionError):
                # Text such as "and/or" or "1/0" is not a usable fraction; fall
                # through to the default instead of failing the whole model.
                pass
        return 1
def _exec_crf_test(input_text):
    """
    Run the ``crf_test`` executable over the given ingredient lines.

    The lines are converted with ``utils.export_data`` and written to a
    temporary file, which is passed to ``crf_test`` together with
    ``MODEL_PATH``. The file is written as UTF-8 explicitly so that it matches
    the UTF-8 decoding applied to the program's output, independent of the
    process locale.

    Args:
        input_text: The ingredient lines, in the form ``utils.export_data`` accepts.

    Returns:
        The standard output of ``crf_test`` decoded as UTF-8.

    Raises:
        subprocess.CalledProcessError: If ``crf_test`` exits with a non-zero status.
    """
    with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8") as input_file:
        input_file.write(utils.export_data(input_text))
        # Make sure the data is on disk before the external process reads it.
        input_file.flush()
        crf_command = ["crf_test", "--verbose=1", "--model", MODEL_PATH, input_file.name]
        return subprocess.check_output(crf_command).decode("utf-8")
def convert_list_to_crf_model(list_of_ingrdeint_text: list[str]):
    """
    Tag a list of raw ingredient lines and return them as CRFIngredient models.

    Each line is pre-processed, the batch is run through ``crf_test``, and the
    output is parsed by ``utils.import_data`` into one model per ingredient.
    """
    prepared = [pre_process_string(text) for text in list_of_ingrdeint_text]
    crf_output = _exec_crf_test(prepared)
    parsed = utils.import_data(crf_output.split("\n"))
    return [CRFIngredient(**fields) for fields in parsed]

View File

@@ -0,0 +1,38 @@
import re
def clumpFractions(s):
    """
    Glue a whole number to the fraction that follows it.

    The whitespace between the integer part and the fractional part of a
    quantity is replaced by a dollar sign so the quantity is handled as one
    token; nothing else in the string is changed.

        clumpFractions("aaa 1 2/3 bbb")
        # => "aaa 1$2/3 bbb"
    """
    mixed_number = re.compile(r"(\d+)\s+(\d)/(\d)")
    return mixed_number.sub(r"\1$\2/\3", s)
def tokenize(s):
    """
    Tokenize on parenthesis, punctuation, spaces and American units followed by a slash.

    We sometimes give American units and metric units for baking recipes. For example:
        * 2 tablespoons/30 milliliters milk or cream
        * 2 1/2 cups/300 grams all-purpose flour

    The recipe database only allows for one unit, and we want to use the American one.
    But we must split the text on "cups/" etc. in order to pick it up.

    Returns:
        The list of non-empty tokens; commas and parentheses are tokens of
        their own, whitespace is dropped.
    """
    # Handle abbreviations like "100g" by treating them as "100 grams". The word
    # boundary keeps the rule from firing inside longer words: without it
    # "100grams" would become "100 gramsrams".
    s = re.sub(r"(\d+)g\b", r"\1 grams", s)
    s = re.sub(r"(\d+)oz\b", r"\1 ounces", s)
    s = re.sub(r"(\d+)ml\b", r"\1 milliliters", s, flags=re.IGNORECASE)

    # TODO: Replace american_units with list of units from database?
    american_units = ["cup", "tablespoon", "teaspoon", "pound", "ounce", "quart", "pint"]

    # The following removes slashes following American units and replaces it with a space.
    for unit in american_units:
        s = s.replace(unit + "/", unit + " ")
        s = s.replace(unit + "s/", unit + "s ")

    # The capturing group keeps the separators so commas and parentheses survive
    # as tokens; whitespace-only pieces are filtered out.
    pieces = re.split(r"([,()\s]{1})", clumpFractions(s))
    return [piece.strip() for piece in pieces if piece.strip()]

View File

@@ -0,0 +1,282 @@
import re
from . import tokenizer
def joinLine(columns):
    """Join the given column values into one tab-separated line."""
    separator = "\t"
    return separator.join(columns)
def cleanUnicodeFractions(s):
    """
    Replace unicode fractions with ascii representation, preceded by a
    space.

        "1\\u215e" => "1 7/8"
    """
    # The keys must be written with \\u escapes (or \\x for code points below
    # 0x100): "\\x215b" is the two-hex-digit escape "\\x21" ("!") followed by
    # the literal text "5b", not the character U+215B.
    fractions = {
        "\u215b": "1/8",
        "\u215c": "3/8",
        "\u215d": "5/8",
        "\u215e": "7/8",
        "\u2159": "1/6",
        "\u215a": "5/6",
        "\u2155": "1/5",
        "\u2156": "2/5",
        "\u2157": "3/5",
        "\u2158": "4/5",
        "\u00bc": "1/4",
        "\u00be": "3/4",
        "\u2153": "1/3",
        "\u2154": "2/3",
        "\u00bd": "1/2",
    }

    for f_unicode, f_ascii in fractions.items():
        s = s.replace(f_unicode, " " + f_ascii)

    return s
def unclump(s):
    """
    Turn every dollar sign back into a space. The reverse of clumpFractions.
    """
    return s.replace("$", " ")
def normalizeToken(s):
    """
    Return the singular form of a token, as far as ``singularize`` knows it.

    ToDo: FIX THIS. The pattern.en package used to do the singularizing, but it
    was removed in the name of simple deployments. This should be fixed at some
    point.
    """
    singular = singularize(s)
    return singular
def getFeatures(token, index, tokens):
    """
    Build the feature columns for one token.

    The result holds four strings, in this order: the token index ("I<index>"),
    the length bucket of the whole token list ("L<bucket>"), whether the token
    is capitalized ("YesCAP"/"NoCAP") and whether it counts as being inside
    parentheses ("YesPAREN"/"NoPAREN").
    """
    position_feature = "I%s" % index
    length_feature = "L%s" % lengthGroup(len(tokens))
    capital_feature = ("Yes" if isCapitalized(token) else "No") + "CAP"
    paren_feature = ("Yes" if insideParenthesis(token, tokens) else "No") + "PAREN"
    return [position_feature, length_feature, capital_feature, paren_feature]
def singularize(word):
    """
    Map a known plural unit word to its singular form.

    Only the plurals listed below are recognised; any other word is returned
    unchanged. A poor replacement for the pattern.en singularize function, but
    ok for now.
    """
    singular_by_plural = {
        "cups": "cup",
        "tablespoons": "tablespoon",
        "teaspoons": "teaspoon",
        "pounds": "pound",
        "ounces": "ounce",
        "cloves": "clove",
        "sprigs": "sprig",
        "pinches": "pinch",
        "bunches": "bunch",
        "slices": "slice",
        "grams": "gram",
        "heads": "head",
        "quarts": "quart",
        "stalks": "stalk",
        "pints": "pint",
        "pieces": "piece",
        "sticks": "stick",
        "dashes": "dash",
        "fillets": "fillet",
        "cans": "can",
        "ears": "ear",
        "packages": "package",
        "strips": "strip",
        "bulbs": "bulb",
        "bottles": "bottle",
    }
    return singular_by_plural.get(word, word)
def isCapitalized(token):
    """
    Tell whether the token's first character is an ASCII capital letter (A-Z).
    """
    leading_capital = re.compile(r"^[A-Z]")
    if leading_capital.match(token) is None:
        return False
    return True
def lengthGroup(actualLength):
    """
    Bucket a length into one of six groups.

    Returns the first of "4", "8", "12", "16", "20" that the length is strictly
    below, or "X" when the length is 20 or more.
    """
    upper_bounds = [4, 8, 12, 16, 20]
    return next((str(bound) for bound in upper_bounds if actualLength < bound), "X")
def insideParenthesis(token, tokens):
    """
    Returns true if the word is inside parenthesis in the phrase.

    A parenthesis token itself always counts as inside. For any other token the
    tokens are joined with spaces and the token counts as inside when it occurs
    somewhere after an opening and before a closing parenthesis. The match is
    on the joined text, so it is a textual approximation rather than a check
    of the token's own position.
    """
    if token in ["(", ")"]:
        return True

    line = " ".join(tokens)
    # Raw string literals throughout: "\)" in a normal string literal is an
    # invalid escape sequence.
    pattern = r".*\(.*" + re.escape(token) + r".*\).*"
    return re.match(pattern, line) is not None
def displayIngredient(ingredient):
    """
    Render a list of (tag, [tokens]) tuples as a string of HTML spans.

    Each tuple becomes one span whose class is the tag and whose text is the
    tokens joined by spaces; the spans are concatenated without a separator.

        displayIngredient([("qty", ["1"]), ("name", ["cat", "pie"])])
        # => <span class='qty'>1</span><span class='name'>cat pie</span>
    """
    spans = []
    for tag, tokens in ingredient:
        text = " ".join(tokens)
        spans.append("<span class='%s'>%s</span>" % (tag, text))
    return "".join(spans)
# HACK: fix this
def smartJoin(words):
    """
    Join words with single spaces, then tidy the spacing around punctuation:
    no space before a comma, none after an opening parenthesis and none
    before a closing parenthesis.
    """
    joined = " ".join(words)
    cleanups = (
        (" , ", ", "),  # "a , b" -> "a, b"
        ("( ", "("),  # "( a"   -> "(a"
        (" )", ")"),  # "a )"   -> "a)"
    )
    for needle, replacement in cleanups:
        joined = joined.replace(needle, replacement)
    return joined
def import_data(lines):
    """
    This thing takes the output of CRF++ and turns it into an actual
    data structure.

    Args:
        lines: The lines of ``crf_test -v 1`` output, which look like:

            # 0.511035
            1/2       I1  L12  NoCAP  X  B-QTY/0.982850
            teaspoon  I2  L12  NoCAP  X  B-UNIT/0.982200
            fresh     I3  L12  NoCAP  X  B-COMMENT/0.716364
            thyme     I4  L12  NoCAP  X  B-NAME/0.816803
            leaves    I5  L12  NoCAP  X  I-NAME/0.960524

            # 0.505999
            Black     I1  L8   YesCAP X  B-NAME/0.765461
            pepper    I2  L8   NoCAP  X  I-NAME/0.756614
            ,         I3  L8   NoCAP  X  OTHER/0.798040

            Columns are tab separated. A blank line ends an ingredient and
            lines starting with "#" are ignored.

    Returns:
        One dict per non-empty ingredient. Each maps a lower-cased tag (e.g.
        "qty", "unit", "name") to its tokens joined by ``smartJoin``, plus
        "display" (HTML markup from ``displayIngredient``) and "input" (the
        re-joined phrase).
    """
    data = [{}]
    display = [[]]
    prevTag = None

    for line in lines:
        # A blank (or whitespace-only) line starts a new ingredient.
        if not line.strip():
            data.append({})
            display.append([])
            prevTag = None

        # Ignore comments.
        elif line[0] == "#":
            pass

        # Otherwise it's a token,
        # e.g.: potato \t I2 \t L5 \t NoCAP \t B-NAME/0.978253
        else:
            columns = re.split("\t", line.strip())

            # Undo the "$" clumping of mixed numbers.
            token = unclump(columns[0].strip())

            # Turn B-NAME/123 back into "name"; the confidence is not used.
            tag, _confidence = re.split(r"/", columns[-1], maxsplit=1)
            tag = re.sub(r"^[BI]\-", "", tag).lower()

            # ---- DISPLAY ----
            # Group consecutive tokens that share a tag, so the original
            # phrase can be rebuilt later.
            if prevTag != tag:
                display[-1].append((tag, [token]))
                prevTag = tag
            else:
                display[-1][-1][1].append(token)

            # ---- DATA ----
            # Group tokens by tag regardless of their position.
            if tag not in data[-1]:
                data[-1][tag] = []

            # HACK: If this token is a unit, singularize it so Scoop accepts it.
            if tag == "unit":
                token = singularize(token)

            data[-1][tag].append(token)

    # Reassemble the output into a list of dicts. ``data`` and ``display`` are
    # walked together so that skipping an empty ingredient can never pair a
    # dict with another ingredient's display data.
    output = []
    for ingredient, markup in zip(data, display):
        if not ingredient:
            continue

        entry = {tag: smartJoin(tokens) for tag, tokens in ingredient.items()}
        # Add the marked-up display data.
        entry["display"] = displayIngredient(markup)
        # Add the raw ingredient phrase.
        entry["input"] = smartJoin([" ".join(tokens) for _tag, tokens in markup])
        output.append(entry)

    return output
def export_data(lines):
    """
    Turn "raw" ingredient lines into the CRF-ready text format.

    Anything that looks like an HTML tag is removed from each line, the rest is
    tokenized, and every token is written on a row of its own followed by its
    feature columns (tab separated). An empty row closes each ingredient.
    """
    rows = []
    for raw_line in lines:
        without_tags = re.sub("<[^<]+?>", "", raw_line)
        tokens = tokenizer.tokenize(without_tags)

        # Token indices in the feature columns are 1-based.
        rows.extend(
            joinLine([token] + getFeatures(token, position, tokens))
            for position, token in enumerate(tokens, start=1)
        )
        rows.append("")

    return "\n".join(rows)

View File

@@ -0,0 +1,55 @@
from abc import ABC, abstractmethod
from fractions import Fraction
from mealie.core.root_logger import get_logger
from mealie.schema.recipe import RecipeIngredient
from mealie.schema.recipe.recipe_ingredient import CreateIngredientFood, CreateIngredientUnit
from .crfpp.processor import CRFIngredient, convert_list_to_crf_model
logger = get_logger(__name__)
class ABCIngredientParser(ABC):
    """
    Interface that every ingredient parser implements.
    """

    @abstractmethod
    def parse(self, ingredients: list[str]) -> list[RecipeIngredient]:
        """Turn the given ingredient strings into RecipeIngredient objects."""
class CRFPPIngredientParser(ABCIngredientParser):
    """
    Ingredient parser backed by the CRF++ model.
    """

    def __init__(self) -> None:
        """Nothing to set up."""

    def _crf_to_ingredient(self, crf_model: CRFIngredient) -> RecipeIngredient:
        """
        Convert one CRF result into a RecipeIngredient.

        The quantity is the sum of the whitespace-separated parts of ``qty``,
        each read as a fraction. If anything goes wrong while building the
        ingredient, the error is logged and an ingredient carrying only the
        original input text as its note is returned instead.
        """
        try:
            return RecipeIngredient(
                title="",
                note=crf_model.comment,
                unit=CreateIngredientUnit(name=crf_model.unit),
                food=CreateIngredientFood(name=crf_model.name),
                disable_amount=False,
                quantity=float(sum([Fraction(part) for part in crf_model.qty.split()])),
            )
        except Exception as e:
            # TODO: Capture some sort of state for the user to see that an exception occurred
            logger.exception(e)

        return RecipeIngredient(
            title="",
            note=crf_model.input,
        )

    def parse(self, ingredients: list[str]) -> list[RecipeIngredient]:
        """Run the ingredient strings through CRF++ and convert every result."""
        converted = []
        for crf_model in convert_list_to_crf_model(ingredients):
            converted.append(self._crf_to_ingredient(crf_model))
        return converted

View File

@@ -0,0 +1,28 @@
from mealie.schema.recipe import RecipeIngredient
from mealie.services._base_http_service.http_services import UserHttpService
from .ingredient_parser import ABCIngredientParser, CRFPPIngredientParser
class IngredientParserService(UserHttpService):
    """
    HTTP service that parses ingredient strings with a pluggable parser.
    """

    def __init__(self, parser: type[ABCIngredientParser] = None, *args, **kwargs) -> None:
        """
        Args:
            parser: The parser class to use. It is called without arguments to
                create the parser; when omitted, CRFPPIngredientParser is used.
            *args: Passed through to UserHttpService.
            **kwargs: Passed through to UserHttpService.
        """
        self.parser: ABCIngredientParser = parser() if parser else CRFPPIngredientParser()
        super().__init__(*args, **kwargs)

    def populate_item(self) -> None:
        """Satisfy abstract method"""
        pass

    def parse_ingredient(self, ingredient: str) -> RecipeIngredient:
        """
        Parse a single ingredient string.

        Returns:
            The parsed ingredient, or None when the parser produced nothing.
        """
        parsed = self.parser.parse([ingredient])

        if parsed:
            return parsed[0]
        # TODO: Raise Exception
        return None

    def parse_ingredients(self, ingredients: list[str]) -> list[RecipeIngredient]:
        """
        Parse several ingredient strings.

        Returns:
            The parsed ingredients. An empty result is returned as an empty
            list, never as None, so that the declared return type holds.
        """
        # TODO: Raise Exception
        return self.parser.parse(ingredients) or []