feat: Migrate from CRF++ to Ingredient Parser (a Python package) (#5061)

Michael Genson
2025-02-28 08:17:28 -06:00
committed by GitHub
parent ec1a9d78ac
commit b12aea8272
19 changed files with 367 additions and 592 deletions
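
The new dependency exposes a single parse_ingredient() entry point; the rewritten NLPParser in this commit consumes the dataclasses it returns (sentence, name, amount, size, preparation, comment, each carrying a confidence score). A minimal sketch of that surface, assuming the library version pinned by this PR and an invented ingredient string:

from ingredient_parser import parse_ingredient

# Invented example input; any free-text ingredient line works the same way.
parsed = parse_ingredient("2 tablespoons fresh thyme, finely chopped")

print(parsed.sentence)                 # the original input string
if parsed.name:                        # parsed food name plus model confidence
    print(parsed.name.text, parsed.name.confidence)
for amount in parsed.amount:           # zero or more quantity/unit amounts
    print(amount.quantity, amount.unit, amount.confidence)
if parsed.comment:                     # trailing notes such as "finely chopped"
    print(parsed.comment.text, parsed.comment.confidence)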

View File

@@ -250,7 +250,7 @@ class RepositoryGeneric(Generic[Schema, Model]):
match_key = match_key or self.primary_key
result = self._query_one(value, match_key)
results_as_model = self.schema.model_validate(result)
result_as_model = self.schema.model_validate(result)
try:
self.session.delete(result)
@@ -259,10 +259,10 @@ class RepositoryGeneric(Generic[Schema, Model]):
self.session.rollback()
raise e
return results_as_model
return result_as_model
def delete_many(self, values: Iterable) -> Schema:
query = self._query().filter(self.model.id.in_(values)) # type: ignore
def delete_many(self, values: Iterable) -> list[Schema]:
query = self._query().filter(self.model.id.in_(values))
results = self.session.execute(query).unique().scalars().all()
results_as_model = [self.schema.model_validate(result) for result in results]
@@ -277,7 +277,7 @@ class RepositoryGeneric(Generic[Schema, Model]):
self.session.rollback()
raise e
return results_as_model # type: ignore
return results_as_model
def delete_all(self) -> None:
delete(self.model)

View File

@@ -1,5 +1,5 @@
import re as re
from collections.abc import Sequence
from collections.abc import Iterable, Sequence
from random import randint
from typing import Self, cast
from uuid import UUID
@@ -103,6 +103,51 @@ class RepositoryRecipes(HouseholdRepositoryGeneric[Recipe, RecipeModel]):
if i >= max_retries:
raise
def _delete_recipe(self, recipe: RecipeModel) -> Recipe:
recipe_as_model = self.schema.model_validate(recipe)
# first remove UserToRecipe entries so we don't run into stale data errors
try:
user_to_recipe_delete_query = sa.delete(UserToRecipe).where(UserToRecipe.recipe_id == recipe.id)
self.session.execute(user_to_recipe_delete_query)
self.session.commit()
except Exception:
self.session.rollback()
raise
# remove the recipe
try:
self.session.delete(recipe)
self.session.commit()
except Exception:
self.session.rollback()
raise
return recipe_as_model
def delete(self, value, match_key: str | None = None) -> Recipe:
match_key = match_key or self.primary_key
recipe_in_db = self._query_one(value, match_key)
return self._delete_recipe(recipe_in_db)
def delete_many(self, values: Iterable) -> list[Recipe]:
query = self._query().filter(self.model.id.in_(values))
recipes_in_db = self.session.execute(query).unique().scalars().all()
results: list[Recipe] = []
# we create a delete statement for each row
# we don't delete the whole query in one statement because postgres doesn't cascade correctly
for recipe_in_db in recipes_in_db:
results.append(self._delete_recipe(recipe_in_db))
try:
self.session.commit()
except Exception as e:
self.session.rollback()
raise e
return results
def update_image(self, slug: str, _: str | None = None) -> int:
entry: RecipeModel = self._query_one(match_value=slug)
entry.image = randint(0, 255)
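
The comment in delete_many above points at a general SQLAlchemy behaviour worth spelling out: relationship cascades configured on the ORM only run when objects are deleted through session.delete(), while a single bulk DELETE statement bypasses them, leaving orphan cleanup entirely to database-level ON DELETE rules. A self-contained sketch with toy models loosely named after Mealie's (not the real mappings):

import sqlalchemy as sa
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column, relationship

class Base(DeclarativeBase):
    pass

class Recipe(Base):
    __tablename__ = "recipe"
    id: Mapped[int] = mapped_column(primary_key=True)
    # ORM-level cascade: only applied when deleting through the session.
    links: Mapped[list["UserToRecipe"]] = relationship(cascade="all, delete-orphan")

class UserToRecipe(Base):
    __tablename__ = "user_to_recipe"
    id: Mapped[int] = mapped_column(primary_key=True)
    recipe_id: Mapped[int] = mapped_column(sa.ForeignKey("recipe.id"))

engine = sa.create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add(Recipe(id=1, links=[UserToRecipe(id=1)]))
    session.commit()

    # ORM path: the cascade fires and the linked UserToRecipe row goes away too.
    session.delete(session.get(Recipe, 1))
    session.commit()

    # Core path: a bulk statement like this skips ORM cascades entirely, so
    # child rows survive unless the FK itself declares ON DELETE CASCADE.
    session.execute(sa.delete(Recipe).where(Recipe.id.in_([1])))
    session.commit()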

View File

@@ -1,21 +0,0 @@
import requests
from mealie.services.parser_services import crfpp
MODEL_URL = "https://github.com/mealie-recipes/nlp-model/releases/download/v1.0.0/model.crfmodel"
def main():
"""
Install the model into the crfpp directory
"""
r = requests.get(MODEL_URL, stream=True, allow_redirects=True)
with open(crfpp.MODEL_PATH, "wb") as f:
for chunk in r.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
if __name__ == "__main__":
main()

View File

@@ -1 +0,0 @@
from .processor import *

View File

@@ -1,69 +0,0 @@
import re
from mealie.services.parser_services.parser_utils import convert_vulgar_fractions_to_regular_fractions
replace_abbreviations = {
"cup": " cup ",
"g": " gram ",
"kg": " kilogram ",
"lb": " pound ",
"ml": " milliliter ",
"oz": " ounce ",
"pint": " pint ",
"qt": " quart ",
"tbsp": " tablespoon ",
"tbs": " tablespoon ", # Order Matters!, 'tsb' must come after 'tbsp' in case of duplicate matches
"tsp": " teaspoon ",
}
def replace_common_abbreviations(string: str) -> str:
for k, v in replace_abbreviations.items():
regex = rf"(?<=\d)\s?({k}\bs?)"
string = re.sub(regex, v, string)
return string
def remove_periods(string: str) -> str:
"""Removes periods not sournded by digets"""
return re.sub(r"(?<!\d)\.(?!\d)", "", string)
def wrap_or_clause(string: str):
"""
Attempts to wrap or clauses in ()
Examples:
'1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt, plus more'
-> '1 teaspoon diamond crystal (or 1/2 teaspoon morton kosher salt), plus more'
"""
# TODO: Needs more adequate testing to be sure this doesn't have side effects.
split_by_or = string.split(" or ")
split_by_comma = split_by_or[1].split(",")
if len(split_by_comma) > 0:
return f"{split_by_or[0]} (or {split_by_comma[0]}),{''.join(split_by_comma[1:])}".strip().removesuffix(",")
return string
def pre_process_string(string: str) -> str:
"""
Series of preprocessing functions to make best use of the CRF++ model. The ideal string looks something like...
{qty} {unit} {food}, {additional}
1 tbs. wine, expensive or other white wine, plus more
"""
string = string.lower()
string = convert_vulgar_fractions_to_regular_fractions(string)
string = remove_periods(string)
string = replace_common_abbreviations(string)
if " or " in string:
string = wrap_or_clause(string)
return string
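
As a quick illustration of the lookarounds in remove_periods above (reproduced here so the example runs on its own): decimal points between digits survive, abbreviation periods do not.

import re

def remove_periods(string: str) -> str:
    """Removes periods not surrounded by digits."""
    return re.sub(r"(?<!\d)\.(?!\d)", "", string)

# "1.5" keeps its decimal point; the periods after "oz." and "ext." are stripped.
assert remove_periods("1.5 oz. vanilla ext.") == "1.5 oz vanilla ext"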

View File

@@ -1,63 +0,0 @@
import os
import subprocess
import tempfile
from fractions import Fraction
from pathlib import Path
from typing import Annotated
from pydantic import BaseModel, Field, field_validator
from pydantic_core.core_schema import ValidationInfo
from mealie.schema._mealie.types import NoneFloat
from . import utils
from .pre_processor import pre_process_string
CWD = Path(__file__).parent
MODEL_PATH = os.getenv("CRF_MODEL_PATH", default=CWD / "model.crfmodel")
class CRFConfidence(BaseModel):
average: float = 0.0
comment: NoneFloat = None
name: NoneFloat = None
unit: NoneFloat = None
qty: Annotated[NoneFloat, Field(validate_default=True)] = None
class CRFIngredient(BaseModel):
input: str = ""
name: str = ""
other: str = ""
qty: Annotated[str, Field(validate_default=True)] = ""
comment: str = ""
unit: str = ""
confidence: CRFConfidence
@field_validator("qty", mode="before")
def validate_qty(cls, qty, info: ValidationInfo):
if qty is not None and qty != "":
return qty
# Check if other contains a fraction
try:
if info.data["other"] is not None and info.data["other"].find("/") != -1:
return str(round(float(Fraction(info.data["other"])), 3))
else:
return "0"
except Exception:
return ""
def _exec_crf_test(input_text):
with tempfile.NamedTemporaryFile(mode="w") as input_file:
input_file.write(utils.export_data(input_text))
input_file.flush()
return subprocess.check_output(["crf_test", "--verbose=1", "--model", MODEL_PATH, input_file.name]).decode(
"utf-8"
)
def convert_list_to_crf_model(list_of_ingredient_text: list[str]):
crf_output = _exec_crf_test([pre_process_string(x) for x in list_of_ingredient_text])
return [CRFIngredient(**ingredient) for ingredient in utils.import_data(crf_output.split("\n"))]

View File

@@ -1,38 +0,0 @@
import re
def clumpFractions(s):
"""
Replaces the whitespace between the integer and fractional part of a quantity
with a dollar sign, so it's interpreted as a single token. The rest of the
string is left alone.
clumpFractions("aaa 1 2/3 bbb")
# => "aaa 1$2/3 bbb"
"""
return re.sub(r"(\d+)\s+(\d)/(\d)", r"\1$\2/\3", s)
def tokenize(s):
"""
Tokenize on parentheses, punctuation, spaces and American units followed by a slash.
We sometimes give American units and metric units for baking recipes. For example:
* 2 tablespoons/30 milliliters milk or cream
* 2 1/2 cups/300 grams all-purpose flour
The recipe database only allows for one unit, and we want to use the American one.
But we must split the text on "cups/" etc. in order to pick it up.
"""
# handle abbreviation like "100g" by treating it as "100 grams"
s = re.sub(r"(\d+)g", r"\1 grams", s)
s = re.sub(r"(\d+)oz", r"\1 ounces", s)
s = re.sub(r"(\d+)ml", r"\1 milliliters", s, flags=re.IGNORECASE)
# TODO: Replace american_units with list of units from database?
american_units = ["cup", "tablespoon", "teaspoon", "pound", "ounce", "quart", "pint"]
# The following removes slashes following American units and replaces them with a space.
for unit in american_units:
s = s.replace(unit + "/", unit + " ")
s = s.replace(unit + "s/", unit + "s ")
return [token.strip() for token in re.split(r"([,()\s]{1})", clumpFractions(s)) if token and token.strip()]
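
To make the docstring's second example concrete, appending a check like the following to the module above would hold: the slash after "cups" is split off, the mixed number is clumped with "$", and whitespace is dropped.

# Illustrative trace of tokenize() on the docstring's second example.
assert tokenize("2 1/2 cups/300 grams all-purpose flour") == [
    "2$1/2", "cups", "300", "grams", "all-purpose", "flour"
]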

View File

@@ -1,266 +0,0 @@
import re
from statistics import mean
from . import tokenizer
def joinLine(columns):
return "\t".join(columns)
def unclump(s):
"""
Replaces $'s with spaces. The reverse of clumpFractions.
"""
return re.sub(r"\$", " ", s)
def getFeatures(token, index, tokens):
"""
Returns a list of features for a given token.
"""
length = len(tokens)
return [
f"I{index}",
f"L{lengthGroup(length)}",
f"{'Yes' if isCapitalized(token) else 'No'}CAP",
f"{'Yes' if insideParenthesis(token, tokens) else 'No'}PAREN",
]
def singularize(word):
"""
A poor replacement for the pattern.en singularize function, but ok for now.
"""
units = {
"cups": "cup",
"tablespoons": "tablespoon",
"teaspoons": "teaspoon",
"pounds": "pound",
"ounces": "ounce",
"cloves": "clove",
"sprigs": "sprig",
"pinches": "pinch",
"bunches": "bunch",
"slices": "slice",
"grams": "gram",
"heads": "head",
"quarts": "quart",
"stalks": "stalk",
"pints": "pint",
"pieces": "piece",
"sticks": "stick",
"dashes": "dash",
"fillets": "fillet",
"cans": "can",
"ears": "ear",
"packages": "package",
"strips": "strip",
"bulbs": "bulb",
"bottles": "bottle",
}
if word in units.keys():
return units[word]
else:
return word
def isCapitalized(token):
"""
Returns true if a given token starts with a capital letter.
"""
return re.match(r"^[A-Z]", token) is not None
def lengthGroup(actualLength):
"""
Buckets the length of the ingredient into 6 buckets.
"""
for n in [4, 8, 12, 16, 20]:
if actualLength < n:
return str(n)
return "X"
def insideParenthesis(token, tokens):
"""
Returns true if the word is inside parenthesis in the phrase.
"""
if token in ["(", ")"]:
return True
else:
line = " ".join(tokens)
return (
re.match(r".*\(.*" + re.escape(token) + r".*\).*", line) is not None # - invalid dscape sequence
)
def displayIngredient(ingredient):
"""
Format a list of (tag, [tokens]) tuples as an HTML string for display.
displayIngredient([("qty", ["1"]), ("name", ["cat", "pie"])])
# => <span class='qty'>1</span> <span class='name'>cat pie</span>
"""
return "".join(["<span class='{}'>{}</span>".format(tag, " ".join(tokens)) for tag, tokens in ingredient])
# HACK: fix this
def smartJoin(words):
"""
Joins list of words with spaces, but is smart about not adding spaces
before commas.
"""
input = " ".join(words)
# replace " , " with ", "
input = input.replace(" , ", ", ")
# replace " ( " with " ("
input = input.replace("( ", "(")
# replace " ) " with ") "
input = input.replace(" )", ")")
return input
def import_data(lines):
"""
This thing takes the output of CRF++ and turns it into an actual
data structure.
"""
data = [{}]
display = [[]]
prevTag = None
confidence_all = [{}]
#
# iterate lines in the data file, which looks like:
#
# # 0.511035
# 1/2 I1 L12 NoCAP X B-QTY/0.982850
# teaspoon I2 L12 NoCAP X B-UNIT/0.982200
# fresh I3 L12 NoCAP X B-COMMENT/0.716364
# thyme I4 L12 NoCAP X B-NAME/0.816803
# leaves I5 L12 NoCAP X I-NAME/0.960524
# , I6 L12 NoCAP X B-COMMENT/0.772231
# finely I7 L12 NoCAP X I-COMMENT/0.825956
# chopped I8 L12 NoCAP X I-COMMENT/0.893379
#
# # 0.505999
# Black I1 L8 YesCAP X B-NAME/0.765461
# pepper I2 L8 NoCAP X I-NAME/0.756614
# , I3 L8 NoCAP X OTHER/0.798040
# to I4 L8 NoCAP X B-COMMENT/0.683089
# taste I5 L8 NoCAP X I-COMMENT/0.848617
#
# i.e. the output of crf_test -v 1
#
for line in lines:
# blank line starts a new ingredient
if line in ("", "\n"):
data.append({})
display.append([])
prevTag = None
confidence_all.append({})
# ignore comments
elif line[0] == "#":
pass
# otherwise it's a token
# e.g.: potato \t I2 \t L5 \t NoCAP \t B-NAME/0.978253
else:
columns = re.split("\t", line.strip())
token = columns[0].strip()
# unclump fractions
token = unclump(token)
# turn B-NAME/123 back into "name"
tag, confidence = re.split(r"/", columns[-1], maxsplit=1)
tag = re.sub(r"^[BI]\-", "", tag).lower() # - invalid dscape sequence
# ====================
# Confidence Getter
if prevTag != tag:
if confidence_all[-1].get(tag):
confidence_all[-1][tag].append(confidence)
else:
confidence_all[-1][tag] = [confidence]
else:
if confidence_all[-1].get(tag):
confidence_all[-1][tag].append(confidence)
else:
confidence_all[-1][tag] = [confidence]
# ---- DISPLAY ----
# build a structure which groups each token by its tag, so we can
# rebuild the original display name later.
if prevTag != tag:
display[-1].append((tag, [token]))
prevTag = tag
else:
display[-1][-1][1].append(token)
# ^- token
# ^---- tag
# ^-------- ingredient
# ---- DATA ----
# build a dict grouping tokens by their tag
# initialize this attribute if this is the first token of its kind
if tag not in data[-1]:
data[-1][tag] = []
# HACK: If this token is a unit, singularize it so Scoop accepts it.
if tag == "unit":
token = singularize(token)
data[-1][tag].append(token)
# reassemble the output into a list of dicts.
output = [{k: smartJoin(tokens) for k, tokens in ingredient.items()} for ingredient in data if len(ingredient)]
# Preclean Confidence
for i, c in enumerate(confidence_all):
avg_of_all = []
for k, v in c.items():
v = [float(x) for x in v]
avg = round(mean(v), 2)
avg_of_all.append(avg)
confidence_all[i][k] = avg
if avg_of_all:
confidence_all[i]["average"] = round(mean(avg_of_all), 2)
# Add the raw ingredient phrase
for i, _ in enumerate(output):
output[i]["input"] = smartJoin([" ".join(tokens) for _, tokens in display[i]])
output[i]["confidence"] = confidence_all[i]
return output
def export_data(lines):
"""Parse "raw" ingredient lines into CRF-ready output"""
output = []
for line in lines:
line_clean = re.sub("<[^<]+?>", "", line)
tokens = tokenizer.tokenize(line_clean)
for i, token in enumerate(tokens):
features = getFeatures(token, i + 1, tokens)
output.append(joinLine([token, *features]))
output.append("")
return "\n".join(output)

View File

@@ -1,12 +1,12 @@
from fractions import Fraction
from ingredient_parser import parse_ingredient
from ingredient_parser.dataclasses import CompositeIngredientAmount, IngredientAmount
from ingredient_parser.dataclasses import ParsedIngredient as IngredientParserParsedIngredient
from pydantic import UUID4
from sqlalchemy.orm import Session
from mealie.core.root_logger import get_logger
from mealie.schema.recipe import RecipeIngredient
from mealie.schema.recipe.recipe_ingredient import (
MAX_INGREDIENT_DENOMINATOR,
CreateIngredientFood,
CreateIngredientUnit,
IngredientConfidence,
@@ -14,8 +14,9 @@ from mealie.schema.recipe.recipe_ingredient import (
RegisteredParser,
)
from . import brute, crfpp, openai
from . import brute, openai
from ._base import ABCIngredientParser
from .parser_utils import extract_quantity_from_string
logger = get_logger(__name__)
@@ -47,50 +48,110 @@ class BruteForceParser(ABCIngredientParser):
class NLPParser(ABCIngredientParser):
"""
Class for CRFPP ingredient parsers.
Class for Ingredient Parser library
"""
def _crf_to_ingredient(self, crf_model: crfpp.CRFIngredient) -> ParsedIngredient:
ingredient = None
@staticmethod
def _extract_amount(ingredient: IngredientParserParsedIngredient) -> IngredientAmount:
if not (ingredient_amounts := ingredient.amount):
return IngredientAmount(quantity=0, quantity_max=0, unit="", text="", confidence=0, starting_index=-1)
try:
ingredient = RecipeIngredient(
title="",
note=crf_model.comment,
unit=CreateIngredientUnit(name=crf_model.unit),
food=CreateIngredientFood(name=crf_model.name),
disable_amount=False,
quantity=float(
sum(Fraction(s).limit_denominator(MAX_INGREDIENT_DENOMINATOR) for s in crf_model.qty.split())
),
)
except Exception as e:
logger.error(f"Failed to parse ingredient: {crf_model}: {e}")
# TODO: Capture some sort of state for the user to see that an exception occurred
ingredient = RecipeIngredient(
title="",
note=crf_model.input,
)
ingredient_amount = ingredient_amounts[0]
if isinstance(ingredient_amount, CompositeIngredientAmount):
ingredient_amount = ingredient_amount.amounts[0]
return ingredient_amount
@staticmethod
def _extract_quantity(ingredient_amount: IngredientAmount) -> tuple[float, float]:
confidence = ingredient_amount.confidence
if isinstance(ingredient_amount.quantity, str):
return extract_quantity_from_string(ingredient_amount.quantity)[0], confidence
else:
try:
return float(ingredient_amount.quantity), confidence
except ValueError:
return 0, 0
@staticmethod
def _extract_unit(ingredient_amount: IngredientAmount) -> tuple[str, float]:
confidence = ingredient_amount.confidence
unit = str(ingredient_amount.unit) if ingredient_amount.unit else ""
return unit, confidence
@staticmethod
def _extract_food(ingredient: IngredientParserParsedIngredient) -> tuple[str, float]:
confidence = ingredient.name.confidence if ingredient.name else 0
food = str(ingredient.name.text) if ingredient.name else ""
return food, confidence
@staticmethod
def _extract_note(ingredient: IngredientParserParsedIngredient) -> tuple[str, float]:
confidences: list[float] = []
note_parts: list[str] = []
if ingredient.size:
note_parts.append(ingredient.size.text)
confidences.append(ingredient.size.confidence)
if ingredient.preparation:
note_parts.append(ingredient.preparation.text)
confidences.append(ingredient.preparation.confidence)
if ingredient.comment:
note_parts.append(ingredient.comment.text)
confidences.append(ingredient.comment.confidence)
# average confidence among all note parts
confidence = sum(confidences) / len(confidences) if confidences else 0
note = ", ".join(note_parts)
note = note.replace("(", "").replace(")", "")
return note, confidence
def _convert_ingredient(self, ingredient: IngredientParserParsedIngredient) -> ParsedIngredient:
ingredient_amount = self._extract_amount(ingredient)
qty, qty_conf = self._extract_quantity(ingredient_amount)
unit, unit_conf = self._extract_unit(ingredient_amount)
food, food_conf = self._extract_food(ingredient)
note, note_conf = self._extract_note(ingredient)
# average confidence for components which were parsed
confidences: list[float] = []
if qty:
confidences.append(qty_conf)
if unit:
confidences.append(unit_conf)
if food:
confidences.append(food_conf)
if note:
confidences.append(note_conf)
parsed_ingredient = ParsedIngredient(
input=crf_model.input,
ingredient=ingredient,
input=ingredient.sentence,
confidence=IngredientConfidence(
quantity=crf_model.confidence.qty,
food=crf_model.confidence.name,
**crf_model.confidence.model_dump(),
average=(sum(confidences) / len(confidences)) if confidences else 0,
quantity=qty_conf,
unit=unit_conf,
food=food_conf,
comment=note_conf,
),
ingredient=RecipeIngredient(
title="",
quantity=qty,
unit=CreateIngredientUnit(name=unit) if unit else None,
food=CreateIngredientFood(name=food) if food else None,
disable_amount=False,
note=note,
),
)
return self.find_ingredient_match(parsed_ingredient)
async def parse(self, ingredients: list[str]) -> list[ParsedIngredient]:
crf_models = crfpp.convert_list_to_crf_model(ingredients)
return [self._crf_to_ingredient(crf_model) for crf_model in crf_models]
async def parse_one(self, ingredient_string: str) -> ParsedIngredient:
items = await self.parse([ingredient_string])
return items[0]
parsed_ingredient = parse_ingredient(ingredient_string)
return self._convert_ingredient(parsed_ingredient)
async def parse(self, ingredients: list[str]) -> list[ParsedIngredient]:
return [await self.parse_one(ingredient) for ingredient in ingredients]
__registrar: dict[RegisteredParser, type[ABCIngredientParser]] = {
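
Finally, a worked trace of the note handling added above, with invented confidences: if the parser returns size "large" at 0.90, preparation "finely chopped" at 0.80 and comment "(about 2)" at 0.70, _extract_note joins the parts and strips parentheses, and the averaged note confidence then feeds the overall IngredientConfidence average alongside any quantity, unit and food confidences.

# Invented values, purely to trace the arithmetic in _extract_note().
parts = [("large", 0.90), ("finely chopped", 0.80), ("(about 2)", 0.70)]
note = ", ".join(text for text, _ in parts).replace("(", "").replace(")", "")
confidence = sum(conf for _, conf in parts) / len(parts)
assert note == "large, finely chopped, about 2"
assert round(confidence, 2) == 0.8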