# mealie/services/parser_services/crfpp/pre_processor.py

import re
import unicodedata

# Maps a unit abbreviation to its spelled-out singular form. Every replacement
# is padded with spaces so the unit stays separated from the quantity before it
# and the text after it.
replace_abbreviations = {
    "cup": " cup ",
    "g": " gram ",
    "kg": " kilogram ",
    "lb": " pound ",
    "ml": " milliliter ",
    "oz": " ounce ",
    "pint": " pint ",
    "qt": " quart ",
    "tbsp": " tablespoon ",
    "tbs": " tablespoon ",  # Order matters! 'tbs' must come after 'tbsp' in case of duplicate matches
    "tsp": " teaspoon ",
}


def replace_common_abbreviations(string: str) -> str:
    """Expand unit abbreviations that directly follow a quantity.

    An abbreviation is only replaced when it is preceded by a digit (with at
    most one whitespace character in between) and ends at a word boundary, so
    ordinary words such as "garlic" or "a cup of tea" are left alone. Both the
    singular and the plural form ("cup"/"cups", "lb"/"lbs") are replaced by
    the singular long form from ``replace_abbreviations``.

    Args:
        string: The ingredient text to process.

    Returns:
        The text with every matched abbreviation replaced. Because the
        replacements are space padded, the result may contain double spaces.
    """
    for k, v in replace_abbreviations.items():
        # The optional plural "s" must be consumed *before* the word boundary:
        # with the boundary first ("cup\bs?") the "s" can never match, so
        # plural units such as "2 cups" were silently skipped.
        regex = rf"(?<=\d)\s?({re.escape(k)}s?\b)"
        string = re.sub(regex, v, string)

    return string


def remove_periods(string: str) -> str:
    """Drop every period that has no digit directly before it and none directly after it.

    Periods touching a digit on either side (e.g. "1.5", "2.", ".5") are kept.
    """
    lone_period = re.compile(r"(?<!\d)\.(?!\d)")
    return lone_period.sub("", string)


def replace_fraction_unicode(string: str) -> str:
    """Replace unicode vulgar-fraction characters with ASCII ``n/d`` text.

    Every character whose unicode name starts with "VULGAR FRACTION" is
    replaced by a space followed by ``numerator/denominator``. All such
    characters in the string are converted, not only the first one found.

    Args:
        string: The ingredient text to process.

    Returns:
        The input unchanged when it contains no vulgar fraction. Otherwise the
        converted text, in which each double space is collapsed to a single one.
    """
    pieces: list[str] = []
    replaced = False

    for c in string:
        # Characters without a unicode name (e.g. control characters) yield ""
        # and are copied through untouched.
        if unicodedata.name(c, "").startswith("VULGAR FRACTION"):
            # NFKC turns the single character into "<numerator>\u2044<denominator>",
            # where \u2044 is FRACTION SLASH.
            normalized = unicodedata.normalize("NFKC", c)
            numerator, _, denominator = normalized.partition("\u2044")
            # The leading space keeps the fraction apart from a preceding whole
            # number, e.g. "1\u00bd" -> "1 1/2".
            pieces.append(f" {numerator}/{denominator}")
            replaced = True
        else:
            pieces.append(c)

    if not replaced:
        return string

    # Collapse the double space produced when the fraction already had a space
    # in front of it.
    return "".join(pieces).replace("  ", " ")


def wrap_or_clause(string: str) -> str:
    """Wrap the alternative introduced by the first " or " in parentheses.

    The parenthesised clause runs from the first " or " up to the next comma,
    or to the end of the string when there is no comma. Everything after that
    comma is kept as is, including any further commas and " or " occurrences.

    Examples:
    '1 teaspoon diamond crystal or 1/2 teaspoon morton kosher salt, plus more'
        -> '1 teaspoon diamond crystal (or 1/2 teaspoon morton kosher salt), plus more'
    'butter or margarine'
        -> 'butter (or margarine)'

    Args:
        string: The ingredient text to process.

    Returns:
        The rewritten text, or the input unchanged when it contains no " or ".
    """
    # TODO: Needs more adequate testing to be sure this doesn't have side effects.
    before, separator, after = string.partition(" or ")
    if not separator:
        # Nothing to wrap; also avoids failing on input without an or-clause.
        return string

    # Split only at the first comma so the remainder keeps its own commas.
    alternative, _, remainder = after.partition(",")

    # When there is no remainder the template leaves a dangling comma, which
    # is removed again here.
    return f"{before} (or {alternative}),{remainder}".strip().removesuffix(",")


def pre_process_string(string: str) -> str:
    """
    Normalizes an ingredient line before it is handed to the CRF++ model. The ideal string looks something like...

    {qty} {unit} {food}, {additional}
    1 tbs. wine, expensive or other white wine, plus more

    The text is lower-cased, then passed through the fraction, period and
    abbreviation clean-up steps in that order. If the result contains " or ",
    the or-clause is wrapped as a final step.
    """
    cleaned = string.lower()

    # Order matters: each step works on the output of the previous one.
    for step in (replace_fraction_unicode, remove_periods, replace_common_abbreviations):
        cleaned = step(cleaned)

    return wrap_or_clause(cleaned) if " or " in cleaned else cleaned
-												update NLP for ingredients

											
										
										
											2021-08-29 17:10:51 -08:00
+								import re
 								import unicodedata
 								replace_abbreviations = {
-												 feat: ✨ Add brute strategy to ingredient processor (#744)

* fix UI column width

* words

* update parser to support diff strats

* add new model url

* make button more visible

* fix nutrition error

* feat(backend): :sparkles: add 'brute' strategy for parsing ingredients

* satisfy linter

* update UI for creation page

* feat(backend): :sparkles: log 422 errors in detail when not in PRODUCTION

* add strategy selector

Co-authored-by: Hayden <hay-kot@pm.me>
											
										
										
											2021-10-16 16:06:13 -08:00
+								    "cup": " cup ",
 								    "g": " gram ",
 								    "kg": " kilogram ",
 								    "lb": " pound ",
 								    "ml": " milliliter ",
 								    "oz": " ounce ",
 								    "pint": " pint ",
 								    "qt": " quart ",
 								    "tbsp": " tablespoon ",
-												docs: fix typos (#1665)

* docs: fix typos

* typos: fix typos found by `codespell` across the codebase

* docs: fix `macOS` spelling

* docs: fix `authentification` terminology

"Authentification" is not a thing.

* docs: fix `localhost` typo in example link

* typos: fix in-code typos

These are potentially higher risk, but no other mentions of these typos
show up in the codebase.
											
										
										
											2022-09-25 23:17:27 +00:00
+								    "tbs": " tablespoon ",  # Order Matters!, 'tsb' must come after 'tbsp' in case of duplicate matches
-												 feat: ✨ Add brute strategy to ingredient processor (#744)

* fix UI column width

* words

* update parser to support diff strats

* add new model url

* make button more visible

* fix nutrition error

* feat(backend): :sparkles: add 'brute' strategy for parsing ingredients

* satisfy linter

* update UI for creation page

* feat(backend): :sparkles: log 422 errors in detail when not in PRODUCTION

* add strategy selector

Co-authored-by: Hayden <hay-kot@pm.me>
											
										
										
											2021-10-16 16:06:13 -08:00
+								    "tsp": " teaspoon ",
-												update NLP for ingredients

											
										
										
											2021-08-29 17:10:51 -08:00
+								}
 								def replace_common_abbreviations(string: str) -> str:
-												 feat: ✨ Add brute strategy to ingredient processor (#744)

* fix UI column width

* words

* update parser to support diff strats

* add new model url

* make button more visible

* fix nutrition error

* feat(backend): :sparkles: add 'brute' strategy for parsing ingredients

* satisfy linter

* update UI for creation page

* feat(backend): :sparkles: log 422 errors in detail when not in PRODUCTION

* add strategy selector

Co-authored-by: Hayden <hay-kot@pm.me>
											
										
										
											2021-10-16 16:06:13 -08:00
-												update NLP for ingredients

											
										
										
											2021-08-29 17:10:51 -08:00
+								    for k, v in replace_abbreviations.items():
-												 feat(backend): ✨ Minor linting, bulk URL import, and improve BG tasks (#760)

* Fixes #751

* Fixes not showing original URL

* start slice at 0 instead of 1

* remove print statements

* add linter for print statements and remove print

* hide all buttons when edit disabled

* add bulk import API

* update attribute bindings

* unify button styles

* bulk add recipe feature

* thanks linter!

* uncomment code

Co-authored-by: Hayden <hay-kot@pm.me>
											
										
										
											2021-10-28 19:28:33 -08:00
+								        regex = rf"(?<=\d)\s?({k}\bs?)"
-												 feat: ✨ Add brute strategy to ingredient processor (#744)

* fix UI column width

* words

* update parser to support diff strats

* add new model url

* make button more visible

* fix nutrition error

* feat(backend): :sparkles: add 'brute' strategy for parsing ingredients

* satisfy linter

* update UI for creation page

* feat(backend): :sparkles: log 422 errors in detail when not in PRODUCTION

* add strategy selector

Co-authored-by: Hayden <hay-kot@pm.me>
											
										
										
											2021-10-16 16:06:13 -08:00
+								        string = re.sub(regex, v, string)
-												update NLP for ingredients

											
										
										
											2021-08-29 17:10:51 -08:00
 								    return string
 								def remove_periods(string: str) -> str:
 								    """Removes periods not sournded by digets"""
 								    return re.sub(r"(?<!\d)\.(?!\d)", "", string)
 								def replace_fraction_unicode(string: str):
 								    # TODO: I'm not confident this works well enough for production needs some testing and/or refacorting
 								    # TODO: Breaks on multiple unicode fractions
 								    for c in string:
 								        try:
 								            name = unicodedata.name(c)
 								        except ValueError:
 								            continue
 								        if name.startswith("VULGAR FRACTION"):
 								            normalized = unicodedata.normalize("NFKC", c)
-												Feature/CRF++ and server side locales (#731)

* add universal toast plugin

* add server side locales

* integrate CRF++ into CI/CD Pipeline

* docs(docs): :memo: add recipe parser docs

* feat(backend): :sparkles: Continued work on ingredient parsers

* add new model dest

* feat(frontend): :sparkles: New ingredient parser page

* formatting

Co-authored-by: Hayden <hay-kot@pm.me>
											
										
										
											2021-10-09 13:08:23 -08:00
+								            numerator, _, denominator = normalized.partition("⁄")  # _ = slash
-												update NLP for ingredients

											
										
										
											2021-08-29 17:10:51 -08:00
+								            text = f" {numerator}/{denominator}"
 								            return string.replace(c, text).replace("  ", " ")
 								    return string
 								def wrap_or_clause(string: str):
 								    """
 								    Attempts to wrap or clauses in ()
 								    Examples:
-												chores: updates-and-linters (#1868)

* switch to ruff

* add ruff

* run ruff --fix

* update ruff

* resolve ruff errors

* drop isort from CI

* fix decorator order
											
										
										
											2022-11-30 20:20:28 -09:00
+								    '1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt, plus more'
 								        -> '1 teaspoon diamond crystal (or 1/2 teaspoon morton kosher salt), plus more'
-												update NLP for ingredients

											
										
										
											2021-08-29 17:10:51 -08:00
 								    """
-												docs: fix typos (#1665)

* docs: fix typos

* typos: fix typos found by `codespell` across the codebase

* docs: fix `macOS` spelling

* docs: fix `authentification` terminology

"Authentification" is not a thing.

* docs: fix `localhost` typo in example link

* typos: fix in-code typos

These are potentially higher risk, but no other mentions of these typos
show up in the codebase.
											
										
										
											2022-09-25 23:17:27 +00:00
+								    # TODO: Needs more adequite testing to be sure this doesn't have side effects.
-												update NLP for ingredients

											
										
										
											2021-08-29 17:10:51 -08:00
+								    split_by_or = string.split(" or ")
 								    split_by_comma = split_by_or[1].split(",")
 								    if len(split_by_comma) > 0:
 								        return f"{split_by_or[0]} (or {split_by_comma[0]}),{''.join(split_by_comma[1:])}".strip().removesuffix(",")
 								    return string
 								def pre_process_string(string: str) -> str:
 								    """
 								    Series of preprocessing functions to make best use of the CRF++ model. The ideal string looks something like...
 								    {qty} {unit} {food}, {additional}
 tbs. wine, expensive or other white wine, plus more
 								    """
 								    string = string.lower()
 								    string = replace_fraction_unicode(string)
 								    string = remove_periods(string)
 								    string = replace_common_abbreviations(string)
 								    if " or " in string:
 								        string = wrap_or_clause(string)
 								    return string