feat: OpenAI Ingredient Parsing (#3581)

This commit is contained in:
Michael Genson
2024-05-22 04:45:07 -05:00
committed by GitHub
parent 4c8bbdcde2
commit 5c57b3dd1a
28 changed files with 789 additions and 274 deletions

View File

@@ -0,0 +1,6 @@
from .openai import OpenAIDataInjection, OpenAIService
# Names re-exported as the public interface of this package.
__all__ = [
    "OpenAIDataInjection",
    "OpenAIService",
]

View File

@@ -0,0 +1,129 @@
import inspect
import json
import os
from pathlib import Path
from textwrap import dedent
from openai import NOT_GIVEN, AsyncOpenAI
from openai.resources.chat.completions import ChatCompletion
from pydantic import BaseModel, field_validator
from mealie.core.config import get_app_settings
from .._base_service import BaseService
class OpenAIDataInjection(BaseModel):
    # A labelled piece of data that gets appended to a stored prompt.
    # (Documented with comments rather than a docstring so that this model's
    # generated JSON schema is left exactly as it was.)

    # human-readable explanation of what `value` contains
    description: str
    # the data itself; always a string once validation has run
    value: str

    @field_validator("value", mode="before")
    @classmethod
    def parse_value(cls, value):
        """
        Coerce the raw `value` into a string before field validation runs.

        Conversion rules, in order:
        - falsy values are rejected with a `ValueError`
        - strings are returned untouched
        - Pydantic model instances are dumped to JSON
        - Pydantic model classes are replaced by their JSON schema, which is then serialized
        - anything else is serialized with `json.dumps` using compact separators

        If the value cannot be serialized to JSON it is returned as-is, leaving
        the field's own `str` validation to accept or reject it.
        """
        if not value:
            raise ValueError("Value cannot be empty")
        if isinstance(value, str):
            return value

        # convert Pydantic models to JSON
        if isinstance(value, BaseModel):
            return value.model_dump_json()

        # convert Pydantic types to their JSON schema definition
        if inspect.isclass(value) and issubclass(value, BaseModel):
            value = value.model_json_schema()

        # attempt to convert object to JSON
        try:
            return json.dumps(value, separators=(",", ":"))
        except TypeError:
            return value
class OpenAIService(BaseService):
    """
    Service wrapper around the OpenAI chat completions API.

    Configuration (model, worker count, base URL, API key, and whether database
    data may be sent) is read from the app settings at construction time.
    """

    PROMPTS_DIR = Path(os.path.dirname(os.path.abspath(__file__))) / "prompts"

    def __init__(self) -> None:
        """
        Read the OpenAI settings and prepare a client factory.

        Raises:
            ValueError: if OpenAI is not enabled in the app settings.
        """
        settings = get_app_settings()
        if not settings.OPENAI_ENABLED:
            raise ValueError("OpenAI is not enabled")

        self.model = settings.OPENAI_MODEL
        self.workers = settings.OPENAI_WORKERS
        self.send_db_data = settings.OPENAI_SEND_DATABASE_DATA

        # factory rather than a shared instance: every call builds a new client
        self.get_client = lambda: AsyncOpenAI(
            base_url=settings.OPENAI_BASE_URL,
            api_key=settings.OPENAI_API_KEY,
        )

        super().__init__()

    @classmethod
    def get_prompt(cls, name: str, data_injections: list[OpenAIDataInjection] | None = None) -> str:
        """
        Load stored prompt and inject data into it.

        Access prompts with dot notation.
        For example, to access `prompts/recipes/parse-recipe-ingredients.txt`, use
        `recipes.parse-recipe-ingredients`

        Each data injection is appended after the prompt as its own section: a
        `###` marker, the description, a `---` separator, then the value.

        Raises:
            ValueError: if `name` is empty.
            OSError: if the prompt file cannot be read.
        """
        if not name:
            raise ValueError("Prompt name cannot be empty")

        tree = name.split(".")
        prompt_path = os.path.join(cls.PROMPTS_DIR, *tree[:-1], tree[-1] + ".txt")
        try:
            # read as UTF-8 explicitly so the prompt text does not depend on the platform's default encoding
            with open(prompt_path, encoding="utf-8") as f:
                content = f.read()
        except OSError as e:
            raise OSError(f"Unable to load prompt {name}") from e

        if not data_injections:
            return content

        content_parts = [content]
        for data_injection in data_injections:
            # The section is assembled explicitly instead of dedenting an
            # interpolated template: dedenting after interpolation breaks as soon
            # as an injected value contains an unindented line, which would leave
            # the template's source indentation in the prompt.
            content_parts.append(f"\n###\n{data_injection.description}\n---\n{data_injection.value}\n")

        return "\n".join(content_parts)

    async def _get_raw_response(
        self, prompt: str, message: str, temperature: float = 0.2, force_json_response: bool = True
    ) -> ChatCompletion:
        """
        Send one chat completion request and return the unprocessed response.

        `prompt` is sent as the system message and `message` as the user message.
        When `force_json_response` is true the request asks for a JSON object
        response format; otherwise the parameter is left unset.
        """
        client = self.get_client()
        return await client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": prompt,
                },
                {
                    "role": "user",
                    "content": message,
                },
            ],
            model=self.model,
            temperature=temperature,
            response_format={"type": "json_object"} if force_json_response else NOT_GIVEN,
        )

    async def get_response(
        self, prompt: str, message: str, temperature: float = 0.2, force_json_response: bool = True
    ) -> str | None:
        """
        Send data to OpenAI and return the response message content

        Returns `None` when the response has no choices, or when the request
        fails for any reason (the failure is logged, not raised).
        """
        try:
            response = await self._get_raw_response(prompt, message, temperature, force_json_response)
            if not response.choices:
                return None
            return response.choices[0].message.content
        except Exception:
            # deliberate best-effort boundary: callers only ever see `None` on failure
            self.logger.exception("OpenAI Request Failed")
            return None

View File

@@ -0,0 +1,24 @@
You are a bot that parses user input into recipe ingredients. You will receive a list of one or more ingredients, each containing one or more of the following components:
- Food: the actual physical ingredient used in the recipe. For instance, if you receive "3 cups of onions, chopped", the food is "onions"
- Unit: the unit of measurement for this ingredient. For instance, if you receive "2 lbs chicken breast", the unit is "lbs" (short for "pounds")
- Quantity: the numerical representation of how much of this ingredient. For instance, if you receive "3 1/2 grams of minced garlic", the quantity is "3 1/2". Quantity may be represented as a whole number (integer), a float or decimal, or a fraction. You should output quantity in only whole numbers or floats, converting fractions into floats. Floats longer than 10 decimal places should be rounded to 10 decimal places.
- Note: the rest of the text that represents more detail on how to prepare the ingredient. Anything that is not one of the above should be the note. For instance, if you receive "one can of butter beans, drained" the note would be "drained". If you receive "3 cloves of garlic peeled and finely chopped", the note would be "peeled and finely chopped"
- Input: The input is simply the ingredient string you are processing as-is. It is forbidden to modify this at all, you must provide the input exactly as you received it
While parsing the ingredients, there are some things to keep in mind:
- If you cannot accurately determine the quantity, unit, food, or note, you should place everything into the note field and leave everything else empty. It's better to err on the side of putting everything in the note field than being wrong
- You may receive recipe ingredients from multiple different languages. You should adhere to the grammar rules of the input language when trying to parse the ingredient string
- Sometimes foods or units will be in their singular, plural, or other grammatical forms. You must interpret all of them appropriately
- Sometimes ingredients will have text in parentheses (like this). Parentheses typically indicate something that should appear in the notes. For example: an input of "3 potatoes (roughly chopped)" would parse "roughly chopped" into the notes. Notice that when this occurs, the parentheses are dropped, and you should use "roughly chopped" instead of "(roughly chopped)" in the note
- It's possible for the input to contain typos. For instance, you might see the word "potatos" instead of "potatoes". If it is a common misspelling, you may correct it
- Pay close attention to what can be considered a unit of measurement. There are common measurements such as tablespoon, teaspoon, and gram, abbreviations such as tsp, tbsp, and oz, and others such as sprig, can, bundle, bunch, unit, cube, package, and pinch
- Sometimes quantities can be given a range, such as "3-5" or "1 to 2" or "three or four". In this instance, choose the lower quantity; do not try to average or otherwise calculate the quantity. For instance, if the input is "2-3 lbs of chicken breast" the quantity should be "2"
- Any text that does not appear in the unit or food must appear in the notes. No text should be left off. The only exception for this is if a quantity is converted from text into a number. For instance, if you convert "2 dozen" into the number "24", you should not put the word "dozen" into any other field
It is imperative that you do not create any data or otherwise make up any information. Failure to adhere to this rule is illegal and will result in harsh punishment. If you are unsure, place the entire string into the note section of the response. Do not make things up.
In addition to calculating the recipe ingredient fields, you are also responsible for including a confidence value. This value is a float between 0 - 1, where 1 is full confidence that the result is correct, and 0 is no confidence that the result is correct. If you're unable to parse anything, and you put the entire string in the notes, you should return 0 confidence. If you can easily parse the string into each component, then you should return a confidence of 1. If you have to guess which part is the unit and which part is the food, your confidence should be lower, such as 0.6. Even if there is no unit or note, if you're able to determine the food, you may use a higher confidence. If the entire ingredient consists of only a food, you can use a confidence of 1.
Below you will receive the JSON schema for your response. Your response must be in valid JSON in the below schema as provided. You must respond in this JSON schema; failure to do so is illegal. It is imperative that you follow the schema precisely to avoid punishment. You must follow the JSON schema.
The user message that you receive will be the list of one or more recipe ingredients for you to parse. Your response should have exactly one item for each item provided. For instance, if you receive 12 items to parse, then your response should be an array of 12 parsed items.

View File

@@ -0,0 +1,161 @@
from abc import ABC, abstractmethod
from typing import TypeVar
from pydantic import UUID4, BaseModel
from rapidfuzz import fuzz, process
from sqlalchemy.orm import Session
from mealie.db.models.recipe.ingredient import IngredientFoodModel, IngredientUnitModel
from mealie.repos.all_repositories import get_repositories
from mealie.repos.repository_factory import AllRepositories
from mealie.schema.recipe.recipe_ingredient import (
CreateIngredientFood,
CreateIngredientUnit,
IngredientFood,
IngredientUnit,
ParsedIngredient,
)
from mealie.schema.response.pagination import PaginationQuery
T = TypeVar("T", bound=BaseModel)
class ABCIngredientParser(ABC):
    """
    Common base for ingredient parsers.

    Subclasses implement `parse_one` and `parse`; this class supplies the
    lookup tables and matching helpers that map parsed food/unit names onto
    the foods and units stored for a group.
    """

    def __init__(self, group_id: UUID4, session: Session) -> None:
        self.group_id = group_id
        self.session = session

        # lookup tables are built on first access and then reused
        self._foods_by_alias: dict[str, IngredientFood] | None = None
        self._units_by_alias: dict[str, IngredientUnit] | None = None

    @property
    def _repos(self) -> AllRepositories:
        return get_repositories(self.session)

    @property
    def foods_by_alias(self) -> dict[str, IngredientFood]:
        """Map every normalized name, plural name and alias of the group's foods to its food."""
        if self._foods_by_alias is not None:
            return self._foods_by_alias

        normalize = IngredientFoodModel.normalize
        group_foods = self._repos.ingredient_foods.by_group(self.group_id)

        lookup: dict[str, IngredientFood] = {}
        for food in group_foods.page_all(PaginationQuery(page=1, per_page=-1)).items:
            # order matters: later entries overwrite earlier ones on key collisions
            candidate_names = [food.name, food.plural_name]
            candidate_names.extend(alias.name for alias in food.aliases or [])
            for candidate in candidate_names:
                if candidate:
                    lookup[normalize(candidate)] = food

        self._foods_by_alias = lookup
        return lookup

    @property
    def units_by_alias(self) -> dict[str, IngredientUnit]:
        """Map every normalized name, plural, abbreviation and alias of the group's units to its unit."""
        if self._units_by_alias is not None:
            return self._units_by_alias

        normalize = IngredientUnitModel.normalize
        group_units = self._repos.ingredient_units.by_group(self.group_id)

        lookup: dict[str, IngredientUnit] = {}
        for unit in group_units.page_all(PaginationQuery(page=1, per_page=-1)).items:
            # order matters: later entries overwrite earlier ones on key collisions
            candidate_names = [unit.name, unit.plural_name, unit.abbreviation, unit.plural_abbreviation]
            candidate_names.extend(alias.name for alias in unit.aliases or [])
            for candidate in candidate_names:
                if candidate:
                    lookup[normalize(candidate)] = unit

        self._units_by_alias = lookup
        return lookup

    @property
    def food_fuzzy_match_threshold(self) -> int:
        """Minimum threshold to fuzzy match against a database food search"""
        return 85

    @property
    def unit_fuzzy_match_threshold(self) -> int:
        """Minimum threshold to fuzzy match against a database unit search"""
        return 70

    @abstractmethod
    async def parse_one(self, ingredient_string: str) -> ParsedIngredient: ...

    @abstractmethod
    async def parse(self, ingredients: list[str]) -> list[ParsedIngredient]: ...

    @classmethod
    def find_match(cls, match_value: str, *, store_map: dict[str, T], fuzzy_match_threshold: int = 0) -> T | None:
        """
        Look `match_value` up in `store_map`, exactly first and fuzzily second.

        Returns the stored item, or `None` when there is no exact key and no
        fuzzy candidate reaches `fuzzy_match_threshold`.
        """
        # an exact key always wins
        if match_value in store_map:
            return store_map[match_value]

        # otherwise take the best-scoring key, provided it clears the cutoff
        best = process.extractOne(
            match_value,
            store_map.keys(),
            scorer=fuzz.ratio,
            score_cutoff=fuzzy_match_threshold,
        )
        return None if best is None else store_map[best[0]]

    def find_food_match(self, food: IngredientFood | CreateIngredientFood | str) -> IngredientFood | None:
        """Resolve a food (or food name) to a stored food; an `IngredientFood` is returned unchanged."""
        if isinstance(food, IngredientFood):
            return food

        name = food if isinstance(food, str) else food.name
        return self.find_match(
            IngredientFoodModel.normalize(name),
            store_map=self.foods_by_alias,
            fuzzy_match_threshold=self.food_fuzzy_match_threshold,
        )

    def find_unit_match(self, unit: IngredientUnit | CreateIngredientUnit | str) -> IngredientUnit | None:
        """Resolve a unit (or unit name) to a stored unit; an `IngredientUnit` is returned unchanged."""
        if isinstance(unit, IngredientUnit):
            return unit

        name = unit if isinstance(unit, str) else unit.name
        return self.find_match(
            IngredientUnitModel.normalize(name),
            store_map=self.units_by_alias,
            fuzzy_match_threshold=self.unit_fuzzy_match_threshold,
        )

    def find_ingredient_match(self, ingredient: ParsedIngredient) -> ParsedIngredient:
        """
        Replace the parsed food and unit with their stored matches, where found.

        The ingredient is modified in place and also returned.
        """
        parsed = ingredient.ingredient

        if parsed.food:
            matched_food = self.find_food_match(parsed.food)
            if matched_food:
                parsed.food = matched_food

        if parsed.unit:
            matched_unit = self.find_unit_match(parsed.unit)
            if matched_unit:
                parsed.unit = matched_unit

        food, unit = parsed.food, parsed.unit
        if not (isinstance(food, CreateIngredientFood) and isinstance(unit, CreateIngredientUnit)):
            return ingredient

        # The parser may have split a single food name into a unit and a food,
        # so try matching the two names joined together as one food.
        combined_match = self.find_food_match(f"{unit.name} {food.name}")
        if combined_match:
            parsed.food = combined_match
            parsed.unit = None

        return ingredient

View File

@@ -1,173 +1,23 @@
from abc import ABC, abstractmethod
from fractions import Fraction
from typing import TypeVar
from pydantic import UUID4, BaseModel
from rapidfuzz import fuzz, process
from pydantic import UUID4
from sqlalchemy.orm import Session
from mealie.core.root_logger import get_logger
from mealie.db.models.recipe.ingredient import IngredientFoodModel, IngredientUnitModel
from mealie.repos.all_repositories import get_repositories
from mealie.repos.repository_factory import AllRepositories
from mealie.schema.recipe import RecipeIngredient
from mealie.schema.recipe.recipe_ingredient import (
MAX_INGREDIENT_DENOMINATOR,
CreateIngredientFood,
CreateIngredientUnit,
IngredientConfidence,
IngredientFood,
IngredientUnit,
ParsedIngredient,
RegisteredParser,
)
from mealie.schema.response.pagination import PaginationQuery
from . import brute, crfpp
from . import brute, crfpp, openai
from ._base import ABCIngredientParser
logger = get_logger(__name__)
T = TypeVar("T", bound=BaseModel)
class ABCIngredientParser(ABC):
"""
Abstract class for ingredient parsers.
"""
def __init__(self, group_id: UUID4, session: Session) -> None:
self.group_id = group_id
self.session = session
self._foods_by_alias: dict[str, IngredientFood] | None = None
self._units_by_alias: dict[str, IngredientUnit] | None = None
@property
def _repos(self) -> AllRepositories:
return get_repositories(self.session)
@property
def foods_by_alias(self) -> dict[str, IngredientFood]:
if self._foods_by_alias is None:
foods_repo = self._repos.ingredient_foods.by_group(self.group_id)
query = PaginationQuery(page=1, per_page=-1)
all_foods = foods_repo.page_all(query).items
foods_by_alias: dict[str, IngredientFood] = {}
for food in all_foods:
if food.name:
foods_by_alias[IngredientFoodModel.normalize(food.name)] = food
if food.plural_name:
foods_by_alias[IngredientFoodModel.normalize(food.plural_name)] = food
for alias in food.aliases or []:
if alias.name:
foods_by_alias[IngredientFoodModel.normalize(alias.name)] = food
self._foods_by_alias = foods_by_alias
return self._foods_by_alias
@property
def units_by_alias(self) -> dict[str, IngredientUnit]:
if self._units_by_alias is None:
units_repo = self._repos.ingredient_units.by_group(self.group_id)
query = PaginationQuery(page=1, per_page=-1)
all_units = units_repo.page_all(query).items
units_by_alias: dict[str, IngredientUnit] = {}
for unit in all_units:
if unit.name:
units_by_alias[IngredientUnitModel.normalize(unit.name)] = unit
if unit.plural_name:
units_by_alias[IngredientUnitModel.normalize(unit.plural_name)] = unit
if unit.abbreviation:
units_by_alias[IngredientUnitModel.normalize(unit.abbreviation)] = unit
if unit.plural_abbreviation:
units_by_alias[IngredientUnitModel.normalize(unit.plural_abbreviation)] = unit
for alias in unit.aliases or []:
if alias.name:
units_by_alias[IngredientUnitModel.normalize(alias.name)] = unit
self._units_by_alias = units_by_alias
return self._units_by_alias
@property
def food_fuzzy_match_threshold(self) -> int:
"""Minimum threshold to fuzzy match against a database food search"""
return 85
@property
def unit_fuzzy_match_threshold(self) -> int:
"""Minimum threshold to fuzzy match against a database unit search"""
return 70
@abstractmethod
def parse_one(self, ingredient_string: str) -> ParsedIngredient: ...
@abstractmethod
def parse(self, ingredients: list[str]) -> list[ParsedIngredient]: ...
@classmethod
def find_match(cls, match_value: str, *, store_map: dict[str, T], fuzzy_match_threshold: int = 0) -> T | None:
# check for literal matches
if match_value in store_map:
return store_map[match_value]
# fuzzy match against food store
fuzz_result = process.extractOne(
match_value, store_map.keys(), scorer=fuzz.ratio, score_cutoff=fuzzy_match_threshold
)
if fuzz_result is None:
return None
return store_map[fuzz_result[0]]
def find_food_match(self, food: IngredientFood | CreateIngredientFood | str) -> IngredientFood | None:
if isinstance(food, IngredientFood):
return food
food_name = food if isinstance(food, str) else food.name
match_value = IngredientFoodModel.normalize(food_name)
return self.find_match(
match_value,
store_map=self.foods_by_alias,
fuzzy_match_threshold=self.food_fuzzy_match_threshold,
)
def find_unit_match(self, unit: IngredientUnit | CreateIngredientUnit | str) -> IngredientUnit | None:
if isinstance(unit, IngredientUnit):
return unit
unit_name = unit if isinstance(unit, str) else unit.name
match_value = IngredientUnitModel.normalize(unit_name)
return self.find_match(
match_value,
store_map=self.units_by_alias,
fuzzy_match_threshold=self.unit_fuzzy_match_threshold,
)
def find_ingredient_match(self, ingredient: ParsedIngredient) -> ParsedIngredient:
if ingredient.ingredient.food and (food_match := self.find_food_match(ingredient.ingredient.food)):
ingredient.ingredient.food = food_match
if ingredient.ingredient.unit and (unit_match := self.find_unit_match(ingredient.ingredient.unit)):
ingredient.ingredient.unit = unit_match
# Parser might have wrongly split a food into a unit and food.
if isinstance(ingredient.ingredient.food, CreateIngredientFood) and isinstance(
ingredient.ingredient.unit, CreateIngredientUnit
):
if food_match := self.find_food_match(
f"{ingredient.ingredient.unit.name} {ingredient.ingredient.food.name}"
):
ingredient.ingredient.food = food_match
ingredient.ingredient.unit = None
return ingredient
class BruteForceParser(ABCIngredientParser):
@@ -175,7 +25,7 @@ class BruteForceParser(ABCIngredientParser):
Brute force ingredient parser.
"""
def parse_one(self, ingredient: str) -> ParsedIngredient:
async def parse_one(self, ingredient: str) -> ParsedIngredient:
bfi = brute.parse(ingredient, self)
parsed_ingredient = ParsedIngredient(
@@ -191,8 +41,8 @@ class BruteForceParser(ABCIngredientParser):
return self.find_ingredient_match(parsed_ingredient)
def parse(self, ingredients: list[str]) -> list[ParsedIngredient]:
return [self.parse_one(ingredient) for ingredient in ingredients]
async def parse(self, ingredients: list[str]) -> list[ParsedIngredient]:
return [await self.parse_one(ingredient) for ingredient in ingredients]
class NLPParser(ABCIngredientParser):
@@ -234,18 +84,19 @@ class NLPParser(ABCIngredientParser):
return self.find_ingredient_match(parsed_ingredient)
def parse(self, ingredients: list[str]) -> list[ParsedIngredient]:
async def parse(self, ingredients: list[str]) -> list[ParsedIngredient]:
crf_models = crfpp.convert_list_to_crf_model(ingredients)
return [self._crf_to_ingredient(crf_model) for crf_model in crf_models]
def parse_one(self, ingredient: str) -> ParsedIngredient:
items = self.parse([ingredient])
async def parse_one(self, ingredient_string: str) -> ParsedIngredient:
items = await self.parse([ingredient_string])
return items[0]
__registrar = {
__registrar: dict[RegisteredParser, type[ABCIngredientParser]] = {
RegisteredParser.nlp: NLPParser,
RegisteredParser.brute: BruteForceParser,
RegisteredParser.openai: openai.OpenAIParser,
}

View File

@@ -0,0 +1,5 @@
from .parser import OpenAIParser
# Names re-exported as the public interface of this package.
__all__ = [
    "OpenAIParser",
]

View File

@@ -0,0 +1,122 @@
import asyncio
import json
from collections.abc import Awaitable
from pydantic import BaseModel
from mealie.schema.recipe.recipe_ingredient import (
CreateIngredientFood,
CreateIngredientUnit,
IngredientConfidence,
ParsedIngredient,
RecipeIngredient,
)
from mealie.services.openai import OpenAIDataInjection, OpenAIService
from .._base import ABCIngredientParser
class OpenAIIngredient(BaseModel):
    """
    This class defines the JSON schema sent to OpenAI. Its schema is
    injected directly into the OpenAI prompt.
    """

    __doc__ = ""  # we don't want to include the docstring in the JSON schema

    # NOTE: field names, types and defaults below are part of the schema the
    # model is told to follow, so changing them changes the prompt.

    # the ingredient string that was parsed; the only required field
    input: str
    # used as the `average` of the resulting IngredientConfidence
    confidence: float | None = None
    quantity: float | None = 0
    # unit and food are plain names here; they are wrapped in
    # CreateIngredientUnit / CreateIngredientFood when converted
    unit: str | None = None
    food: str | None = None
    note: str | None = None
# Top-level response payload: the list of parsed ingredients.
# This class is passed as a data-injection value in OpenAIParser._get_prompt,
# so it is described with a comment instead of a docstring to keep extra text
# out of its JSON schema.
class OpenAIIngredients(BaseModel):
    ingredients: list[OpenAIIngredient] = []
class OpenAIParser(ABCIngredientParser):
    """
    Ingredient parser that delegates the parsing to OpenAI.

    Ingredients are sent in chunks, the JSON responses are validated against
    `OpenAIIngredients`, and each result is matched against stored foods/units.
    """

    def _convert_ingredient(self, openai_ing: OpenAIIngredient) -> ParsedIngredient:
        """Convert one OpenAI result into a `ParsedIngredient` and match its food and unit."""
        ingredient = RecipeIngredient(
            original_text=openai_ing.input,
            quantity=openai_ing.quantity,
            unit=CreateIngredientUnit(name=openai_ing.unit) if openai_ing.unit else None,
            food=CreateIngredientFood(name=openai_ing.food) if openai_ing.food else None,
            note=openai_ing.note,
        )

        parsed_ingredient = ParsedIngredient(
            input=openai_ing.input,
            confidence=IngredientConfidence(average=openai_ing.confidence),
            ingredient=ingredient,
        )

        return self.find_ingredient_match(parsed_ingredient)

    def _get_prompt(self, service: OpenAIService) -> str:
        """
        Build the system prompt: the stored parsing prompt plus the response
        schema and, when the service allows sending database data, the known units.
        """
        data_injections = [
            OpenAIDataInjection(
                description=(
                    "This is the JSON response schema. You must respond in valid JSON that follows this schema. "
                    "Your payload should be as compact as possible, eliminating unncessesary whitespace. Any fields "
                    "with default values which you do not populate should not be in the payload."
                ),
                value=OpenAIIngredients,
            ),
        ]

        if service.send_db_data:
            data_injections.extend(
                [
                    OpenAIDataInjection(
                        description=(
                            "Below is a list of units found in the units database. While parsing, you should "
                            "reference this list when determining which part of the input is the unit. You may "
                            "find a unit in the input that does not exist in this list. This should not prevent "
                            "you from parsing that text as a unit, however it may lower your confidence level."
                        ),
                        value=list(set(self.units_by_alias)),
                    ),
                ]
            )

        return service.get_prompt("recipes.parse-recipe-ingredients", data_injections=data_injections)

    @staticmethod
    def _chunk_messages(messages: list[str], n=1) -> list[list[str]]:
        """Split `messages` into consecutive chunks of at most `n` items (`n` below 1 is treated as 1)."""
        if n < 1:
            n = 1
        return [messages[i : i + n] for i in range(0, len(messages), n)]

    async def _parse(self, ingredients: list[str]) -> OpenAIIngredients:
        """
        Send the ingredients to OpenAI and merge the responses.

        Requests that produced no content are skipped, so the merged result can
        hold fewer ingredients than were sent.

        Raises:
            Exception: if no request returned any content.
        """
        service = OpenAIService()
        prompt = self._get_prompt(service)

        # split into chunks of at most `service.workers` ingredients; each chunk is its own request
        ingredient_chunks = self._chunk_messages(ingredients, n=service.workers)
        tasks: list[Awaitable[str | None]] = []
        for ingredient_chunk in ingredient_chunks:
            message = json.dumps(ingredient_chunk, separators=(",", ":"))
            tasks.append(service.get_response(prompt, message, force_json_response=True))

        # re-combine chunks into one response
        responses_json = await asyncio.gather(*tasks)
        responses = [
            OpenAIIngredients.model_validate_json(response_json)
            for response_json in responses_json
            # filter on the individual response: a failed request yields `None`,
            # which must be skipped rather than handed to the validator
            if response_json
        ]
        if not responses:
            raise Exception("No response from OpenAI")

        return OpenAIIngredients(
            ingredients=[ingredient for response in responses for ingredient in response.ingredients]
        )

    async def parse_one(self, ingredient_string: str) -> ParsedIngredient:
        """Parse a single ingredient string by delegating to `parse`."""
        items = await self.parse([ingredient_string])
        return items[0]

    async def parse(self, ingredients: list[str]) -> list[ParsedIngredient]:
        """Parse a list of ingredient strings into matched `ParsedIngredient` objects."""
        response = await self._parse(ingredients)
        return [self._convert_ingredient(ing) for ing in response.ingredients]