feat: Generalize Search to Other Models (#2472)

* generalized search logic to SearchFilter

* added default search behavior for all models

* fix for schema overrides

* added search support to several models

* fix for label search

* tests and fixes

* add config for normalizing characters

* dramatically simplified search tests

* bark bark

* fix normalization bug

* tweaked tests

* maybe this time?

---------

Co-authored-by: Hayden <64056131+hay-kot@users.noreply.github.com>
This commit is contained in:
Michael Genson
2023-08-20 13:30:21 -05:00
committed by GitHub
parent 76ae0bafc7
commit 99372aa2b6
16 changed files with 521 additions and 250 deletions

View File

@@ -1,7 +1,8 @@
# This file is auto-generated by gen_schema_exports.py
from .mealie_model import HasUUID, MealieModel
from .mealie_model import HasUUID, MealieModel, SearchType
__all__ = [
"HasUUID",
"MealieModel",
"SearchType",
]

View File

@@ -1,16 +1,34 @@
from __future__ import annotations
from collections.abc import Sequence
from typing import Protocol, TypeVar
from enum import Enum
from typing import ClassVar, Protocol, TypeVar
from humps.main import camelize
from pydantic import UUID4, BaseModel
from sqlalchemy import Select, desc, func, or_, text
from sqlalchemy.orm import InstrumentedAttribute, Session
from sqlalchemy.orm.interfaces import LoaderOption
from mealie.db.models._model_base import SqlAlchemyBase
T = TypeVar("T", bound=BaseModel)
class SearchType(Enum):
    """Search strategies understood by the schema-level search filters."""

    # trigram-based matching
    fuzzy = "fuzzy"
    # LIKE matching on each search token
    tokenized = "tokenized"
class MealieModel(BaseModel):
_fuzzy_similarity_threshold: ClassVar[float] = 0.5
_normalize_search: ClassVar[bool] = False
_searchable_properties: ClassVar[list[str]] = []
"""
Searchable properties for the search API.
The first property will be used for sorting (order_by)
"""
class Config:
alias_generator = camelize
allow_population_by_field_name = True
@@ -59,6 +77,40 @@ class MealieModel(BaseModel):
def loader_options(cls) -> list[LoaderOption]:
return []
@classmethod
def filter_search_query(
    cls,
    db_model: type[SqlAlchemyBase],
    query: Select,
    session: Session,
    search_type: SearchType,
    search: str,
    search_list: list[str],
) -> Select:
    """
    Apply the default search filter, driven by `_searchable_properties`, to a query.

    Fuzzy search sets the trigram word-similarity threshold on the session, keeps rows
    where any searchable property satisfies the `%>` operator against `search`, and orders
    by the `<->>` operator on the first searchable property.

    Any other search type keeps rows where any searchable property is LIKE `%token%` for
    any token in `search_list`, and orders (descending) by whether the first searchable
    property is LIKE the whole `search` string.

    Subclasses can override this to support a more advanced search.

    Raises:
        AttributeError: if the schema declares no searchable properties
    """

    property_names = cls._searchable_properties
    if not property_names:
        raise AttributeError("Not Implemented")

    columns: list[InstrumentedAttribute] = [getattr(db_model, name) for name in property_names]
    sort_column = columns[0]

    if search_type is not SearchType.fuzzy:
        # one LIKE clause per (property, token) pair, any of which may match
        token_filters = [column.like(f"%{token}%") for column in columns for token in search_list]

        # order by how close the result is to the first searchable property
        whole_search_hit = sort_column.like(f"%{search}%")
        return query.filter(or_(*token_filters)).order_by(desc(whole_search_hit))

    session.execute(text(f"set pg_trgm.word_similarity_threshold = {cls._fuzzy_similarity_threshold};"))
    trigram_filters = [column.op("%>")(search) for column in columns]

    # trigram ordering by the first searchable property
    trigram_order = func.least(sort_column.op("<->>")(search))
    return query.filter(or_(*trigram_filters)).order_by(trigram_order)
class HasUUID(Protocol):
id: UUID4

View File

@@ -1,5 +1,7 @@
from __future__ import annotations
from typing import ClassVar
from pydantic import UUID4
from mealie.schema._mealie import MealieModel
@@ -20,7 +22,7 @@ class MultiPurposeLabelUpdate(MultiPurposeLabelSave):
class MultiPurposeLabelSummary(MultiPurposeLabelUpdate):
pass
_searchable_properties: ClassVar[list[str]] = ["name"]
class Config:
orm_mode = True
@@ -31,14 +33,5 @@ class MultiPurposeLabelPagination(PaginationBase):
class MultiPurposeLabelOut(MultiPurposeLabelUpdate):
# shopping_list_items: list[ShoppingListItemOut] = []
# foods: list[IngredientFood] = []
class Config:
orm_mode = True
# from mealie.schema.recipe.recipe_ingredient import IngredientFood
# from mealie.schema.group.group_shopping_list import ShoppingListItemOut
# MultiPurposeLabelOut.update_forward_refs()

View File

@@ -2,16 +2,17 @@ from __future__ import annotations
import datetime
from pathlib import Path
from typing import Any
from typing import Any, ClassVar
from uuid import uuid4
from pydantic import UUID4, BaseModel, Field, validator
from slugify import slugify
from sqlalchemy.orm import joinedload, selectinload
from sqlalchemy import Select, desc, func, or_, select, text
from sqlalchemy.orm import Session, joinedload, selectinload
from sqlalchemy.orm.interfaces import LoaderOption
from mealie.core.config import get_app_dirs
from mealie.schema._mealie import MealieModel
from mealie.schema._mealie import MealieModel, SearchType
from mealie.schema.response.pagination import PaginationBase
from ...db.models.recipe import (
@@ -37,6 +38,8 @@ class RecipeTag(MealieModel):
name: str
slug: str
_searchable_properties: ClassVar[list[str]] = ["name"]
class Config:
orm_mode = True
@@ -78,6 +81,7 @@ class CreateRecipe(MealieModel):
class RecipeSummary(MealieModel):
id: UUID4 | None
_normalize_search: ClassVar[bool] = True
user_id: UUID4 = Field(default_factory=uuid4)
group_id: UUID4 = Field(default_factory=uuid4)
@@ -259,6 +263,69 @@ class Recipe(RecipeSummary):
selectinload(RecipeModel.notes),
]
@classmethod
def filter_search_query(
    cls, db_model, query: Select, session: Session, search_type: SearchType, search: str, search_list: list[str]
) -> Select:
    """
    Filter a recipe query by a search string.

    1. token search looks for any individual exact hit in name, description, and ingredients
    2. fuzzy search looks for trigram hits in name, description, and ingredients
    3. Sort order is determined by closeness to the recipe name

    Should search also look at tags?

    Args:
        db_model: not used by this override; RecipeModel and RecipeIngredientModel are queried directly
        query: the select statement to add the search filter and ordering to
        session: session used to run the ingredient prefilter (and, for fuzzy search, to set the
            trigram threshold)
        search_type: SearchType.fuzzy for trigram matching; anything else uses LIKE on each token
        search: the full search string, used for fuzzy matching and for ordering
        search_list: the individual search tokens, used for token matching

    Returns:
        The query filtered by the search and ordered by closeness to the recipe name
    """

    if search_type is SearchType.fuzzy:
        # The `%>` operator is evaluated against pg_trgm.word_similarity_threshold when a statement
        # runs, and the ingredient prefilter below is executed immediately, so the threshold has to
        # be set first or the prefilter would run with whatever value the connection already had
        session.execute(text(f"set pg_trgm.word_similarity_threshold = {cls._fuzzy_similarity_threshold};"))

        # I would prefer to just do this in the recipe_ingredient.any part of the main query,
        # but it turns out that at least sqlite won't use indexes for that correctly anymore and
        # takes a big hit, so prefiltering it is
        ingredient_ids = (
            session.execute(
                select(RecipeIngredientModel.id).filter(
                    or_(
                        RecipeIngredientModel.note_normalized.op("%>")(search),
                        RecipeIngredientModel.original_text_normalized.op("%>")(search),
                    )
                )
            )
            .scalars()
            .all()
        )

        return query.filter(
            or_(
                RecipeModel.name_normalized.op("%>")(search),
                RecipeModel.description_normalized.op("%>")(search),
                RecipeModel.recipe_ingredient.any(RecipeIngredientModel.id.in_(ingredient_ids)),
            )
        ).order_by(  # trigram ordering could be too slow on million record db, but is fine with thousands.
            func.least(
                RecipeModel.name_normalized.op("<->>")(search),
            )
        )
    else:
        # prefilter the ingredients that contain any of the search tokens
        ingredient_ids = (
            session.execute(
                select(RecipeIngredientModel.id).filter(
                    or_(
                        *[RecipeIngredientModel.note_normalized.like(f"%{ns}%") for ns in search_list],
                        *[RecipeIngredientModel.original_text_normalized.like(f"%{ns}%") for ns in search_list],
                    )
                )
            )
            .scalars()
            .all()
        )

        # recipes whose name contains the whole search string are ordered first
        return query.filter(
            or_(
                *[RecipeModel.name_normalized.like(f"%{ns}%") for ns in search_list],
                *[RecipeModel.description_normalized.like(f"%{ns}%") for ns in search_list],
                RecipeModel.recipe_ingredient.any(RecipeIngredientModel.id.in_(ingredient_ids)),
            )
        ).order_by(desc(RecipeModel.name_normalized.like(f"%{search}%")))
class RecipeLastMade(BaseModel):
timestamp: datetime.datetime

View File

@@ -3,6 +3,7 @@ from __future__ import annotations
import datetime
import enum
from fractions import Fraction
from typing import ClassVar
from uuid import UUID, uuid4
from pydantic import UUID4, Field, validator
@@ -50,6 +51,8 @@ class IngredientFood(CreateIngredientFood):
created_at: datetime.datetime | None
update_at: datetime.datetime | None
_searchable_properties: ClassVar[list[str]] = ["name", "description"]
class Config:
orm_mode = True
getter_dict = ExtrasGetterDict
@@ -78,6 +81,8 @@ class IngredientUnit(CreateIngredientUnit):
created_at: datetime.datetime | None
update_at: datetime.datetime | None
_searchable_properties: ClassVar[list[str]] = ["name", "abbreviation", "description"]
class Config:
orm_mode = True

View File

@@ -0,0 +1,67 @@
import re
from sqlalchemy import Select
from sqlalchemy.orm import Session
from text_unidecode import unidecode
from ...db.models._model_base import SqlAlchemyBase
from .._mealie import MealieModel, SearchType
class SearchFilter:
    """
    Prepares a raw search string and applies it to a query through a schema's search filter.

    0. fuzzy search (postgres only) and tokenized search are performed separately
    1. take search string and do a little pre-normalization
    2. look for internal quoted strings and keep them together as "literal" parts of the search
    3. remove special characters from each non-literal search string
    """

    # string.punctuation with ' & " removed
    punctuation = "!#$%&()*+,-./:;<=>?@[\\]^_`{|}~"

    # translation table turning every punctuation character into a space; built once
    # instead of on every call
    _punctuation_to_spaces = str.maketrans(punctuation, " " * len(punctuation))

    # a quoted section: an opening ' or ", anything (lazily), then the same closing quote
    quoted_regex = re.compile(r"""(["'])(?:(?=(\\?))\2.)*?\1""")
    # captures the text between the outer quotes of a quoted section
    remove_quotes_regex = re.compile(r"""['"](.*)['"]""")

    @classmethod
    def _normalize_search(cls, search: str, normalize_characters: bool) -> str:
        """
        Replace punctuation with spaces and trim the search string.

        When `normalize_characters` is set the string is also passed through `unidecode`
        and lowercased.
        """

        search = search.translate(cls._punctuation_to_spaces)
        if normalize_characters:
            return unidecode(search).lower().strip()

        return search.strip()

    @classmethod
    def _build_search_list(cls, search: str) -> list[str]:
        """
        Split a search string into tokens.

        Quoted sections are kept together as single tokens (outer quotes removed) and come
        first; the remaining text is split on whitespace after punctuation is replaced
        with spaces.
        """

        # all quoted strings, with their outer quotes removed
        quoted_search_list = [
            cls.remove_quotes_regex.sub("\\1", match.group()) for match in cls.quoted_regex.finditer(search)
        ]

        # punctuation->spaces for splitting, but only on unquoted strings;
        # when nothing is quoted this is simply the whole search string
        unquoted_search = cls.quoted_regex.sub("", search)
        unquoted_search_list = unquoted_search.translate(cls._punctuation_to_spaces).split()

        # remove padding whitespace inside quotes
        return [token.strip() for token in quoted_search_list + unquoted_search_list]

    def __init__(self, session: Session, search: str, normalize_characters: bool = False) -> None:
        """
        Pick the search type and pre-compute the normalized search string and token list.

        Fuzzy search is only used when the session is bound to postgresql and the search
        contains no quoted section; otherwise tokenized search is used.
        """

        if session.get_bind().name != "postgresql" or self.quoted_regex.search(search.strip()):
            self.search_type = SearchType.tokenized
        else:
            self.search_type = SearchType.fuzzy

        self.session = session
        self.search = self._normalize_search(search, normalize_characters)
        self.search_list = self._build_search_list(self.search)

    def filter_query_by_search(self, query: Select, schema: type[MealieModel], model: type[SqlAlchemyBase]) -> Select:
        """Delegate to the schema's `filter_search_query` with the prepared search state."""

        return schema.filter_search_query(model, query, self.session, self.search_type, self.search, self.search_list)