feat: search tokenization, handling of quoted literal search, and postgres fuzziness (#2351)

* Creating postgres migration script and starting to set up to detect database

* non-working placeholders for postgres pg_trgm

* First draft of some indexes

* non-working commit of postgres indexing

* Further non-working edits to db-centric fuzzy search

* update alembic for extensions

* More non-working setup

* Move db type check to init_db

* fix typo in db name check

* Add sqlite token search and postgres full text search

* reorder search to hit exact matches faster

* Add settings and docs for POSTGRES_LANGUAGE (full text search)

* Use user-specified POSTGRES_LANGUAGE in search

* fix fuzzy search typo

* Remove full text search and instead order by trigram match

* cleaner adding of indices, remove fulltext

* Cleanup old import of getting app settings

* Fix typo in index

* Fix some alembic fuzzy typos

* Remove diagnostic printing from alembic migration

* Fix mixed up commutator for trigram operator and relax criteria

* forgot to remove query debug

* sort only on name

* token and fuzzy search tests

* Refactor recipe search test to avoid rare random string cross-matches.

* Add ability to quote parts of search for exact match

* Remove internal punctuation, unless it's quoted for literal search

* Add tests for special character removal and literal search

* Remove the outer double quotes from searches, but leave internal single quotes alone.

* Update tests to avoid intra-test name collisions

* Fixing leftovers highlighted by lint

* cleanup linting and mypy errors

* Fix test cross-matching on dirty db (leftovers from bulk import)

* forgot to cleanup something when debugging mypy errors

* re-order pg_trgm loading in postgres

* address comments
This commit is contained in:
Jacob Corn
2023-05-28 19:46:53 +02:00
committed by GitHub
parent 27ebb4c462
commit 7e0d29afc7
7 changed files with 304 additions and 43 deletions

View File

@@ -92,6 +92,9 @@ def main():
logger.info("Migration needed. Performing migration...")
command.upgrade(alembic_cfg, "head")
if session.get_bind().name == "postgresql": # needed for fuzzy search and fast GIN text indices
session.execute(text("CREATE EXTENSION IF NOT EXISTS pg_trgm;"))
db = get_repositories(session)
if db.users.get_all():

View File

@@ -1,7 +1,9 @@
from typing import TYPE_CHECKING
import sqlalchemy as sa
from sqlalchemy import Boolean, Float, ForeignKey, Integer, String, event, orm
from sqlalchemy.orm import Mapped, mapped_column
from sqlalchemy.orm.session import Session
from text_unidecode import unidecode
from mealie.db.models._model_base import BaseMixins, SqlAlchemyBase
@@ -87,7 +89,7 @@ class RecipeIngredientModel(SqlAlchemyBase, BaseMixins):
original_text_normalized: Mapped[str | None] = mapped_column(String, index=True)
@auto_init()
def __init__(self, note: str | None = None, orginal_text: str | None = None, **_) -> None:
def __init__(self, session: Session, note: str | None = None, orginal_text: str | None = None, **_) -> None:
# SQLAlchemy events do not seem to register things that are set during auto_init
if note is not None:
self.note_normalized = unidecode(note).lower().strip()
@@ -95,13 +97,51 @@ class RecipeIngredientModel(SqlAlchemyBase, BaseMixins):
if orginal_text is not None:
self.orginal_text = unidecode(orginal_text).lower().strip()
tableargs = [ # base set of indices
sa.Index(
"ix_recipes_ingredients_note_normalized",
"note_normalized",
unique=False,
),
sa.Index(
"ix_recipes_ingredients_original_text_normalized",
"original_text_normalized",
unique=False,
),
]
if session.get_bind().name == "postgresql":
tableargs.extend(
[
sa.Index(
"ix_recipes_ingredients_note_normalized_gin",
"note_normalized",
unique=False,
postgresql_using="gin",
postgresql_ops={
"note_normalized": "gin_trgm_ops",
},
),
sa.Index(
"ix_recipes_ingredients_original_text_normalized_gin",
"original_text",
unique=False,
postgresql_using="gin",
postgresql_ops={
"original_text_normalized": "gin_trgm_ops",
},
),
]
)
# add indices
self.__table_args__ = tuple(tableargs)
@event.listens_for(RecipeIngredientModel.note, "set")
def receive_note(target: RecipeIngredientModel, value: str, oldvalue, initiator):
if value is not None:
target.name_normalized = unidecode(value).lower().strip()
target.note_normalized = unidecode(value).lower().strip()
else:
target.name_normalized = None
target.note_normalized = None
@event.listens_for(RecipeIngredientModel.original_text, "set")

View File

@@ -35,7 +35,9 @@ if TYPE_CHECKING:
class RecipeModel(SqlAlchemyBase, BaseMixins):
__tablename__ = "recipes"
__table_args__ = (sa.UniqueConstraint("slug", "group_id", name="recipe_slug_group_id_key"),)
__table_args__: tuple[sa.UniqueConstraint, ...] = (
sa.UniqueConstraint("slug", "group_id", name="recipe_slug_group_id_key"),
)
id: Mapped[GUID] = mapped_column(GUID, primary_key=True, default=GUID.generate)
slug: Mapped[str | None] = mapped_column(sa.String, index=True)
@@ -192,6 +194,46 @@ class RecipeModel(SqlAlchemyBase, BaseMixins):
if description is not None:
self.description_normalized = unidecode(description).lower().strip()
tableargs = [ # base set of indices
sa.UniqueConstraint("slug", "group_id", name="recipe_slug_group_id_key"),
sa.Index(
"ix_recipes_name_normalized",
"name_normalized",
unique=False,
),
sa.Index(
"ix_recipes_description_normalized",
"description_normalized",
unique=False,
),
]
if session.get_bind().name == "postgresql":
tableargs.extend(
[
sa.Index(
"ix_recipes_name_normalized_gin",
"name_normalized",
unique=False,
postgresql_using="gin",
postgresql_ops={
"name_normalized": "gin_trgm_ops",
},
),
sa.Index(
"ix_recipes_description_normalized_gin",
"description_normalized",
unique=False,
postgresql_using="gin",
postgresql_ops={
"description_normalized": "gin_trgm_ops",
},
),
]
)
# add indices
self.__table_args__ = tuple(tableargs)
@event.listens_for(RecipeModel.name, "set")
def receive_name(target: RecipeModel, value: str, oldvalue, initiator):