feat: search tokenization, handling of quoted literal search, and postgres fuzziness (#2351)

* Creating postgres migration script and starting to set up database detection

* non-working placeholders for postgres pg_trgm

* First draft of some indexes

* non-working commit of postgres indexing

* Further non-working edits to db-centric fuzzy search

* update alembic for extensions

* More non-working setup

* Move db type check to init_db

* fix typo in db name check

* Add sqlite token search and postgres full text search

* reorder search to hit exact matches faster

* Add settings and docs for POSTGRES_LANGUAGE (full text search)

* Use user-specified POSTGRES_LANGUAGE in search

* fix fuzzy search typo

* Remove full text search and instead order by trigram match

* cleaner adding of indices, remove fulltext

* Cleanup old import of getting app settings

* Fix typo in index

* Fix some alembic fuzzy typos

* Remove diagnostic printing from alembic migration

* Fix mixed up commutator for trigram operator and relax criteria

* forgot to remove query debug

* sort only on name

* token and fuzzy search tests

* Refactor recipe search test to avoid rare random string cross-matches.

* Add ability to quote parts of search for exact match

* Remove internal punctuation, unless it's quoted for literal search

* Add tests for special character removal and literal search

* Remove the outer double quotes from searches, but leave internal single quotes alone.

* Update tests to avoid intra-test name collisions

* Fixing leftovers highlighted by lint

* cleanup linting and mypy errors

* Fix test cross-matching on dirty db (leftovers from bulk import)

* forgot to cleanup something when debugging mypy errors

* re-order pg_trgm loading in postgres

* address comments
Author: Jacob Corn
Date: 2023-05-28 19:46:53 +02:00
Committed by: GitHub
Parent: 27ebb4c462
Commit: 7e0d29afc7
7 changed files with 304 additions and 43 deletions
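
The alembic migration itself isn't in this excerpt, but the commit messages ("update alembic for extensions", "First draft of some indexes", "re-order pg_trgm loading in postgres") describe its two jobs: load the pg_trgm extension before anything depends on it, and add trigram GIN indexes, skipping both on sqlite. A minimal sketch of that shape — index, table, and column names here are hypothetical placeholders, not the PR's actual migration:

from alembic import op


def upgrade():
    if op.get_bind().engine.name != "postgresql":
        return  # sqlite keeps plain LIKE-based token search; nothing to do
    # the extension must exist before any index that uses gin_trgm_ops
    op.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm;")
    op.create_index(
        "ix_recipes_name_normalized_gin",  # hypothetical index name
        "recipes",
        ["name_normalized"],
        postgresql_using="gin",
        postgresql_ops={"name_normalized": "gin_trgm_ops"},
    )


def downgrade():
    if op.get_bind().engine.name == "postgresql":
        op.drop_index("ix_recipes_name_normalized_gin", table_name="recipes")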


@@ -1,10 +1,11 @@
 import re as re
 from collections.abc import Sequence
 from random import randint
 from uuid import UUID
 from pydantic import UUID4
 from slugify import slugify
-from sqlalchemy import Select, and_, desc, func, or_, select
+from sqlalchemy import Select, and_, desc, func, or_, select, text
 from sqlalchemy.exc import IntegrityError
 from sqlalchemy.orm import joinedload
 from text_unidecode import unidecode
@@ -151,29 +152,94 @@ class RepositoryRecipes(RepositoryGeneric[Recipe, RecipeModel]):
         return ids + additional_ids

     def _add_search_to_query(self, query: Select, search: str) -> Select:
+        """
+        0. fuzzy search (postgres only) and tokenized search are performed separately
+        1. take search string and do a little pre-normalization
+        2. look for internal quoted strings and keep them together as "literal" parts of the search
+        3. remove special characters from each non-literal search string
+        4. token search looks for any individual exact hit in name, description, and ingredients
+        5. fuzzy search looks for trigram hits in name, description, and ingredients
+        6. Sort order is determined by closeness to the recipe name
+        Should search also look at tags?
+        """
+
         normalized_search = unidecode(search).lower().strip()
+        punctuation = "!\\#$%&()*+,-./:;<=>?@[\\]^_`{|}~"  # string.punctuation with ' & " removed
+
+        # keep quoted phrases together as literal portions of the search string
+        literal = False
+        quoted_regex = re.compile(r"""(["'])(?:(?=(\\?))\2.)*?\1""")  # thank you stack exchange!
+        removequotes_regex = re.compile(r"""['"](.*)['"]""")
+        if quoted_regex.search(normalized_search):
+            literal = True
+            temp = normalized_search
+            quoted_search_list = [match.group() for match in quoted_regex.finditer(temp)]  # all quoted strings
+            quoted_search_list = [removequotes_regex.sub("\\1", x) for x in quoted_search_list]  # remove outer quotes
+            temp = quoted_regex.sub("", temp)  # remove all quoted strings, leaving just non-quoted
+            temp = temp.translate(
+                str.maketrans(punctuation, " " * len(punctuation))
+            )  # punctuation -> spaces for splitting, but only on unquoted strings
+            unquoted_search_list = temp.split()  # all unquoted strings
+            normalized_search_list = quoted_search_list + unquoted_search_list
+        else:
+            normalized_search = normalized_search.translate(str.maketrans(punctuation, " " * len(punctuation)))
+            normalized_search_list = normalized_search.split()
+        normalized_search_list = [x.strip() for x in normalized_search_list]  # remove padding whitespace inside quotes

         # I would prefer to just do this in the recipe_ingredient.any part of the main query, but it turns out
         # that at least sqlite won't use indexes for that correctly anymore and takes a big hit, so prefiltering it is
-        ingredient_ids = (
-            self.session.execute(
-                select(RecipeIngredientModel.id).filter(
-                    or_(
-                        RecipeIngredientModel.note_normalized.like(f"%{normalized_search}%"),
-                        RecipeIngredientModel.original_text_normalized.like(f"%{normalized_search}%"),
-                    )
-                )
-            )
-            .scalars()
-            .all()
-        )
+        if (self.session.get_bind().name == "postgresql") & (literal is False):  # fuzzy search
+            ingredient_ids = (
+                self.session.execute(
+                    select(RecipeIngredientModel.id).filter(
+                        or_(
+                            RecipeIngredientModel.note_normalized.op("%>")(normalized_search),
+                            RecipeIngredientModel.original_text_normalized.op("%>")(normalized_search),
+                        )
+                    )
+                )
+                .scalars()
+                .all()
+            )
+        else:  # exact token search
+            ingredient_ids = (
+                self.session.execute(
+                    select(RecipeIngredientModel.id).filter(
+                        or_(
+                            *[RecipeIngredientModel.note_normalized.like(f"%{ns}%") for ns in normalized_search_list],
+                            *[
+                                RecipeIngredientModel.original_text_normalized.like(f"%{ns}%")
+                                for ns in normalized_search_list
+                            ],
+                        )
+                    )
+                )
+                .scalars()
+                .all()
+            )

-        q = query.filter(
-            or_(
-                RecipeModel.name_normalized.like(f"%{normalized_search}%"),
-                RecipeModel.description_normalized.like(f"%{normalized_search}%"),
-                RecipeModel.recipe_ingredient.any(RecipeIngredientModel.id.in_(ingredient_ids)),
-            )
-        ).order_by(desc(RecipeModel.name_normalized.like(f"%{normalized_search}%")))
+        if (self.session.get_bind().name == "postgresql") & (literal is False):  # fuzzy search
+            # default = 0.7 is too strict for effective fuzzing
+            self.session.execute(text("set pg_trgm.word_similarity_threshold = 0.5;"))
+            q = query.filter(
+                or_(
+                    RecipeModel.name_normalized.op("%>")(normalized_search),
+                    RecipeModel.description_normalized.op("%>")(normalized_search),
+                    RecipeModel.recipe_ingredient.any(RecipeIngredientModel.id.in_(ingredient_ids)),
+                )
+            ).order_by(  # trigram ordering could be too slow on a million-record db, but is fine with thousands
+                func.least(
+                    RecipeModel.name_normalized.op("<->>")(normalized_search),
+                )
+            )
+        else:  # exact token search
+            q = query.filter(
+                or_(
+                    *[RecipeModel.name_normalized.like(f"%{ns}%") for ns in normalized_search_list],
+                    *[RecipeModel.description_normalized.like(f"%{ns}%") for ns in normalized_search_list],
+                    RecipeModel.recipe_ingredient.any(RecipeIngredientModel.id.in_(ingredient_ids)),
+                )
+            ).order_by(desc(RecipeModel.name_normalized.like(f"%{normalized_search}%")))
+
         return q
def page_all(
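
The docstring's steps 1-3 are easiest to see in isolation. A standalone sketch of the same tokenization, reusing the PR's regexes and punctuation table (the tokenize helper is invented here for illustration, not part of the commit):

import re

punctuation = "!\\#$%&()*+,-./:;<=>?@[\\]^_`{|}~"  # string.punctuation with ' & " removed
quoted_regex = re.compile(r"""(["'])(?:(?=(\\?))\2.)*?\1""")
removequotes_regex = re.compile(r"""['"](.*)['"]""")


def tokenize(search: str) -> list[str]:
    normalized = search.lower().strip()
    if quoted_regex.search(normalized):
        # quoted phrases survive as single literal tokens, with outer quotes stripped
        quoted = [removequotes_regex.sub("\\1", m.group()) for m in quoted_regex.finditer(normalized)]
        rest = quoted_regex.sub("", normalized)  # tokenize only the unquoted remainder
        rest = rest.translate(str.maketrans(punctuation, " " * len(punctuation)))
        return [t.strip() for t in quoted + rest.split()]
    normalized = normalized.translate(str.maketrans(punctuation, " " * len(punctuation)))
    return normalized.split()


print(tokenize('"ice cream" choc-chip'))      # ['ice cream', 'choc', 'chip']
print(tokenize("chocolate, chip; cookies!"))  # ['chocolate', 'chip', 'cookies']

Finding any quoted phrase is also what flips literal to True above, which forces the exact token path even on postgres.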
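
On the postgres path, .op("%>") and .op("<->>") are pg_trgm's word-similarity operators: column %> term is true when term is similar enough to some word in the column (governed by pg_trgm.word_similarity_threshold, which the commit lowers to 0.5 for the session), and column <->> term is the corresponding distance (one minus the word similarity), so ordering by it ranks the closest names first. A small standalone sketch of the underlying function, assuming a reachable postgres database with pg_trgm installed (the connection URL is a placeholder):

from sqlalchemy import create_engine, text

engine = create_engine("postgresql+psycopg2://user:pass@localhost/demo")  # placeholder URL

with engine.connect() as conn:
    # word_similarity(term, text) is the score that %> thresholds and <->> inverts
    score = conn.execute(
        text("SELECT word_similarity(:term, :txt)"),
        {"term": "tomatoe", "txt": "fried tomatoes"},
    ).scalar()
    print(score)  # roughly 0.7: 'tomatoe' is close to the word 'tomatoes',
    # so with the threshold lowered to 0.5 the misspelled search still matches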