mirror of
https://github.com/navidrome/navidrome.git
synced 2026-03-04 06:35:52 +00:00
* fix(subsonic): optimize search3 for high-cardinality FTS queries Use a two-phase query strategy for FTS5 searches to avoid the performance penalty of expensive LEFT JOINs (annotation, bookmark, library) on high-cardinality results like "the". Phase 1 runs a lightweight query (main table + FTS index only) to get sorted, paginated rowids. Phase 2 hydrates only those few rowids with the full JOINs, making them nearly free. For queries with complex ORDER BY expressions that reference joined tables (e.g. artist search sorted by play count), the optimization is skipped and the original single-query approach is used. * fix(search): update order by clauses to include 'rank' for FTS queries Signed-off-by: Deluan <deluan@navidrome.org> * fix(search): reintroduce 'rank' in Phase 2 ORDER BY for FTS queries Signed-off-by: Deluan <deluan@navidrome.org> * fix(search): remove 'rank' from ORDER BY in non-FTS queries and adjust two-phase query handling Signed-off-by: Deluan <deluan@navidrome.org> * fix(search): update FTS ranking to use bm25 weights and simplify ORDER BY qualification Signed-off-by: Deluan <deluan@navidrome.org> * fix(search): refine FTS query handling and improve comments for clarity Signed-off-by: Deluan <deluan@navidrome.org> * fix(search): refactor full-text search handling to streamline query strategy selection and improve LIKE fallback logic. Increase e2e coverage for search3 Signed-off-by: Deluan <deluan@navidrome.org> * refactor: enhance FTS column definitions and relevance weights Signed-off-by: Deluan <deluan@navidrome.org> * fix(search): refactor Search method signatures to remove offset and size parameters, streamline query handling Signed-off-by: Deluan <deluan@navidrome.org> * fix(search): allow single-character queries in search strategies and update related tests Signed-off-by: Deluan <deluan@navidrome.org> * fix(search): make FTS Phase 1 treat Max=0 as no limit, reorganize tests FTS Phase 1 unconditionally called Limit(uint64(options.Max)), which produced LIMIT 0 when Max was zero. This diverged from applyOptions where Max=0 means no limit. Now Phase 1 mirrors applyOptions: only add LIMIT/OFFSET when the value is positive. Also moved legacy backend integration tests from sql_search_fts_test.go to sql_search_like_test.go and added regression tests for the Max=0 behavior on both backends. * refactor: simplify callSearch function by removing variadic options and directly using QueryOptions Signed-off-by: Deluan <deluan@navidrome.org> * fix(search): implement ftsQueryDegraded function to detect significant content loss in FTS queries Signed-off-by: Deluan <deluan@navidrome.org> --------- Signed-off-by: Deluan <deluan@navidrome.org>
423 lines
15 KiB
Go
423 lines
15 KiB
Go
package persistence
|
|
|
|
import (
|
|
"fmt"
|
|
"regexp"
|
|
"strings"
|
|
"unicode"
|
|
"unicode/utf8"
|
|
|
|
. "github.com/Masterminds/squirrel"
|
|
"github.com/navidrome/navidrome/log"
|
|
"github.com/navidrome/navidrome/model"
|
|
)
|
|
|
|
// containsCJK returns true if the string contains any CJK (Chinese/Japanese/Korean) characters.
|
|
// CJK text doesn't use spaces between words, so FTS5's unicode61 tokenizer treats entire
|
|
// CJK phrases as single tokens, making token-based search ineffective for CJK content.
|
|
func containsCJK(s string) bool {
|
|
for _, r := range s {
|
|
if unicode.Is(unicode.Han, r) ||
|
|
unicode.Is(unicode.Hiragana, r) ||
|
|
unicode.Is(unicode.Katakana, r) ||
|
|
unicode.Is(unicode.Hangul, r) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// fts5SpecialChars matches characters that should be stripped from user input.
|
|
// We keep only Unicode letters, numbers, whitespace, * (prefix wildcard), " (phrase quotes),
|
|
// and \x00 (internal placeholder marker). All punctuation is removed because the unicode61
|
|
// tokenizer treats it as token separators, and characters like ' can cause FTS5 parse errors
|
|
// as unbalanced string delimiters.
|
|
var fts5SpecialChars = regexp.MustCompile(`[^\p{L}\p{N}\s*"\x00]`)
|
|
|
|
// fts5PunctStrip strips everything except letters and numbers (no whitespace, wildcards, or quotes).
|
|
// Used for normalizing words at index time to create concatenated forms (e.g., "R.E.M." → "REM").
|
|
var fts5PunctStrip = regexp.MustCompile(`[^\p{L}\p{N}]`)
|
|
|
|
// fts5Operators matches FTS5 boolean operators as whole words (case-insensitive).
|
|
var fts5Operators = regexp.MustCompile(`(?i)\b(AND|OR|NOT|NEAR)\b`)
|
|
|
|
// fts5LeadingStar matches a * at the start of a token. FTS5 only supports * at the end (prefix queries).
|
|
var fts5LeadingStar = regexp.MustCompile(`(^|[\s])\*+`)
|
|
|
|
// normalizeForFTS takes multiple strings, strips non-letter/non-number characters from each word,
|
|
// and returns a space-separated string of words that changed after stripping (deduplicated).
|
|
// This is used at index time to create concatenated forms: "R.E.M." → "REM", "AC/DC" → "ACDC".
|
|
func normalizeForFTS(values ...string) string {
|
|
seen := make(map[string]struct{})
|
|
var result []string
|
|
for _, v := range values {
|
|
for _, word := range strings.Fields(v) {
|
|
stripped := fts5PunctStrip.ReplaceAllString(word, "")
|
|
if stripped == "" || stripped == word {
|
|
continue
|
|
}
|
|
lower := strings.ToLower(stripped)
|
|
if _, ok := seen[lower]; ok {
|
|
continue
|
|
}
|
|
seen[lower] = struct{}{}
|
|
result = append(result, stripped)
|
|
}
|
|
}
|
|
return strings.Join(result, " ")
|
|
}
|
|
|
|
// isSingleUnicodeLetter returns true if token is exactly one Unicode letter.
|
|
func isSingleUnicodeLetter(token string) bool {
|
|
r, size := utf8.DecodeRuneInString(token)
|
|
return size == len(token) && size > 0 && unicode.IsLetter(r)
|
|
}
|
|
|
|
// namePunctuation is the set of characters commonly used as separators in artist/album
|
|
// names (hyphens, slashes, dots, apostrophes). Only words containing these are candidates
|
|
// for punctuated-word processing; other special characters (^, :, &) are just stripped.
|
|
const namePunctuation = `-/.''`
|
|
|
|
// processPunctuatedWords handles words with embedded name punctuation before the general
|
|
// special-character stripping. For each punctuated word it produces either:
|
|
// - A quoted phrase for dotted abbreviations: R.E.M. → "R E M"
|
|
// - A phrase+concat OR for other patterns: a-ha → ("a ha" OR aha*)
|
|
func processPunctuatedWords(input string, phrases []string) (string, []string) {
|
|
words := strings.Fields(input)
|
|
var result []string
|
|
for _, w := range words {
|
|
if strings.HasPrefix(w, "\x00") || strings.ContainsAny(w, `*"`) || !strings.ContainsAny(w, namePunctuation) {
|
|
result = append(result, w)
|
|
continue
|
|
}
|
|
concat := fts5PunctStrip.ReplaceAllString(w, "")
|
|
if concat == "" || concat == w {
|
|
result = append(result, w)
|
|
continue
|
|
}
|
|
subTokens := strings.Fields(fts5SpecialChars.ReplaceAllString(w, " "))
|
|
if len(subTokens) < 2 {
|
|
// Single sub-token after splitting (e.g., N' → N): just use the stripped form
|
|
result = append(result, concat)
|
|
continue
|
|
}
|
|
// Dotted abbreviations (R.E.M., U.K.) — all single letters separated by dots only
|
|
if isDottedAbbreviation(w, subTokens) {
|
|
phrases = append(phrases, fmt.Sprintf(`"%s"`, strings.Join(subTokens, " ")))
|
|
} else {
|
|
// Punctuated names (a-ha, AC/DC, Jay-Z) — phrase for adjacency + concat for search_normalized
|
|
phrases = append(phrases, fmt.Sprintf(`("%s" OR %s*)`, strings.Join(subTokens, " "), concat))
|
|
}
|
|
result = append(result, fmt.Sprintf("\x00PHRASE%d\x00", len(phrases)-1))
|
|
}
|
|
return strings.Join(result, " "), phrases
|
|
}
|
|
|
|
// isDottedAbbreviation returns true if w uses only dots as punctuation and all sub-tokens
|
|
// are single letters (e.g., "R.E.M.", "U.K." but not "a-ha" or "AC/DC").
|
|
func isDottedAbbreviation(w string, subTokens []string) bool {
|
|
for _, r := range w {
|
|
if !unicode.IsLetter(r) && !unicode.IsNumber(r) && r != '.' {
|
|
return false
|
|
}
|
|
}
|
|
for _, st := range subTokens {
|
|
if !isSingleUnicodeLetter(st) {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
// buildFTS5Query preprocesses user input into a safe FTS5 MATCH expression.
|
|
// It preserves quoted phrases and * prefix wildcards, neutralizes FTS5 operators
|
|
// (by lowercasing them, since FTS5 operators are case-sensitive) and strips
|
|
// special characters to prevent query injection.
|
|
func buildFTS5Query(userInput string) string {
|
|
q := strings.TrimSpace(userInput)
|
|
if q == "" || q == `""` {
|
|
return ""
|
|
}
|
|
|
|
var phrases []string
|
|
result := q
|
|
for {
|
|
start := strings.Index(result, `"`)
|
|
if start == -1 {
|
|
break
|
|
}
|
|
end := strings.Index(result[start+1:], `"`)
|
|
if end == -1 {
|
|
// Unmatched quote — remove it
|
|
result = result[:start] + result[start+1:]
|
|
break
|
|
}
|
|
end += start + 1
|
|
phrase := result[start : end+1] // includes quotes
|
|
phrases = append(phrases, phrase)
|
|
result = result[:start] + fmt.Sprintf("\x00PHRASE%d\x00", len(phrases)-1) + result[end+1:]
|
|
}
|
|
|
|
// Neutralize FTS5 operators by lowercasing them (FTS5 operators are case-sensitive:
|
|
// AND, OR, NOT, NEAR are operators, but and, or, not, near are plain tokens)
|
|
result = fts5Operators.ReplaceAllStringFunc(result, strings.ToLower)
|
|
|
|
// Handle words with embedded punctuation (a-ha, AC/DC, R.E.M.) before stripping
|
|
result, phrases = processPunctuatedWords(result, phrases)
|
|
|
|
result = fts5SpecialChars.ReplaceAllString(result, " ")
|
|
result = fts5LeadingStar.ReplaceAllString(result, "$1")
|
|
tokens := strings.Fields(result)
|
|
|
|
// Append * to plain tokens for prefix matching (e.g., "love" → "love*").
|
|
// Skip tokens that are already wildcarded or are quoted phrase placeholders.
|
|
for i, t := range tokens {
|
|
if strings.HasPrefix(t, "\x00") || strings.HasSuffix(t, "*") {
|
|
continue
|
|
}
|
|
tokens[i] = t + "*"
|
|
}
|
|
|
|
result = strings.Join(tokens, " ")
|
|
|
|
for i, phrase := range phrases {
|
|
placeholder := fmt.Sprintf("\x00PHRASE%d\x00", i)
|
|
result = strings.ReplaceAll(result, placeholder, phrase)
|
|
}
|
|
|
|
return result
|
|
}
|
|
|
|
// ftsColumn pairs an FTS5 column name with its BM25 relevance weight.
|
|
type ftsColumn struct {
|
|
Name string
|
|
Weight float64
|
|
}
|
|
|
|
// ftsColumnDefs defines FTS5 columns and their BM25 relevance weights.
|
|
// The order MUST match the column order in the FTS5 table definition (see migrations).
|
|
// All columns are both searched and ranked. When adding indexed-but-not-searched
|
|
// columns in the future, use Weight: 0 to exclude from the search column filter.
|
|
var ftsColumnDefs = map[string][]ftsColumn{
|
|
"media_file": {
|
|
{"title", 10.0},
|
|
{"album", 5.0},
|
|
{"artist", 3.0},
|
|
{"album_artist", 3.0},
|
|
{"sort_title", 1.0},
|
|
{"sort_album_name", 1.0},
|
|
{"sort_artist_name", 1.0},
|
|
{"sort_album_artist_name", 1.0},
|
|
{"disc_subtitle", 1.0},
|
|
{"search_participants", 2.0},
|
|
{"search_normalized", 1.0},
|
|
},
|
|
"album": {
|
|
{"name", 10.0},
|
|
{"sort_album_name", 1.0},
|
|
{"album_artist", 3.0},
|
|
{"search_participants", 2.0},
|
|
{"discs", 1.0},
|
|
{"catalog_num", 1.0},
|
|
{"album_version", 1.0},
|
|
{"search_normalized", 1.0},
|
|
},
|
|
"artist": {
|
|
{"name", 10.0},
|
|
{"sort_artist_name", 1.0},
|
|
{"search_normalized", 1.0},
|
|
},
|
|
}
|
|
|
|
// ftsColumnFilters and ftsBM25Weights are precomputed from ftsColumnDefs at init time
|
|
// to avoid per-query allocations.
|
|
var (
|
|
ftsColumnFilters = map[string]string{}
|
|
ftsBM25Weights = map[string]string{}
|
|
)
|
|
|
|
func init() {
|
|
for table, cols := range ftsColumnDefs {
|
|
var names []string
|
|
weights := make([]string, len(cols))
|
|
for i, c := range cols {
|
|
if c.Weight > 0 {
|
|
names = append(names, c.Name)
|
|
}
|
|
weights[i] = fmt.Sprintf("%.1f", c.Weight)
|
|
}
|
|
ftsColumnFilters[table] = "{" + strings.Join(names, " ") + "}"
|
|
ftsBM25Weights[table] = strings.Join(weights, ", ")
|
|
}
|
|
}
|
|
|
|
// ftsSearch implements searchStrategy using FTS5 full-text search with BM25 ranking.
|
|
type ftsSearch struct {
|
|
tableName string
|
|
ftsTable string
|
|
matchExpr string
|
|
rankExpr string
|
|
}
|
|
|
|
// ToSql returns a single-query fallback for the REST filter path (no two-phase split).
|
|
func (s *ftsSearch) ToSql() (string, []interface{}, error) {
|
|
sql := s.tableName + ".rowid IN (SELECT rowid FROM " + s.ftsTable + " WHERE " + s.ftsTable + " MATCH ?)"
|
|
return sql, []interface{}{s.matchExpr}, nil
|
|
}
|
|
|
|
// execute runs a two-phase FTS5 search:
|
|
// - Phase 1: lightweight rowid query (main table + FTS + library filter) for ranking and pagination.
|
|
// - Phase 2: full SELECT with all JOINs, scoped to Phase 1's rowid set.
|
|
//
|
|
// Complex ORDER BY (function calls, aggregations) are dropped from Phase 1.
|
|
func (s *ftsSearch) execute(r sqlRepository, sq SelectBuilder, dest any, cfg searchConfig, options model.QueryOptions) error {
|
|
qualifiedOrderBys := []string{s.rankExpr}
|
|
for _, ob := range cfg.OrderBy {
|
|
if qualified := qualifyOrderBy(s.tableName, ob); qualified != "" {
|
|
qualifiedOrderBys = append(qualifiedOrderBys, qualified)
|
|
}
|
|
}
|
|
|
|
// Phase 1: fresh query — must set LIMIT/OFFSET from options explicitly.
|
|
// Mirror applyOptions behavior: Max=0 means no limit, not LIMIT 0.
|
|
rowidQuery := Select(s.tableName+".rowid").
|
|
From(s.tableName).
|
|
Join(s.ftsTable+" ON "+s.ftsTable+".rowid = "+s.tableName+".rowid AND "+s.ftsTable+" MATCH ?", s.matchExpr).
|
|
Where(Eq{s.tableName + ".missing": false}).
|
|
OrderBy(qualifiedOrderBys...)
|
|
if options.Max > 0 {
|
|
rowidQuery = rowidQuery.Limit(uint64(options.Max))
|
|
}
|
|
if options.Offset > 0 {
|
|
rowidQuery = rowidQuery.Offset(uint64(options.Offset))
|
|
}
|
|
|
|
// Library filter + musicFolderId must be applied here, before pagination.
|
|
if cfg.LibraryFilter != nil {
|
|
rowidQuery = cfg.LibraryFilter(rowidQuery)
|
|
} else {
|
|
rowidQuery = r.applyLibraryFilter(rowidQuery)
|
|
}
|
|
if options.Filters != nil {
|
|
rowidQuery = rowidQuery.Where(options.Filters)
|
|
}
|
|
|
|
rowidSQL, rowidArgs, err := rowidQuery.ToSql()
|
|
if err != nil {
|
|
return fmt.Errorf("building FTS rowid query: %w", err)
|
|
}
|
|
|
|
// Phase 2: strip LIMIT/OFFSET from sq (Phase 1 handled pagination),
|
|
// join on the ranked rowid set to hydrate with full columns.
|
|
sq = sq.RemoveLimit().RemoveOffset()
|
|
rankedSubquery := fmt.Sprintf(
|
|
"(SELECT rowid as _rid, row_number() OVER () AS _rn FROM (%s)) AS _ranked",
|
|
rowidSQL,
|
|
)
|
|
sq = sq.Join(rankedSubquery+" ON "+s.tableName+".rowid = _ranked._rid", rowidArgs...)
|
|
sq = sq.OrderBy("_ranked._rn")
|
|
return r.queryAll(sq, dest)
|
|
}
|
|
|
|
// qualifyOrderBy prepends tableName to a simple column name. Returns empty string for
|
|
// complex expressions (function calls, aggregations) that can't be used in Phase 1.
|
|
func qualifyOrderBy(tableName, orderBy string) string {
|
|
orderBy = strings.TrimSpace(orderBy)
|
|
if orderBy == "" || strings.ContainsAny(orderBy, "(,") {
|
|
return ""
|
|
}
|
|
parts := strings.Fields(orderBy)
|
|
if !strings.Contains(parts[0], ".") {
|
|
parts[0] = tableName + "." + parts[0]
|
|
}
|
|
return strings.Join(parts, " ")
|
|
}
|
|
|
|
// ftsQueryDegraded returns true when the FTS query lost significant discriminating
|
|
// content compared to the original input. This happens when special characters that
|
|
// are part of the entity name (e.g., "1+", "C++", "!!!", "C#") get stripped by FTS
|
|
// tokenization, leaving only very short/broad tokens. Also detects quoted phrases
|
|
// that would be degraded by FTS5's unicode61 tokenizer (e.g., "1+" → token "1").
|
|
func ftsQueryDegraded(original, ftsQuery string) bool {
|
|
original = strings.TrimSpace(original)
|
|
if original == "" || ftsQuery == "" {
|
|
return false
|
|
}
|
|
// Strip quotes from original for comparison — we want the raw content
|
|
stripped := strings.ReplaceAll(original, `"`, "")
|
|
// Extract the alphanumeric content from the original query
|
|
alphaNum := fts5PunctStrip.ReplaceAllString(stripped, "")
|
|
// If the original is entirely alphanumeric, nothing was stripped — not degraded
|
|
if len(alphaNum) == len(stripped) {
|
|
return false
|
|
}
|
|
// Check if all effective FTS tokens are very short (≤2 chars).
|
|
// Short tokens with prefix matching are too broad when special chars were stripped.
|
|
// For quoted phrases, extract the content and check the tokens inside.
|
|
tokens := strings.Fields(ftsQuery)
|
|
for _, t := range tokens {
|
|
t = strings.TrimSuffix(t, "*")
|
|
// Skip internal phrase placeholders
|
|
if strings.HasPrefix(t, "\x00") {
|
|
return false
|
|
}
|
|
// For OR groups from processPunctuatedWords (e.g., ("a ha" OR aha*)),
|
|
// the punctuated word was already handled meaningfully — not degraded.
|
|
if strings.HasPrefix(t, "(") {
|
|
return false
|
|
}
|
|
// For quoted phrases, check the tokens inside as FTS5 will tokenize them
|
|
if strings.HasPrefix(t, `"`) {
|
|
// Extract content between quotes
|
|
inner := strings.Trim(t, `"`)
|
|
innerAlpha := fts5PunctStrip.ReplaceAllString(inner, " ")
|
|
for _, it := range strings.Fields(innerAlpha) {
|
|
if len(it) > 2 {
|
|
return false
|
|
}
|
|
}
|
|
continue
|
|
}
|
|
if len(t) > 2 {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
// newFTSSearch creates an FTS5 search strategy. Falls back to LIKE search if the
|
|
// query produces no FTS tokens (e.g., punctuation-only like "!!!!!!!") or if FTS
|
|
// tokenization stripped significant content from the query (e.g., "1+" → "1*").
|
|
// Returns nil when the query produces no searchable tokens at all.
|
|
func newFTSSearch(tableName, query string) searchStrategy {
|
|
q := buildFTS5Query(query)
|
|
if q == "" || ftsQueryDegraded(query, q) {
|
|
// Fallback: try LIKE search with the raw query
|
|
cleaned := strings.TrimSpace(strings.ReplaceAll(query, `"`, ""))
|
|
if cleaned != "" {
|
|
log.Trace("Search using LIKE fallback for non-tokenizable query", "table", tableName, "query", cleaned)
|
|
return newLikeSearch(tableName, cleaned)
|
|
}
|
|
return nil
|
|
}
|
|
ftsTable := tableName + "_fts"
|
|
matchExpr := q
|
|
if cols, ok := ftsColumnFilters[tableName]; ok {
|
|
matchExpr = cols + " : (" + q + ")"
|
|
}
|
|
|
|
rankExpr := ftsTable + ".rank"
|
|
if weights, ok := ftsBM25Weights[tableName]; ok {
|
|
rankExpr = "bm25(" + ftsTable + ", " + weights + ")"
|
|
}
|
|
|
|
s := &ftsSearch{
|
|
tableName: tableName,
|
|
ftsTable: ftsTable,
|
|
matchExpr: matchExpr,
|
|
rankExpr: rankExpr,
|
|
}
|
|
log.Trace("Search using FTS5 backend", "table", tableName, "query", q, "filter", s)
|
|
return s
|
|
}
|