// navidrome/core/matcher/matcher.go

package matcher

import (
	"context"
	"fmt"
	"math"

	"github.com/Masterminds/squirrel"
	"github.com/navidrome/navidrome/conf"
	"github.com/navidrome/navidrome/core/agents"
	"github.com/navidrome/navidrome/model"
	"github.com/navidrome/navidrome/utils/str"
	"github.com/xrash/smetrics"
)

// Matcher matches agent song results to local library tracks.
// Every library lookup it performs goes through its DataStore.
type Matcher struct {
	// ds is the data store queried for the library's media files.
	ds model.DataStore
}

// New returns a Matcher that resolves songs against the given DataStore.
func New(ds model.DataStore) *Matcher {
	m := new(Matcher)
	m.ds = ds
	return m
}

// MatchSongs matches agent song results to local library tracks using a multi-phase
// matching algorithm that prioritizes accuracy over recall.
//
// # Algorithm Overview
//
// The algorithm matches songs from external agents (Last.fm, Deezer, etc.) to tracks in the
// local music library using four matching strategies in priority order:
//
//  1. Direct ID match: Songs with an ID field are matched directly to MediaFiles by ID
//  2. MusicBrainz Recording ID (MBID) match: Songs with MBID are matched to tracks with
//     matching mbz_recording_id
//  3. ISRC match: Songs with ISRC are matched to tracks with matching ISRC tag
//  4. Title+Artist fuzzy match: Remaining songs are matched using fuzzy string comparison
//     with metadata specificity scoring
//
// # Matching Priority
//
// When selecting the final result, matches are prioritized in order: ID > MBID > ISRC > Title+Artist.
// This ensures that more reliable identifiers take precedence over fuzzy text matching.
//
// # Fuzzy Matching Details
//
// For title+artist matching, the algorithm uses Jaro-Winkler similarity (threshold configurable
// via Matcher.FuzzyThreshold, default 85%). Matches are ranked by:
//
//  1. Title similarity (Jaro-Winkler score, 0.0-1.0)
//  2. Duration proximity (closer duration = higher score, 1.0 if unknown)
//  3. Preferred track flag (enabled by Matcher.PreferStarred; prioritized when the track is
//     starred or has rating >= 4)
//  4. Specificity level (0-5, based on metadata precision):
//     - Level 5: Title + Artist MBID + Album MBID (most specific)
//     - Level 4: Title + Artist MBID + Album name (fuzzy)
//     - Level 3: Title + Artist name + Album name (fuzzy)
//     - Level 2: Title + Artist MBID
//     - Level 1: Title + Artist name
//     - Level 0: Title only
//  5. Album similarity (Jaro-Winkler, as final tiebreaker)
//
// # Examples
//
// Example 1 - MBID Priority:
//
//	Agent returns: {Name: "Paranoid Android", MBID: "abc-123", Artist: "Radiohead"}
//	Library has: [
//	  {ID: "t1", Title: "Paranoid Android", MbzRecordingID: "abc-123"},
//	  {ID: "t2", Title: "Paranoid Android", Artist: "Radiohead"},
//	]
//	Result: t1 (MBID match takes priority over title+artist)
//
// Example 2 - ISRC Priority:
//
//	Agent returns: {Name: "Paranoid Android", ISRC: "GBAYE0000351", Artist: "Radiohead"}
//	Library has: [
//	  {ID: "t1", Title: "Paranoid Android", Tags: {isrc: ["GBAYE0000351"]}},
//	  {ID: "t2", Title: "Paranoid Android", Artist: "Radiohead"},
//	]
//	Result: t1 (ISRC match takes priority over title+artist)
//
// Example 3 - Specificity Ranking:
//
//	Agent returns: {Name: "Enjoy the Silence", Artist: "Depeche Mode", Album: "Violator"}
//	Library has: [
//	  {ID: "t1", Title: "Enjoy the Silence", Artist: "Depeche Mode", Album: "101"},           // Level 1
//	  {ID: "t2", Title: "Enjoy the Silence", Artist: "Depeche Mode", Album: "Violator"},      // Level 3
//	]
//	Result: t2 (Level 3 beats Level 1 due to album match)
//
// Example 4 - Fuzzy Title Matching:
//
//	Agent returns: {Name: "Bohemian Rhapsody", Artist: "Queen"}
//	Library has: {ID: "t1", Title: "Bohemian Rhapsody - Remastered", Artist: "Queen"}
//	With threshold=85%: Match succeeds (similarity ~0.87)
//	With threshold=100%: No match (not exact)
//
// # Parameters
//
//   - ctx: Context for database operations
//   - songs: Slice of agents.Song results from external providers
//   - count: Maximum number of matches to return
//
// # Returns
//
// Returns up to 'count' MediaFiles from the library that best match the input songs,
// preserving the original order from the agent. Songs that cannot be matched are skipped.
func (m *Matcher) MatchSongs(ctx context.Context, songs []agents.Song, count int) (model.MediaFiles, error) {
	// Nothing to match: skip the library lookups entirely.
	if len(songs) < 1 {
		return nil, nil
	}

	idHits, mbidHits, isrcHits, titleHits, loadErr := m.loadAllMatches(ctx, songs)
	if loadErr != nil {
		return nil, loadErr
	}

	// Pick one track per song by lookup priority, stopping once count tracks are collected.
	picked := m.selectBestMatchingSongs(songs, idHits, mbidHits, isrcHits, titleHits, count)
	return picked, nil
}

// MatchSongsIndexed resolves each agent song against the local library and reports the
// matches keyed by the song's position in the input slice. Positions without a match are
// left out of the map, so callers can line results up with their own input.
// An empty input yields a nil map and a nil error.
func (m *Matcher) MatchSongsIndexed(ctx context.Context, songs []agents.Song) (map[int]model.MediaFile, error) {
	if len(songs) < 1 {
		return nil, nil
	}

	idHits, mbidHits, isrcHits, titleHits, loadErr := m.loadAllMatches(ctx, songs)
	if loadErr != nil {
		return nil, loadErr
	}

	indexed := make(map[int]model.MediaFile, len(songs))
	for idx := range songs {
		mf, ok := findMatchingTrack(songs[idx], idHits, mbidHits, isrcHits, titleHits)
		if !ok {
			continue
		}
		indexed[idx] = mf
	}
	return indexed, nil
}

// loadAllMatches runs the four lookup phases in priority order (ID, MBID, ISRC,
// title+artist) and returns one match map per phase. Each phase receives the maps
// produced by the earlier phases so it can skip songs that are already matched.
// On the first failing phase all maps are returned as nil together with the wrapped error.
func (m *Matcher) loadAllMatches(ctx context.Context, songs []agents.Song) (byID, byMBID, byISRC, byTitle map[string]model.MediaFile, err error) {
	if byID, err = m.loadTracksByID(ctx, songs); err != nil {
		return nil, nil, nil, nil, fmt.Errorf("failed to load tracks by ID: %w", err)
	}
	if byMBID, err = m.loadTracksByMBID(ctx, songs, byID); err != nil {
		return nil, nil, nil, nil, fmt.Errorf("failed to load tracks by MBID: %w", err)
	}
	if byISRC, err = m.loadTracksByISRC(ctx, songs, byID, byMBID); err != nil {
		return nil, nil, nil, nil, fmt.Errorf("failed to load tracks by ISRC: %w", err)
	}
	if byTitle, err = m.loadTracksByTitleAndArtist(ctx, songs, byID, byMBID, byISRC); err != nil {
		return nil, nil, nil, nil, fmt.Errorf("failed to load tracks by title: %w", err)
	}
	return byID, byMBID, byISRC, byTitle, nil
}

// songMatchedIn reports whether any of the song's identifiers (ID, MBID, ISRC) already
// resolves to a track in at least one of the given match maps.
func songMatchedIn(s agents.Song, priorMatches ...map[string]model.MediaFile) bool {
	_, alreadyMatched := lookupByIdentifiers(s, priorMatches...)
	return alreadyMatched
}

// lookupByIdentifiers looks the song up in each map, in argument order, trying the
// song's ID, then its MBID, then its ISRC as the key. Empty identifiers are never used
// as keys, and an entry whose MediaFile has an empty ID is treated as absent.
// It returns the first usable entry, or a zero MediaFile and false when none exists.
func lookupByIdentifiers(s agents.Song, maps ...map[string]model.MediaFile) (model.MediaFile, bool) {
	ids := [...]string{s.ID, s.MBID, s.ISRC}
	for _, candidates := range maps {
		for _, id := range ids {
			if id == "" {
				continue
			}
			mf, ok := candidates[id]
			if !ok || mf.ID == "" {
				continue
			}
			return mf, true
		}
	}
	return model.MediaFile{}, false
}

// loadTracksByID queries the library for non-missing media files whose ID equals one of
// the songs' IDs and returns them keyed by media file ID. Songs without an ID are
// ignored; when no song carries an ID no query is issued and an empty map is returned.
// On a query error the (empty) map is returned together with the error.
func (m *Matcher) loadTracksByID(ctx context.Context, songs []agents.Song) (map[string]model.MediaFile, error) {
	var wanted []string
	for i := range songs {
		if id := songs[i].ID; id != "" {
			wanted = append(wanted, id)
		}
	}

	found := map[string]model.MediaFile{}
	if len(wanted) == 0 {
		return found, nil
	}

	filters := squirrel.And{
		squirrel.Eq{"media_file.id": wanted},
		squirrel.Eq{"missing": false},
	}
	tracks, err := m.ds.MediaFile(ctx).GetAll(model.QueryOptions{Filters: filters})
	if err != nil {
		return found, err
	}

	// Keep the first track returned for each ID.
	for _, mf := range tracks {
		if _, seen := found[mf.ID]; seen {
			continue
		}
		found[mf.ID] = mf
	}
	return found, nil
}

// loadTracksByMBID queries the library for non-missing media files whose
// mbz_recording_id equals the MBID of a song that is not already matched in
// priorMatches. The result is keyed by recording ID. When no song qualifies, no query
// is issued and an empty map is returned. On a query error the (empty) map is returned
// together with the error.
func (m *Matcher) loadTracksByMBID(ctx context.Context, songs []agents.Song, priorMatches ...map[string]model.MediaFile) (map[string]model.MediaFile, error) {
	var wanted []string
	for i := range songs {
		song := songs[i]
		if song.MBID == "" || songMatchedIn(song, priorMatches...) {
			continue
		}
		wanted = append(wanted, song.MBID)
	}

	found := map[string]model.MediaFile{}
	if len(wanted) == 0 {
		return found, nil
	}

	filters := squirrel.And{
		squirrel.Eq{"mbz_recording_id": wanted},
		squirrel.Eq{"missing": false},
	}
	tracks, err := m.ds.MediaFile(ctx).GetAll(model.QueryOptions{Filters: filters})
	if err != nil {
		return found, err
	}

	// Keep the first track returned for each recording ID; skip tracks without one.
	for _, mf := range tracks {
		recordingID := mf.MbzRecordingID
		if recordingID == "" {
			continue
		}
		if _, seen := found[recordingID]; seen {
			continue
		}
		found[recordingID] = mf
	}
	return found, nil
}

// loadTracksByISRC queries the library for non-missing media files tagged with the ISRC
// of a song that is not already matched in priorMatches. The result is keyed by every
// ISRC value found on the returned tracks. Only the first track seen for an ISRC is
// kept, so the sort order requested from the query decides which track wins when
// several share an ISRC. When no song qualifies, no query is issued and an empty map is
// returned. On a query error the (empty) map is returned together with the error.
func (m *Matcher) loadTracksByISRC(ctx context.Context, songs []agents.Song, priorMatches ...map[string]model.MediaFile) (map[string]model.MediaFile, error) {
	var wanted []string
	for i := range songs {
		song := songs[i]
		if song.ISRC == "" || songMatchedIn(song, priorMatches...) {
			continue
		}
		wanted = append(wanted, song.ISRC)
	}

	found := map[string]model.MediaFile{}
	if len(wanted) == 0 {
		return found, nil
	}

	opts := model.QueryOptions{
		Filters: squirrel.Eq{"missing": false},
		Sort:    "starred desc, rating desc, year asc, compilation asc",
	}
	tracks, err := m.ds.MediaFile(ctx).GetAllByTags(model.TagISRC, wanted, opts)
	if err != nil {
		return found, err
	}

	for _, mf := range tracks {
		for _, code := range mf.Tags.Values(model.TagISRC) {
			if _, seen := found[code]; seen {
				continue
			}
			found[code] = mf
		}
	}
	return found, nil
}

// songQuery represents a normalized query for matching a song to library tracks.
// It is built from an agents.Song by buildTitleQueries; the text fields hold the
// sanitized form of the agent's values, the MBID and duration fields are copied as-is.
type songQuery struct {
	// title is the agent song name after str.SanitizeFieldForSorting.
	title      string
	// artist is the agent artist name after str.SanitizeFieldForSortingNoArticle.
	artist     string
	// artistMBID is the agent's artist MBID, unmodified (may be empty).
	artistMBID string
	// album is the agent album name after str.SanitizeFieldForSorting (may be empty).
	album      string
	// albumMBID is the agent's album MBID, unmodified (may be empty).
	albumMBID  string
	// durationMs is the agent-reported duration in milliseconds; 0 is treated as unknown
	// by durationProximity.
	durationMs uint32
}

// matchScore collects the signals used to rank one candidate track against a query.
// Candidates are compared with betterThan, which consults the fields in a fixed
// priority order rather than blending them into a single number.
type matchScore struct {
	// titleSimilarity is the title similarity between query and track.
	titleSimilarity float64
	// durationProximity is higher the closer the track duration is to the query's.
	durationProximity float64
	// preferredMatch marks a track that should win ties at this point in the ranking.
	preferredMatch bool
	// albumSimilarity is the album similarity; it is only the final tiebreaker.
	albumSimilarity float64
	// specificityLevel says how much of the query metadata the track agrees with.
	specificityLevel int
}

// betterThan reports whether s ranks strictly above other. The first field on which
// the two scores differ decides, checked in this order: title similarity, duration
// proximity, preferred flag, specificity level. If all of those are equal the album
// similarity decides, so two identical scores never beat each other.
func (s matchScore) betterThan(other matchScore) bool {
	switch {
	case s.titleSimilarity != other.titleSimilarity:
		return s.titleSimilarity > other.titleSimilarity
	case s.durationProximity != other.durationProximity:
		return s.durationProximity > other.durationProximity
	case s.preferredMatch != other.preferredMatch:
		// They differ, so s wins exactly when it is the preferred one.
		return s.preferredMatch
	case s.specificityLevel != other.specificityLevel:
		return s.specificityLevel > other.specificityLevel
	default:
		return s.albumSimilarity > other.albumSimilarity
	}
}

// sanitizedTrack holds pre-sanitized fields for a media file, avoiding redundant sanitization
// when the same track is scored against multiple queries in the inner loop. The `mf` field
// is a pointer to avoid copying the large MediaFile struct into each entry of the per-artist
// sanitized slice.
type sanitizedTrack struct {
	// mf points at the media file the sanitized fields were derived from.
	mf     *model.MediaFile
	// title is mf.Title after str.SanitizeFieldForSorting.
	title  string
	// artist is mf.Artist after str.SanitizeFieldForSortingNoArticle.
	artist string
	// album is mf.Album after str.SanitizeFieldForSorting.
	album  string
}

// newSanitizedTrack wraps mf together with the sanitized forms of its title, artist and
// album. The media file is referenced, not copied, so mf must be non-nil.
func newSanitizedTrack(mf *model.MediaFile) sanitizedTrack {
	var st sanitizedTrack
	st.mf = mf
	st.title = str.SanitizeFieldForSorting(mf.Title)
	st.artist = str.SanitizeFieldForSortingNoArticle(mf.Artist)
	st.album = str.SanitizeFieldForSorting(mf.Album)
	return st
}

// computeSpecificityLevel grades how much of the query's metadata the track agrees with.
// The track's title, artist, and album fields must be pre-sanitized.
//
// Levels, highest first:
//   - 5: artist MBID and album MBID both match
//   - 4: artist MBID matches and the album names are similar enough
//   - 3: artist names are equal and the album names are similar enough
//   - 2: artist MBID matches
//   - 1: artist names are equal
//   - 0: only the sanitized titles are equal
//   - -1: none of the above holds
//
// "Similar enough" means similarityRatio of the two album names is at least
// albumThreshold. A query field that is empty never counts as a match.
func computeSpecificityLevel(q songQuery, t sanitizedTrack, albumThreshold float64) int {
	artistIDMatch := q.artistMBID != "" && t.mf.MbzArtistID == q.artistMBID
	artistNameMatch := q.artist != "" && t.artist == q.artist

	if artistIDMatch && q.albumMBID != "" && t.mf.MbzAlbumID == q.albumMBID {
		return 5
	}

	// Levels 4 and 3 share the fuzzy album check; it is only evaluated when the query
	// names an album and some form of artist agreement exists.
	if q.album != "" && (artistIDMatch || artistNameMatch) &&
		similarityRatio(t.album, q.album) >= albumThreshold {
		if artistIDMatch {
			return 4
		}
		return 3
	}

	switch {
	case artistIDMatch:
		return 2
	case artistNameMatch:
		return 1
	case t.title == q.title:
		return 0
	}
	return -1
}

// loadTracksByTitleAndArtist fuzzy-matches the songs that are not already matched in
// priorMatches. Queries are grouped by sanitized artist name and the library is queried
// once per artist; queries with an empty artist are not looked up at all. The result is
// keyed by "<sanitized title>|<sanitized artist>", and the first match stored for a key
// is kept.
//
// The similarity threshold comes from conf.Server.Matcher.FuzzyThreshold, read as a
// percentage. The returned error is always nil in the current implementation.
func (m *Matcher) loadTracksByTitleAndArtist(ctx context.Context, songs []agents.Song, priorMatches ...map[string]model.MediaFile) (map[string]model.MediaFile, error) {
	found := map[string]model.MediaFile{}

	pending := m.buildTitleQueries(songs, priorMatches...)
	if len(pending) == 0 {
		return found, nil
	}

	minSimilarity := float64(conf.Server.Matcher.FuzzyThreshold) / 100.0

	grouped := map[string][]songQuery{}
	for _, q := range pending {
		if q.artist == "" {
			continue
		}
		grouped[q.artist] = append(grouped[q.artist], q)
	}

	for artistName, group := range grouped {
		opts := model.QueryOptions{
			Filters: squirrel.And{
				squirrel.Eq{"order_artist_name": artistName},
				squirrel.Eq{"missing": false},
			},
			Sort: "starred desc, rating desc, year asc, compilation asc",
		}
		tracks, err := m.ds.MediaFile(ctx).GetAll(opts)
		if err != nil {
			// NOTE(review): a failed query only drops this artist's songs and the error
			// is not reported — presumably deliberate best-effort; confirm.
			continue
		}

		// Sanitize each track once; it may be scored against several queries below.
		candidates := make([]sanitizedTrack, 0, len(tracks))
		for i := range tracks {
			candidates = append(candidates, newSanitizedTrack(&tracks[i]))
		}

		for _, q := range group {
			mf, ok := m.findBestMatch(q, candidates, minSimilarity)
			if !ok {
				continue
			}
			key := q.title + "|" + q.artist
			if _, taken := found[key]; taken {
				continue
			}
			found[key] = mf
		}
	}
	return found, nil
}

// durationProximity scores how close a track's duration (in seconds) is to the target
// duration (in milliseconds). The score is 1/(1+d), where d is the absolute difference
// in seconds: 1.0 for an exact match, falling towards 0 as the gap grows.
// A target of 0 means "unknown" and always scores 1.0.
func durationProximity(durationMs uint32, mediaFileDurationSec float32) float64 {
	if durationMs == 0 {
		return 1.0
	}
	targetSec := float64(durationMs) / 1000.0
	gapSec := math.Abs(targetSec - float64(mediaFileDurationSec))
	return 1.0 / (1.0 + gapSec)
}

// findBestMatch picks, among the pre-sanitized tracks, the one that ranks highest for
// the query. Tracks whose title similarity is below threshold are not considered. The
// remaining ones are scored (title similarity, duration proximity, preferred flag,
// specificity, album similarity) and compared with matchScore.betterThan; on a full tie
// the earlier track in the slice wins. The same threshold is also used as the album
// threshold for the specificity level.
//
// It returns a copy of the winning media file and true, or a zero MediaFile and false
// when no track reaches the threshold.
func (m *Matcher) findBestMatch(q songQuery, sanitizedTracks []sanitizedTrack, threshold float64) (model.MediaFile, bool) {
	var winner *model.MediaFile
	// Seed below any reachable title similarity so the first qualifying track wins.
	topScore := matchScore{titleSimilarity: -1}

	for i := range sanitizedTracks {
		cand := sanitizedTracks[i]

		titleSim := similarityRatio(q.title, cand.title)
		if titleSim < threshold {
			continue
		}

		// Album similarity stays 0 when the query names no album.
		albumSim := 0.0
		if q.album != "" {
			albumSim = similarityRatio(q.album, cand.album)
		}

		candScore := matchScore{
			titleSimilarity:   titleSim,
			durationProximity: durationProximity(q.durationMs, cand.mf.Duration),
			preferredMatch:    conf.Server.Matcher.PreferStarred && isPreferredTrack(cand.mf),
			albumSimilarity:   albumSim,
			specificityLevel:  computeSpecificityLevel(q, cand, threshold),
		}
		if !candScore.betterThan(topScore) {
			continue
		}
		topScore = candScore
		winner = cand.mf
	}

	if winner == nil {
		return model.MediaFile{}, false
	}
	return *winner, true
}

// isPreferredTrack reports whether the track is starred or has a rating of 4 or more.
func isPreferredTrack(mf *model.MediaFile) bool {
	if mf.Starred {
		return true
	}
	return mf.Rating >= 4
}

// buildTitleQueries turns every song that is not already matched in priorMatches into
// a songQuery: name, artist and album are sanitized, MBIDs and duration are copied
// unchanged. Input order is preserved; the result is nil when no song needs a query.
func (m *Matcher) buildTitleQueries(songs []agents.Song, priorMatches ...map[string]model.MediaFile) []songQuery {
	var pending []songQuery
	for i := range songs {
		song := songs[i]
		if songMatchedIn(song, priorMatches...) {
			continue
		}

		var q songQuery
		q.title = str.SanitizeFieldForSorting(song.Name)
		q.artist = str.SanitizeFieldForSortingNoArticle(song.Artist)
		q.artistMBID = song.ArtistMBID
		q.album = str.SanitizeFieldForSorting(song.Album)
		q.albumMBID = song.AlbumMBID
		q.durationMs = song.Duration
		pending = append(pending, q)
	}
	return pending
}

// selectBestMatchingSongs walks the songs in input order and collects the library track
// each one resolves to (priority: ID > MBID > ISRC > title+artist), stopping as soon as
// count tracks have been collected. Songs without a match are skipped.
//
// A library track is normally emitted once: if a later, different song resolves to a
// track that was already emitted, that song is skipped. A later song that is identical
// to the one that first produced the track emits it again.
func (m *Matcher) selectBestMatchingSongs(songs []agents.Song, byID, byMBID, byISRC, byTitleArtist map[string]model.MediaFile, count int) model.MediaFiles {
	selected := make(model.MediaFiles, 0, len(songs))
	// firstClaim remembers, per track ID, the song that first resolved to that track.
	firstClaim := make(map[string]agents.Song, len(songs))

	for i := 0; i < len(songs) && len(selected) != count; i++ {
		song := songs[i]

		mf, ok := findMatchingTrack(song, byID, byMBID, byISRC, byTitleArtist)
		if !ok {
			continue
		}

		owner, claimed := firstClaim[mf.ID]
		switch {
		case !claimed:
			firstClaim[mf.ID] = song
		case owner != song:
			// A different song already produced this track; skip the duplicate.
			continue
		}

		selected = append(selected, mf)
	}
	return selected
}

// findMatchingTrack resolves one song against the match maps. Identifier lookups come
// first (byID, then byMBID, then byISRC); if none hits, the song's sanitized
// "<title>|<artist>" key is looked up in byTitleArtist. It returns a zero MediaFile and
// false when nothing matches.
func findMatchingTrack(t agents.Song, byID, byMBID, byISRC, byTitleArtist map[string]model.MediaFile) (model.MediaFile, bool) {
	mf, ok := lookupByIdentifiers(t, byID, byMBID, byISRC)
	if ok {
		return mf, true
	}

	title := str.SanitizeFieldForSorting(t.Name)
	artist := str.SanitizeFieldForSortingNoArticle(t.Artist)
	mf, ok = byTitleArtist[title+"|"+artist]
	if !ok {
		return model.MediaFile{}, false
	}
	return mf, true
}

// similarityRatio returns the similarity of two strings. Equal strings (including two
// empty ones) score 1.0 and a single empty string scores 0.0, both without running the
// metric; every other pair is scored with smetrics.JaroWinkler using a boost threshold
// of 0.7 and a prefix size of 4.
func similarityRatio(a, b string) float64 {
	switch {
	case a == b:
		return 1.0
	case a == "" || b == "":
		return 0.0
	default:
		return smetrics.JaroWinkler(a, b, 0.7, 4)
	}
}