feat: add TTML lyrics support with token-level karaoke and translation/pronunciation layers

Add a full TTML (Timed Text Markup Language) sidecar lyrics parser that extracts word/syllable-level timing from <span> elements, plus translation and pronunciation (transliteration) tracks from Apple Music TTML metadata sections. Backend changes: - TTML parser (core/lyrics/ttml.go) with support for all TTML time formats, nested timing contexts, and bare decimal second offsets - Translation/pronunciation tracks resolved via key-based metadata linking - Line timing hydration from token-level start/end values - 'kind' field added to Lyrics model and StructuredLyric API response (main/translation/pronunciation) - 'tokenLine' array in API response for word-level timing data - UTF-8 BOM and UTF-16 LE encoding support for TTML files - Fix for ambiguous time resolution in pronunciation spans (pre-1-minute) Frontend changes: - KaraokeLyricsOverlay rewritten with scrollable multi-line layout, word-level wipe highlighting with eased alpha transitions, rAF-driven playback clock with drift correction - Inline translation (above) and pronunciation (below) each main line, with smart filtering to hide redundant lines (same normalized text) - TR/PR toggle buttons and layer selection via selectLyricLayers() - Click-to-seek: click any lyric line to jump to that position - Customization popover with font-size sliders and color presets for each line type (TR/Default/PR), persisted to localStorage - Smooth font-size transition between active and inactive lines - Resizable overlay height via drag handle - lyrics.js: resolveKaraokeTokenWindow, buildSyntheticWordTokens, findLayerLineIndexForMain, token sorting, collapsed timing detection API extension (non-breaking, additive): - tokenLine[].token[] provides per-word start/end timing (ms) - tokenLine[].index maps back to the corresponding line[] entry - kind field: 'main', 'translation', 'pronunciation' - Clients ignoring tokenLine/kind continue to work unchanged
2026-05-03 06:51:16 +00:00 · 2026-02-20 16:54:45 +02:00 · 2026-02-20 16:54:45 +02:00 · c77e0de976
commit c77e0de976
parent ccee33f474
30 changed files with 4644 additions and 59 deletions
--- a/README.md
+++ b/README.md
@ -52,6 +52,7 @@ A share of the revenue helps fund the development of Navidrome at no additional
 - **Multi-platform**, runs on macOS, Linux and Windows. **Docker** images are also provided
 - Ready to use binaries for all major platforms, including **Raspberry Pi**
 - Automatically **monitors your library** for changes, importing new files and reloading new metadata 
 - Supports synchronized lyrics from sidecar **.lrc** and **.ttml** files (via `lyricspriority`)
 - **Themeable**, modern and responsive **Web interface** based on [Material UI](https://material-ui.com)
 - **Compatible** with all Subsonic/Madsonic/Airsonic [clients](https://www.navidrome.org/docs/overview/#apps)
 - **Transcoding** on the fly. Can be set per user/player. **Opus encoding is supported**
--- a/conf/configuration.go
+++ b/conf/configuration.go
@ -677,7 +677,7 @@ func setViperDefaults() {
 	viper.SetDefault("coverartquality", 75)
 	viper.SetDefault("artistartpriority", "artist.*, album/artist.*, external")
 	viper.SetDefault("discartpriority", "disc*.*, cd*.*, cover.*, folder.*, front.*, discsubtitle, embedded")
-	viper.SetDefault("lyricspriority", ".lrc,.txt,embedded")
+	viper.SetDefault("lyricspriority", ".lrc,.ttml,.txt,embedded")
 	viper.SetDefault("enablegravatar", false)
 	viper.SetDefault("enablefavourites", true)
 	viper.SetDefault("enablestarrating", true)
--- a/core/lyrics/lyrics_test.go
+++ b/core/lyrics/lyrics_test.go
@ -44,6 +44,35 @@ var _ = Describe("sources", func() {
 		},
 	}
 	ttmlLyrics := model.LyricList{
 		model.Lyrics{
 			Kind: "main",
 			Lang: "eng",
 			Line: []model.Line{
 				{
 					Start: gg.P(int64(18800)),
 					Value: "We're no strangers to love",
 				},
 				{
 					Start: gg.P(int64(22800)),
 					Value: "You know the rules and so do I",
 				},
 			},
 			Synced: true,
 		},
 		model.Lyrics{
 			Kind: "main",
 			Lang: "por",
 			Line: []model.Line{
 				{
 					Start: gg.P(int64(18800)),
 					Value: "Nao somos estranhos ao amor",
 				},
 			},
 			Synced: true,
 		},
 	}
 	unsyncedLyrics := model.LyricList{
 		model.Lyrics{
 			Lang: "xxx",
@ -80,7 +109,8 @@ var _ = Describe("sources", func() {
 	},
 		Entry("embedded > lrc > txt", "embedded,.lrc,.txt", embeddedLyrics),
 		Entry("lrc > embedded > txt", ".lrc,embedded,.txt", syncedLyrics),
-		Entry("txt > lrc > embedded", ".txt,.lrc,embedded", unsyncedLyrics))
+		Entry("txt > lrc > embedded", ".txt,.lrc,embedded", unsyncedLyrics),
 		Entry("ttml > lrc > embedded", ".ttml,.lrc,embedded", ttmlLyrics))
 	Context("Errors", func() {
 		var RegularUserContext = XContext
--- a/core/lyrics/sources.go
+++ b/core/lyrics/sources.go
@ -5,6 +5,7 @@ import (
 	"errors"
 	"os"
 	"path"
 	"strings"
 	"github.com/navidrome/navidrome/log"
 	"github.com/navidrome/navidrome/model"
@ -36,18 +37,31 @@ func fromExternalFile(ctx context.Context, mf *model.MediaFile, suffix string) (
 		return nil, err
 	}
-	lyrics, err := model.ToLyrics("xxx", string(contents))
+	var list model.LyricList
-	if err != nil {
+	if strings.EqualFold(suffix, ".ttml") {
-		log.Error(ctx, "error parsing lyric external file", "path", externalLyric, err)
+		list, err = parseTTML(contents)
-		return nil, err
+		if err != nil {
-	} else if lyrics == nil {
+			log.Error(ctx, "error parsing ttml external file", "path", externalLyric, err)
 			return nil, err
 		}
 	} else {
 		lyrics, err := model.ToLyrics("xxx", string(contents))
 		if err != nil {
 			log.Error(ctx, "error parsing lyric external file", "path", externalLyric, err)
 			return nil, err
 		}
 		if lyrics != nil {
 			list = model.LyricList{*lyrics}
 		}
 	}
 	if len(list) == 0 {
 		log.Trace(ctx, "empty lyrics from external file", "path", externalLyric)
 		return nil, nil
 	}
 	log.Trace(ctx, "retrieved lyrics from external file", "path", externalLyric)
-
+	return list, nil
 	return model.LyricList{*lyrics}, nil
 }
 // fromPlugin attempts to load lyrics from a plugin with the given name.
--- a/core/lyrics/sources_test.go
+++ b/core/lyrics/sources_test.go
@ -109,6 +109,41 @@ var _ = Describe("sources", func() {
 			}))
 		})
 		It("should return synchronized multilingual lyrics from a TTML file", func() {
 			mf := model.MediaFile{Path: "tests/fixtures/test.mp3"}
 			lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
 			Expect(err).To(BeNil())
 			Expect(lyrics).To(Equal(model.LyricList{
 				{
 					Kind: "main",
 					Lang: "eng",
 					Line: []model.Line{
 						{
 							Start: gg.P(int64(18800)),
 							Value: "We're no strangers to love",
 						},
 						{
 							Start: gg.P(int64(22800)),
 							Value: "You know the rules and so do I",
 						},
 					},
 					Synced: true,
 				},
 				{
 					Kind: "main",
 					Lang: "por",
 					Line: []model.Line{
 						{
 							Start: gg.P(int64(18800)),
 							Value: "Nao somos estranhos ao amor",
 						},
 					},
 					Synced: true,
 				},
 			}))
 		})
 		It("should handle LRC files with UTF-8 BOM marker (issue #4631)", func() {
 			// The function looks for <basePath-without-ext><suffix>, so we need to pass
 			// a MediaFile with .mp3 path and look for .lrc suffix
@ -142,5 +177,33 @@ var _ = Describe("sources", func() {
 			Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(22801))))
 			Expect(lyrics[0].Line[1].Value).To(Equal("You know the rules and so do I"))
 		})
 		It("should handle TTML files with UTF-8 BOM marker", func() {
 			mf := model.MediaFile{Path: "tests/fixtures/bom-test.mp3"}
 			lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
 			Expect(err).To(BeNil())
 			Expect(lyrics).To(HaveLen(1))
 			Expect(lyrics[0].Kind).To(Equal("main"))
 			Expect(lyrics[0].Synced).To(BeTrue())
 			Expect(lyrics[0].Line).To(HaveLen(1))
 			Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(0))))
 			Expect(lyrics[0].Line[0].Value).To(Equal("BOM test line"))
 		})
 		It("should handle UTF-16 LE encoded TTML files", func() {
 			mf := model.MediaFile{Path: "tests/fixtures/bom-utf16-test.mp3"}
 			lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
 			Expect(err).To(BeNil())
 			Expect(lyrics).To(HaveLen(1))
 			Expect(lyrics[0].Kind).To(Equal("main"))
 			Expect(lyrics[0].Synced).To(BeTrue())
 			Expect(lyrics[0].Line).To(HaveLen(2))
 			Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(18800))))
 			Expect(lyrics[0].Line[0].Value).To(Equal("UTF16 line one"))
 			Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(22801))))
 			Expect(lyrics[0].Line[1].Value).To(Equal("UTF16 line two"))
 		})
 	})
 })
--- a/core/lyrics/sources_ttml_test.go
+++ b/core/lyrics/sources_ttml_test.go
@ -0,0 +1,92 @@
 package lyrics
 import (
 	"context"
 	"os"
 	"path/filepath"
 	"testing"
 	"github.com/navidrome/navidrome/model"
 )
 // TestFromExternalFileTTML resolves the .ttml sidecar for the test.mp3 fixture
 // and checks the number of tracks, the first track's language and line count,
 // and the start time of its first line.
 func TestFromExternalFileTTML(t *testing.T) {
 	mediaFile := model.MediaFile{Path: fixturePath("test.mp3")}
 	tracks, err := fromExternalFile(context.Background(), &mediaFile, ".ttml")
 	if err != nil {
 		t.Fatalf("fromExternalFile returned error: %v", err)
 	}
 	if got := len(tracks); got != 2 {
 		t.Fatalf("expected 2 lyric tracks, got %d", got)
 	}
 	english := tracks[0]
 	if english.Lang != "eng" {
 		t.Fatalf("expected first language 'eng', got %q", english.Lang)
 	}
 	if got := len(english.Line); got != 2 {
 		t.Fatalf("expected 2 english lines, got %d", got)
 	}
 	if firstStart := english.Line[0].Start; firstStart == nil || *firstStart != 18800 {
 		t.Fatalf("expected first english line start to be 18800, got %v", firstStart)
 	}
 }
 // TestFromExternalFileTTMLWithUTF8BOM checks that the bom-test.ttml fixture
 // yields a single synced track with one line starting at 0 ms.
 func TestFromExternalFileTTMLWithUTF8BOM(t *testing.T) {
 	mediaFile := model.MediaFile{Path: fixturePath("bom-test.ttml")}
 	tracks, err := fromExternalFile(context.Background(), &mediaFile, ".ttml")
 	if err != nil {
 		t.Fatalf("fromExternalFile returned error: %v", err)
 	}
 	if got := len(tracks); got != 1 {
 		t.Fatalf("expected 1 lyric track, got %d", got)
 	}
 	track := tracks[0]
 	if !track.Synced {
 		t.Fatal("expected BOM TTML lyrics to be synced")
 	}
 	if got := len(track.Line); got != 1 {
 		t.Fatalf("expected 1 lyric line, got %d", got)
 	}
 	if firstStart := track.Line[0].Start; firstStart == nil || *firstStart != 0 {
 		t.Fatalf("expected first line start 0, got %v", firstStart)
 	}
 }
 // TestFromExternalFileTTMLUTF16 checks that the bom-utf16-test.ttml fixture
 // yields a single synced track whose two lines start at 18800 ms and 22801 ms.
 func TestFromExternalFileTTMLUTF16(t *testing.T) {
 	mediaFile := model.MediaFile{Path: fixturePath("bom-utf16-test.ttml")}
 	tracks, err := fromExternalFile(context.Background(), &mediaFile, ".ttml")
 	if err != nil {
 		t.Fatalf("fromExternalFile returned error: %v", err)
 	}
 	if got := len(tracks); got != 1 {
 		t.Fatalf("expected 1 lyric track, got %d", got)
 	}
 	track := tracks[0]
 	if !track.Synced {
 		t.Fatal("expected UTF16 TTML lyrics to be synced")
 	}
 	if got := len(track.Line); got != 2 {
 		t.Fatalf("expected 2 lyric lines, got %d", got)
 	}
 	if firstStart := track.Line[0].Start; firstStart == nil || *firstStart != 18800 {
 		t.Fatalf("expected first line start 18800, got %v", firstStart)
 	}
 	if secondStart := track.Line[1].Start; secondStart == nil || *secondStart != 22801 {
 		t.Fatalf("expected second line start 22801, got %v", secondStart)
 	}
 }
 // fixturePath returns the first existing location of the named fixture. It
 // tries tests/fixtures relative to the working directory, then the same
 // directory two levels up. When neither exists, the working-directory-relative
 // path is returned anyway.
 func fixturePath(name string) string {
 	local := filepath.Join("tests", "fixtures", name)
 	twoLevelsUp := filepath.Join("..", "..", "tests", "fixtures", name)
 	for _, location := range [...]string{local, twoLevelsUp} {
 		_, statErr := os.Stat(location)
 		if statErr == nil {
 			return location
 		}
 	}
 	return local
 }
--- a/core/lyrics/ttml.go
+++ b/core/lyrics/ttml.go
@ -0,0 +1,886 @@
 package lyrics
 import (
 	"bytes"
 	"encoding/xml"
 	"errors"
 	"io"
 	"math"
 	"regexp"
 	"sort"
 	"strconv"
 	"strings"
 	"github.com/navidrome/navidrome/log"
 	"github.com/navidrome/navidrome/model"
 	"github.com/navidrome/navidrome/utils/str"
 )
 const (
 	// Initial timing parameters used by parseTTML; updateTimingParams may
 	// replace them with values read from the <tt> element's attributes.
 	defaultTTMLFrameRate    = 30.0
 	defaultTTMLSubFrameRate = 1.0
 	defaultTTMLTickRate     = 1.0
 	// Values stored in model.Lyrics.Kind for the tracks produced by the parser.
 	ttmlLyricKindMain          = "main"
 	ttmlLyricKindTranslation   = "translation"
 	ttmlLyricKindPronunciation = "pronunciation"
 )
 // offsetTimeRegex matches a lower-cased offset time: a non-negative decimal
 // number followed by one unit suffix (h, m, s, ms, f or t).
 var offsetTimeRegex = regexp.MustCompile(`^([0-9]+(?:\.[0-9]+)?)(h|m|s|ms|f|t)$`)
 // xmlEncodingRegex matches an XML declaration that carries an encoding
 // attribute, capturing the text before and after that attribute so the
 // declaration can be rewritten with a different encoding.
 var xmlEncodingRegex = regexp.MustCompile(`(?i)<\?xml([^>]*?)encoding\s*=\s*["'][^"']+["']([^>]*)\?>`)
 // ttmlTimeKind classifies how a parsed time value relates to its parent.
 type ttmlTimeKind int
 const (
 	// ttmlTimeAbsolute is reported for colon-separated clock and frame times.
 	ttmlTimeAbsolute ttmlTimeKind = iota
 	// ttmlTimeOffset is reported for unit-suffixed values such as "2s" or "45t".
 	ttmlTimeOffset
 	// ttmlTimeAmbiguous is reported for bare decimal values with no unit;
 	// resolveTTMLTime decides whether to treat them as absolute or offset.
 	ttmlTimeAmbiguous
 )
 // ttmlTimingParams holds the rates used to convert frame ("f") and tick ("t")
 // based time expressions into seconds.
 type ttmlTimingParams struct {
 	frameRate    float64
 	subFrameRate float64
 	tickRate     float64
 }
 // ttmlTimingContext is the state inherited from ancestor elements while
 // walking the document. begin and end are in milliseconds and are only
 // meaningful when the matching has* flag is set.
 type ttmlTimingContext struct {
 	lang     string
 	role     string
 	begin    int64
 	hasBegin bool
 	end      int64
 	hasEnd   bool
 	// invalid is set when one of the element's own timing attributes could not be parsed.
 	invalid  bool
 }
 // ttmlLineRef records a keyed main line together with its position among all
 // main lines, so metadata entries can be linked to it.
 type ttmlLineRef struct {
 	order int
 	line  model.Line
 }
 // ttmlMetadataEntry is one <text for="..."> item from a translation or
 // transliteration track. seq is its insertion order across all metadata entries.
 type ttmlMetadataEntry struct {
 	key  string
 	line model.Line
 	seq  int
 }
 // ttmlResolvedMetadataLine is a metadata entry matched to a main line; order
 // comes from the main line and seq from the metadata entry.
 type ttmlResolvedMetadataLine struct {
 	order int
 	seq   int
 	line  model.Line
 }
 // ttmlParser accumulates the lines found while streaming through a TTML document.
 type ttmlParser struct {
 	decoder *xml.Decoder
 	params  ttmlTimingParams
 	// Main lines grouped by language; mainLangOrder keeps first-seen order.
 	mainLangOrder   []string
 	mainLinesByLang map[string][]model.Line
 	// Keyed main lines, and the running counter that gives each main line its order.
 	mainLineRefsByKey map[string]ttmlLineRef
 	mainLineOrder     int
 	// Translation entries grouped by language, in first-seen language order.
 	translationLangOrder   []string
 	translationEntriesByLg map[string][]ttmlMetadataEntry
 	// Pronunciation entries grouped by language, in first-seen language order.
 	pronunciationLangOrder   []string
 	pronunciationEntriesByLg map[string][]ttmlMetadataEntry
 	// metadataSeq is the next sequence number handed to a metadata entry.
 	metadataSeq int
 }
 // parseTTML decodes a TTML document and returns its lyric tracks: main tracks
 // first, then translation tracks, then pronunciation tracks. Any decoder error
 // other than io.EOF aborts parsing and is returned as-is.
 func parseTTML(contents []byte) (model.LyricList, error) {
 	// Force the declared encoding in the XML prolog to UTF-8 before decoding
 	// (presumably because no CharsetReader is configured on the decoder).
 	normalized := xmlEncodingRegex.ReplaceAll(contents, []byte(`<?xml$1encoding="UTF-8"$2?>`))
 	defaults := ttmlTimingParams{
 		frameRate:    defaultTTMLFrameRate,
 		subFrameRate: defaultTTMLSubFrameRate,
 		tickRate:     defaultTTMLTickRate,
 	}
 	parser := ttmlParser{
 		decoder:                  xml.NewDecoder(bytes.NewReader(normalized)),
 		params:                   defaults,
 		mainLinesByLang:          make(map[string][]model.Line),
 		mainLineRefsByKey:        make(map[string]ttmlLineRef),
 		translationEntriesByLg:   make(map[string][]ttmlMetadataEntry),
 		pronunciationEntriesByLg: make(map[string][]ttmlMetadataEntry),
 	}
 	// Elements without an xml:lang ancestor fall back to the "xxx" language.
 	rootCtx := ttmlTimingContext{lang: "xxx"}
 	for {
 		tok, err := parser.decoder.Token()
 		switch {
 		case errors.Is(err, io.EOF):
 			return parser.toLyricList(), nil
 		case err != nil:
 			return nil, err
 		}
 		element, isStart := tok.(xml.StartElement)
 		if !isStart {
 			continue
 		}
 		if err := parser.parseElement(element, rootCtx); err != nil {
 			return nil, err
 		}
 	}
 }
 // parseElement consumes one element (whose start tag has already been read)
 // together with all of its descendants, up to its matching end tag.
 //
 //   - <tt> additionally updates the timing parameters from its attributes.
 //   - <translation> and <transliteration> are handed to parseMetadataTrack.
 //   - <p> is parsed as one lyric line and stored as a main line.
 //   - Any other element is treated as a container and traversed recursively.
 //
 // Element names are compared case-insensitively by local name. Decoder errors
 // are returned unchanged.
 func (p *ttmlParser) parseElement(start xml.StartElement, parent ttmlTimingContext) error {
 	local := strings.ToLower(start.Name.Local)
 	if local == "tt" {
 		p.updateTimingParams(start.Attr)
 	}
 	switch local {
 	case "translation":
 		return p.parseMetadataTrack(start, parent, ttmlLyricKindTranslation)
 	case "transliteration":
 		return p.parseMetadataTrack(start, parent, ttmlLyricKindPronunciation)
 	}
 	ctx := p.childContext(start.Attr, parent)
 	if local == "p" {
 		// The paragraph is always consumed, even when it is discarded below,
 		// so the decoder ends up positioned after its end tag.
 		lineText, tokens, err := p.parseParagraph(ctx)
 		if err != nil {
 			return err
 		}
 		// Skip lines with unparsable timing or no text.
 		if ctx.invalid || lineText == "" {
 			return nil
 		}
 		parsedLine := model.Line{Value: lineText}
 		if ctx.hasBegin {
 			startMs := ctx.begin
 			parsedLine.Start = &startMs
 		}
 		if ctx.hasEnd {
 			endMs := ctx.end
 			parsedLine.End = &endMs
 		}
 		if len(tokens) > 0 {
 			parsedLine.Token = tokens
 		}
 		// Fill in a missing line start/end from the token timings.
 		parsedLine = hydrateLineTimingFromTokens(parsedLine)
 		// The optional key lets metadata entries refer back to this line.
 		lineKey, _ := attrValue(start.Attr, "key")
 		p.addMainLine(ctx.lang, lineKey, parsedLine)
 		return nil
 	}
 	for {
 		token, err := p.decoder.Token()
 		if err != nil {
 			return err
 		}
 		switch t := token.(type) {
 		case xml.StartElement:
 			nextParent := ctx
 			if ctx.invalid {
 				// Best effort: ignore invalid timing in container elements, and
 				// continue traversing descendants with parent context.
 				nextParent = parent
 			}
 			if err := p.parseElement(t, nextParent); err != nil {
 				return err
 			}
 		case xml.EndElement:
 			// Nested elements are consumed by the recursive call above, so a
 			// matching name here closes this element.
 			if strings.EqualFold(t.Name.Local, start.Name.Local) {
 				return nil
 			}
 		}
 	}
 }
 // parseMetadataTrack consumes a translation or transliteration element up to
 // its end tag. Every <text> child is parsed into a metadata entry and recorded
 // under the given kind and the track's language; any other child element is
 // traversed with parseElement.
 func (p *ttmlParser) parseMetadataTrack(start xml.StartElement, parent ttmlTimingContext, kind string) error {
 	trackCtx := p.childContext(start.Attr, parent)
 	trackLang := normalizeTTMLLang(trackCtx.lang)
 	// Non-text children of a track with unparsable timing inherit from the
 	// track's parent instead of the track itself.
 	inherited := trackCtx
 	if trackCtx.invalid {
 		inherited = parent
 	}
 	for {
 		tok, err := p.decoder.Token()
 		if err != nil {
 			return err
 		}
 		if closing, isEnd := tok.(xml.EndElement); isEnd {
 			if strings.EqualFold(closing.Name.Local, start.Name.Local) {
 				return nil
 			}
 			continue
 		}
 		child, isStart := tok.(xml.StartElement)
 		if !isStart {
 			continue
 		}
 		if !strings.EqualFold(child.Name.Local, "text") {
 			if err := p.parseElement(child, inherited); err != nil {
 				return err
 			}
 			continue
 		}
 		entry, keep, err := p.parseMetadataText(child, trackCtx)
 		if err != nil {
 			return err
 		}
 		if keep {
 			p.addMetadataEntry(kind, trackLang, entry)
 		}
 	}
 }
 // parseMetadataText consumes a metadata <text> element and converts it into an
 // entry keyed by its "for" attribute. The boolean result is false when the
 // entry must be ignored: no key, unparsable timing, or neither text nor tokens.
 func (p *ttmlParser) parseMetadataText(start xml.StartElement, parent ttmlTimingContext) (ttmlMetadataEntry, bool, error) {
 	var none ttmlMetadataEntry
 	rawKey, _ := attrValue(start.Attr, "for")
 	// Consume the element first so the decoder is positioned after it even
 	// when the entry ends up being rejected.
 	text, spans, err := p.parseInlineElement(start, parent)
 	if err != nil {
 		return none, false, err
 	}
 	key := strings.TrimSpace(rawKey)
 	if key == "" {
 		return none, false, nil
 	}
 	textCtx := p.childContext(start.Attr, parent)
 	if textCtx.invalid {
 		return none, false, nil
 	}
 	entryLine := model.Line{Value: sanitizeTTMLText(text)}
 	if textCtx.hasBegin {
 		begin := textCtx.begin
 		entryLine.Start = &begin
 	}
 	if textCtx.hasEnd {
 		end := textCtx.end
 		entryLine.End = &end
 	}
 	if len(spans) > 0 {
 		entryLine.Token = spans
 	}
 	entryLine = hydrateLineTimingFromTokens(entryLine)
 	if entryLine.Value == "" && len(entryLine.Token) == 0 {
 		return none, false, nil
 	}
 	return ttmlMetadataEntry{key: key, line: entryLine}, true, nil
 }
 // parseParagraph reads the content of a <p> element whose start tag has
 // already been consumed. It returns the sanitized line text and the timed
 // tokens collected from nested inline elements, stopping at the closing </p>.
 func (p *ttmlParser) parseParagraph(parent ttmlTimingContext) (string, []model.Token, error) {
 	var (
 		raw   strings.Builder
 		timed []model.Token
 	)
 	for {
 		tok, err := p.decoder.Token()
 		if err != nil {
 			return "", nil, err
 		}
 		if chars, isText := tok.(xml.CharData); isText {
 			raw.Write(chars)
 			continue
 		}
 		if closing, isEnd := tok.(xml.EndElement); isEnd {
 			if strings.EqualFold(closing.Name.Local, "p") {
 				return sanitizeTTMLText(raw.String()), timed, nil
 			}
 			continue
 		}
 		child, isStart := tok.(xml.StartElement)
 		if !isStart {
 			continue
 		}
 		childText, childTokens, err := p.parseInlineElement(child, parent)
 		if err != nil {
 			return "", nil, err
 		}
 		raw.WriteString(childText)
 		timed = append(timed, childTokens...)
 	}
 }
 // parseInlineElement consumes one inline element (start tag already read) and
 // returns its raw, unsanitized text plus any timed tokens found inside it.
 //
 // A token is emitted for the element itself only when it is a <span> that
 // carries its own begin/end/dur attribute, its timing parsed successfully, its
 // sanitized text is non-empty, and no nested element already produced tokens.
 func (p *ttmlParser) parseInlineElement(start xml.StartElement, parent ttmlTimingContext) (string, []model.Token, error) {
 	local := strings.ToLower(start.Name.Local)
 	if local == "br" {
 		// A line break contributes a newline. Its end tag is not read here; the
 		// callers' loops skip end tags that do not match their own element.
 		return "\n", nil, nil
 	}
 	ctx := p.childContext(start.Attr, parent)
 	_, hasBegin := attrValue(start.Attr, "begin")
 	_, hasEnd := attrValue(start.Attr, "end")
 	_, hasDur := attrValue(start.Attr, "dur")
 	// Timing inherited from the parent does not make a span a token on its own.
 	hasOwnTiming := hasBegin || hasEnd || hasDur
 	var text strings.Builder
 	var tokens []model.Token
 	for {
 		token, err := p.decoder.Token()
 		if err != nil {
 			return "", nil, err
 		}
 		switch t := token.(type) {
 		case xml.StartElement:
 			// Nested elements resolve their timing against this element's context.
 			value, inlineTokens, err := p.parseInlineElement(t, ctx)
 			if err != nil {
 				return "", nil, err
 			}
 			text.WriteString(value)
 			tokens = append(tokens, inlineTokens...)
 		case xml.EndElement:
 			if !strings.EqualFold(t.Name.Local, start.Name.Local) {
 				continue
 			}
 			value := text.String()
 			tokenText := sanitizeTTMLText(value)
 			if local == "span" && hasOwnTiming && !ctx.invalid && tokenText != "" && len(tokens) == 0 {
 				parsedToken := model.Token{
 					Value: tokenText,
 					Role:  ctx.role,
 				}
 				if ctx.hasBegin {
 					startMs := ctx.begin
 					parsedToken.Start = &startMs
 				}
 				if ctx.hasEnd {
 					endMs := ctx.end
 					parsedToken.End = &endMs
 				}
 				tokens = append(tokens, parsedToken)
 			}
 			// The raw text is returned so the caller can sanitize the whole line once.
 			return value, tokens, nil
 		case xml.CharData:
 			text.WriteString(string(t))
 		}
 	}
 }
 // toLyricList assembles the final result: one main track per language in
 // first-seen order, followed by the translation tracks and then the
 // pronunciation tracks. Languages without main lines are omitted.
 func (p *ttmlParser) toLyricList() model.LyricList {
 	capacity := len(p.mainLangOrder) + len(p.translationLangOrder) + len(p.pronunciationLangOrder)
 	out := make(model.LyricList, 0, capacity)
 	for _, lang := range p.mainLangOrder {
 		if mainLines := p.mainLinesByLang[lang]; len(mainLines) > 0 {
 			out = append(out, model.Lyrics{
 				Kind:   ttmlLyricKindMain,
 				Lang:   lang,
 				Line:   mainLines,
 				Synced: linesAreSynced(mainLines),
 			})
 		}
 	}
 	translations := p.buildMetadataLyrics(ttmlLyricKindTranslation, p.translationLangOrder, p.translationEntriesByLg)
 	out = append(out, translations...)
 	pronunciations := p.buildMetadataLyrics(ttmlLyricKindPronunciation, p.pronunciationLangOrder, p.pronunciationEntriesByLg)
 	out = append(out, pronunciations...)
 	return out
 }
 // buildMetadataLyrics turns the collected metadata entries of one kind into
 // lyric tracks, one per language in langOrder. For each language only the
 // first entry per key is used. Entries whose key matches no main line are
 // skipped with a warning. Missing start/end times are copied from the matched
 // main line, and the resulting lines are ordered by main-line position, then
 // by the order in which the entries were read.
 func (p *ttmlParser) buildMetadataLyrics(kind string, langOrder []string, entriesByLang map[string][]ttmlMetadataEntry) model.LyricList {
 	tracks := make(model.LyricList, 0, len(langOrder))
 	for _, lang := range langOrder {
 		candidates := entriesByLang[lang]
 		if len(candidates) == 0 {
 			continue
 		}
 		visited := make(map[string]struct{}, len(candidates))
 		matched := make([]ttmlResolvedMetadataLine, 0, len(candidates))
 		for i := range candidates {
 			entry := candidates[i]
 			if _, dup := visited[entry.key]; dup {
 				continue
 			}
 			visited[entry.key] = struct{}{}
 			target, found := p.mainLineRefsByKey[entry.key]
 			if !found {
 				log.Warn("Skipping TTML metadata line without matching key", "kind", kind, "lang", lang, "key", entry.key)
 				continue
 			}
 			resolvedLine := entry.line
 			if resolvedLine.Start == nil && target.line.Start != nil {
 				inheritedStart := *target.line.Start
 				resolvedLine.Start = &inheritedStart
 			}
 			if resolvedLine.End == nil && target.line.End != nil {
 				inheritedEnd := *target.line.End
 				resolvedLine.End = &inheritedEnd
 			}
 			resolvedLine = hydrateLineTimingFromTokens(resolvedLine)
 			if resolvedLine.Value == "" && len(resolvedLine.Token) == 0 {
 				continue
 			}
 			matched = append(matched, ttmlResolvedMetadataLine{
 				order: target.order,
 				seq:   entry.seq,
 				line:  resolvedLine,
 			})
 		}
 		if len(matched) == 0 {
 			continue
 		}
 		sort.SliceStable(matched, func(a, b int) bool {
 			left, right := matched[a], matched[b]
 			if left.order == right.order {
 				return left.seq < right.seq
 			}
 			return left.order < right.order
 		})
 		trackLines := make([]model.Line, 0, len(matched))
 		for _, m := range matched {
 			trackLines = append(trackLines, m.line)
 		}
 		tracks = append(tracks, model.Lyrics{
 			Kind:   kind,
 			Lang:   lang,
 			Line:   trackLines,
 			Synced: linesAreSynced(trackLines),
 		})
 	}
 	return tracks
 }
 // addMainLine appends a line to the main track of the given language and
 // registers the language on first use. When the line has a non-blank key that
 // has not been seen before, the line and its global position are remembered so
 // metadata entries can be linked to it. Every call advances the position
 // counter, keyed or not.
 func (p *ttmlParser) addMainLine(lang string, lineKey string, line model.Line) {
 	normalized := normalizeTTMLLang(lang)
 	existing, known := p.mainLinesByLang[normalized]
 	if !known {
 		p.mainLangOrder = append(p.mainLangOrder, normalized)
 	}
 	p.mainLinesByLang[normalized] = append(existing, line)
 	position := p.mainLineOrder
 	p.mainLineOrder++
 	key := strings.TrimSpace(lineKey)
 	if key == "" {
 		return
 	}
 	if _, taken := p.mainLineRefsByKey[key]; taken {
 		// The first line registered under a key wins.
 		return
 	}
 	p.mainLineRefsByKey[key] = ttmlLineRef{order: position, line: line}
 }
 // addMetadataEntry stamps the entry with the next sequence number and stores
 // it under its language in the translation or pronunciation collection,
 // registering the language on first use. Entries of any other kind are
 // dropped, but still consume a sequence number.
 func (p *ttmlParser) addMetadataEntry(kind string, lang string, entry ttmlMetadataEntry) {
 	normalized := normalizeTTMLLang(lang)
 	entry.seq = p.metadataSeq
 	p.metadataSeq++
 	var (
 		byLang    map[string][]ttmlMetadataEntry
 		langOrder *[]string
 	)
 	switch kind {
 	case ttmlLyricKindTranslation:
 		byLang, langOrder = p.translationEntriesByLg, &p.translationLangOrder
 	case ttmlLyricKindPronunciation:
 		byLang, langOrder = p.pronunciationEntriesByLg, &p.pronunciationLangOrder
 	default:
 		return
 	}
 	if _, known := byLang[normalized]; !known {
 		*langOrder = append(*langOrder, normalized)
 	}
 	byLang[normalized] = append(byLang[normalized], entry)
 }
 // childContext derives the context of an element from its attributes and the
 // context of its parent.
 //
 // Language and role are inherited and may be overridden or extended. Roles are
 // kept as a space-separated list of distinct names; names are compared whole,
 // so a role that is merely a substring of an inherited one is still added.
 //
 // begin, end and dur are resolved to milliseconds. If any of them is present
 // but cannot be parsed, the returned context has invalid set and its timing
 // fields must not be relied upon. When both end and dur are given, the earlier
 // of the two end times is used. Without an own end, the parent's end is
 // inherited.
 func (p *ttmlParser) childContext(attrs []xml.Attr, parent ttmlTimingContext) ttmlTimingContext {
 	ctx := parent
 	if lang, ok := attrValue(attrs, "lang"); ok {
 		ctx.lang = normalizeTTMLLang(lang)
 	}
 	if role, ok := attrValue(attrs, "role"); ok {
 		roles := strings.Fields(ctx.role)
 		for _, name := range strings.Fields(role) {
 			known := false
 			for _, existing := range roles {
 				if existing == name {
 					known = true
 					break
 				}
 			}
 			if !known {
 				roles = append(roles, name)
 			}
 		}
 		ctx.role = strings.Join(roles, " ")
 	}
 	beginExpr, hasBegin := attrValue(attrs, "begin")
 	endExpr, hasEnd := attrValue(attrs, "end")
 	durExpr, hasDur := attrValue(attrs, "dur")
 	if hasBegin {
 		begin, kind, ok := parseTTMLTimeExpression(beginExpr, p.params)
 		if !ok {
 			ctx.invalid = true
 			return ctx
 		}
 		// Offsets are measured from the parent's begin, or from zero when the
 		// parent has none.
 		base := int64(0)
 		if parent.hasBegin {
 			base = parent.begin
 		}
 		ctx.begin = resolveTTMLTime(begin, kind, base, parent)
 		ctx.hasBegin = true
 	} else {
 		ctx.begin = parent.begin
 		ctx.hasBegin = parent.hasBegin
 	}
 	var calculatedEnd int64
 	calculatedHasEnd := false
 	if hasEnd {
 		end, kind, ok := parseTTMLTimeExpression(endExpr, p.params)
 		if !ok {
 			ctx.invalid = true
 			return ctx
 		}
 		// ctx.begin already equals parent.begin when the element has no begin
 		// of its own, so it serves as the base in both cases.
 		// NOTE(review): an offset-style end is added to this element's own
 		// resolved begin rather than to the parent's begin; confirm this is the
 		// intended reading of TTML end offsets.
 		calculatedEnd = resolveTTMLTime(end, kind, ctx.begin, parent)
 		calculatedHasEnd = true
 	}
 	if hasDur {
 		dur, ok := parseTTMLDurationExpression(durExpr, p.params)
 		if !ok {
 			ctx.invalid = true
 			return ctx
 		}
 		// A duration only yields an end time when a begin is known.
 		if ctx.hasBegin {
 			durEnd := ctx.begin + dur
 			if !calculatedHasEnd || durEnd < calculatedEnd {
 				calculatedEnd = durEnd
 				calculatedHasEnd = true
 			}
 		}
 	}
 	if !calculatedHasEnd && parent.hasEnd {
 		calculatedEnd = parent.end
 		calculatedHasEnd = true
 	}
 	ctx.end = calculatedEnd
 	ctx.hasEnd = calculatedHasEnd
 	return ctx
 }
 // updateTimingParams reads frameRate, frameRateMultiplier, subFrameRate and
 // tickRate from the given attributes and updates the parser's timing
 // parameters. Attributes that are missing, unparsable or not positive leave
 // the current value in place. A frameRateMultiplier of the form "N D" with a
 // positive D scales the frame rate by N/D. Any resulting rate that is not
 // positive is replaced by its default.
 func (p *ttmlParser) updateTimingParams(attrs []xml.Attr) {
 	positiveAttr := func(name string, current float64) float64 {
 		raw, present := attrValue(attrs, name)
 		if !present {
 			return current
 		}
 		parsed, err := strconv.ParseFloat(raw, 64)
 		if err != nil || !(parsed > 0) {
 			return current
 		}
 		return parsed
 	}
 	frameRate := positiveAttr("frameRate", p.params.frameRate)
 	if raw, present := attrValue(attrs, "frameRateMultiplier"); present {
 		if terms := strings.Fields(raw); len(terms) == 2 {
 			numerator, numErr := strconv.ParseFloat(terms[0], 64)
 			denominator, denErr := strconv.ParseFloat(terms[1], 64)
 			if numErr == nil && denErr == nil && denominator > 0 {
 				frameRate *= numerator / denominator
 			}
 		}
 	}
 	subFrameRate := positiveAttr("subFrameRate", p.params.subFrameRate)
 	tickRate := positiveAttr("tickRate", p.params.tickRate)
 	p.params.frameRate = max(frameRate, defaultTTMLFrameRate)
 	p.params.subFrameRate = max(subFrameRate, defaultTTMLSubFrameRate)
 	p.params.tickRate = max(tickRate, defaultTTMLTickRate)
 }
 // parseTTMLDurationExpression parses a dur attribute into milliseconds. It
 // accepts the same syntax as parseTTMLTimeExpression and discards the
 // absolute/offset classification, which has no meaning for a duration.
 func parseTTMLDurationExpression(expr string, params ttmlTimingParams) (int64, bool) {
 	millis, _, valid := parseTTMLTimeExpression(expr, params)
 	return millis, valid
 }
 // resolveTTMLTime converts a parsed time value into milliseconds on the
 // document timeline. Absolute values are returned unchanged and offsets (and
 // any unknown kind) are added to base. Ambiguous values are resolved by
 // heuristic:
 //
 //   - with no parent timing at all and a non-zero base, prefer the absolute reading;
 //   - when the parent has both begin and end and exactly one reading falls
 //     inside that window, use that reading;
 //   - when the parent has a begin, the value is not before it, and the offset
 //     reading would land later than the absolute one, use the absolute reading;
 //   - otherwise use the offset reading.
 func resolveTTMLTime(value int64, kind ttmlTimeKind, base int64, parent ttmlTimingContext) int64 {
 	relative := base + value
 	if kind == ttmlTimeAbsolute {
 		return value
 	}
 	if kind != ttmlTimeAmbiguous {
 		return relative
 	}
 	hasAnyTiming := parent.hasBegin || parent.hasEnd
 	if !hasAnyTiming && base != 0 {
 		return value
 	}
 	if parent.hasBegin && parent.hasEnd {
 		within := func(ms int64) bool { return parent.begin <= ms && ms <= parent.end }
 		fitsAbsolute, fitsRelative := within(value), within(relative)
 		if fitsAbsolute != fitsRelative {
 			if fitsAbsolute {
 				return value
 			}
 			return relative
 		}
 	}
 	if parent.hasBegin && value >= parent.begin && relative > value {
 		return value
 	}
 	return relative
 }
 // parseTTMLTimeExpression parses a TTML time expression into milliseconds and
 // reports how the value should be interpreted. The final result is false when
 // the expression is empty, unsupported or malformed.
 //
 // Supported forms:
 //   - bare non-negative decimal, taken as seconds (ttmlTimeAmbiguous);
 //   - number with unit suffix h, m, s, ms, f or t (ttmlTimeOffset);
 //   - clock time with one or two colons (ttmlTimeAbsolute);
 //   - frame time with three colons (ttmlTimeAbsolute).
 //
 // wallclock(...) values and expressions containing ".begin" or ".end" are
 // rejected with a warning. Values that are not finite, or that do not fit in
 // an int64 number of milliseconds, are rejected as well instead of being
 // converted to an undefined integer.
 func parseTTMLTimeExpression(expr string, params ttmlTimingParams) (int64, ttmlTimeKind, bool) {
 	expr = strings.TrimSpace(expr)
 	if expr == "" {
 		return 0, ttmlTimeOffset, false
 	}
 	lower := strings.ToLower(expr)
 	if strings.Contains(lower, "wallclock(") ||
 		strings.Contains(lower, ".begin") ||
 		strings.Contains(lower, ".end") {
 		log.Warn("Unsupported TTML time expression", "value", expr)
 		return 0, ttmlTimeOffset, false
 	}
 	// toMillis converts seconds to whole milliseconds, refusing NaN, negative
 	// and out-of-range values: converting those to int64 is not well defined.
 	toMillis := func(seconds float64) (int64, bool) {
 		ms := math.Round(seconds * 1000)
 		if math.IsNaN(ms) || ms < 0 || ms >= math.MaxInt64 {
 			return 0, false
 		}
 		return int64(ms), true
 	}
 	// Best-effort support for non-standard TTML seen in the wild where a
 	// bare decimal value is used (implicitly seconds), e.g. "0.170".
 	// ParseFloat also accepts spellings such as "inf", which toMillis rejects.
 	if value, err := strconv.ParseFloat(lower, 64); err == nil && value >= 0 {
 		ms, ok := toMillis(value)
 		if !ok {
 			log.Warn("Unsupported TTML time expression", "value", expr)
 			return 0, ttmlTimeOffset, false
 		}
 		return ms, ttmlTimeAmbiguous, true
 	}
 	if matches := offsetTimeRegex.FindStringSubmatch(lower); len(matches) == 3 {
 		value, err := strconv.ParseFloat(matches[1], 64)
 		if err != nil {
 			return 0, ttmlTimeOffset, false
 		}
 		unit := matches[2]
 		seconds := 0.0
 		switch unit {
 		case "h":
 			seconds = value * 60 * 60
 		case "m":
 			seconds = value * 60
 		case "s":
 			seconds = value
 		case "ms":
 			seconds = value / 1000
 		case "f":
 			seconds = value / params.frameRate
 		case "t":
 			seconds = value / params.tickRate
 		default:
 			return 0, ttmlTimeOffset, false
 		}
 		ms, ok := toMillis(seconds)
 		if !ok {
 			log.Warn("Unsupported TTML time expression", "value", expr)
 			return 0, ttmlTimeOffset, false
 		}
 		return ms, ttmlTimeOffset, true
 	}
 	colonCount := strings.Count(expr, ":")
 	switch colonCount {
 	case 1, 2:
 		clockMs, ok := parseTTMLClockTime(expr)
 		if !ok {
 			return 0, ttmlTimeAbsolute, false
 		}
 		return clockMs, ttmlTimeAbsolute, true
 	case 3:
 		framesMs, ok := parseTTMLFrameTime(expr, params)
 		if !ok {
 			return 0, ttmlTimeAbsolute, false
 		}
 		return framesMs, ttmlTimeAbsolute, true
 	default:
 		log.Warn("Unsupported TTML time expression", "value", expr)
 		return 0, ttmlTimeOffset, false
 	}
 }
 // parseTTMLClockTime parses "MM:SS(.fff)" or "HH:MM:SS(.fff)" into
 // milliseconds. It returns false when the value does not have two or three
 // colon-separated parts, when a part is not numeric, when any part is negative
 // or not finite, or when the total does not fit in an int64.
 func parseTTMLClockTime(value string) (int64, bool) {
 	parts := strings.Split(value, ":")
 	if len(parts) != 2 && len(parts) != 3 {
 		return 0, false
 	}
 	hours := int64(0)
 	minutesIdx := 0
 	if len(parts) == 3 {
 		h, err := strconv.ParseInt(parts[0], 10, 64)
 		if err != nil || h < 0 {
 			return 0, false
 		}
 		hours = h
 		minutesIdx = 1
 	}
 	minutes, err := strconv.ParseInt(parts[minutesIdx], 10, 64)
 	if err != nil || minutes < 0 {
 		return 0, false
 	}
 	// ParseFloat accepts "NaN" and "Inf"; neither is a usable seconds value.
 	seconds, err := strconv.ParseFloat(parts[minutesIdx+1], 64)
 	if err != nil || math.IsNaN(seconds) || seconds < 0 {
 		return 0, false
 	}
 	// Sum in floating point so very large hour values cannot overflow int64.
 	totalSeconds := float64(hours)*60*60 + float64(minutes)*60 + seconds
 	totalMs := math.Round(totalSeconds * 1000)
 	if totalMs >= math.MaxInt64 {
 		return 0, false
 	}
 	return int64(totalMs), true
 }
 // parseTTMLFrameTime parses "HH:MM:SS:FF(.sub)" into milliseconds, converting
 // frames with params.frameRate and sub-frames with params.subFrameRate. It
 // returns false when the value does not have four colon-separated parts, when
 // a part is not numeric or is negative, or when the result is not a finite,
 // non-negative number of milliseconds that fits in an int64 (for example
 // because the frame rate is zero).
 func parseTTMLFrameTime(value string, params ttmlTimingParams) (int64, bool) {
 	parts := strings.Split(value, ":")
 	if len(parts) != 4 {
 		return 0, false
 	}
 	// Hours, minutes and seconds are whole, non-negative numbers.
 	var whole [3]int64
 	for i := range whole {
 		n, err := strconv.ParseInt(parts[i], 10, 64)
 		if err != nil || n < 0 {
 			return 0, false
 		}
 		whole[i] = n
 	}
 	// The last field is "frames" optionally followed by ".subframes".
 	frameParts := strings.SplitN(parts[3], ".", 2)
 	frames, err := strconv.ParseFloat(frameParts[0], 64)
 	if err != nil || frames < 0 {
 		return 0, false
 	}
 	subFrames := 0.0
 	if len(frameParts) == 2 {
 		subFrames, err = strconv.ParseFloat(frameParts[1], 64)
 		if err != nil || subFrames < 0 {
 			return 0, false
 		}
 	}
 	totalSeconds := float64(whole[0])*60*60 + float64(whole[1])*60 + float64(whole[2])
 	totalSeconds += frames / params.frameRate
 	totalSeconds += subFrames / (params.subFrameRate * params.frameRate)
 	// NaN or infinite input and zero rates all surface here as a value that
 	// cannot be converted to int64 safely.
 	totalMs := math.Round(totalSeconds * 1000)
 	if math.IsNaN(totalMs) || totalMs < 0 || totalMs >= math.MaxInt64 {
 		return 0, false
 	}
 	return int64(totalMs), true
 }
 // attrValue looks up an attribute by local name, ignoring case and namespace
 // prefix, and returns the value of the first match with surrounding
 // whitespace removed. The boolean result reports whether a match was found.
 func attrValue(attrs []xml.Attr, key string) (string, bool) {
 	for i := range attrs {
 		if name := attrs[i].Name.Local; !strings.EqualFold(name, key) {
 			continue
 		}
 		return strings.TrimSpace(attrs[i].Value), true
 	}
 	return "", false
 }
 // normalizeTTMLLang trims and lower-cases a language tag. A blank tag is
 // replaced by "xxx".
 func normalizeTTMLLang(lang string) string {
 	if cleaned := strings.TrimSpace(lang); cleaned != "" {
 		return strings.ToLower(cleaned)
 	}
 	return "xxx"
 }
 // sanitizeTTMLText cleans text extracted from a TTML element: it runs the
 // shared text sanitizer, converts CRLF and CR line endings to LF, trims every
 // line, and finally trims the whole result.
 func sanitizeTTMLText(raw string) string {
 	cleaned := str.SanitizeText(raw)
 	// "\r\n" is listed first so a CRLF pair becomes a single newline.
 	cleaned = strings.NewReplacer("\r\n", "\n", "\r", "\n").Replace(cleaned)
 	rows := strings.Split(cleaned, "\n")
 	trimmed := make([]string, 0, len(rows))
 	for _, row := range rows {
 		trimmed = append(trimmed, strings.TrimSpace(row))
 	}
 	return strings.TrimSpace(strings.Join(trimmed, "\n"))
 }
 // linesAreSynced reports whether at least one line, or one token of any line,
 // has a start time.
 func linesAreSynced(lines []model.Line) bool {
 	for _, line := range lines {
 		if line.Start != nil {
 			return true
 		}
 		for _, tok := range line.Token {
 			if tok.Start != nil {
 				return true
 			}
 		}
 	}
 	return false
 }
 // hydrateLineTimingFromTokens fills in a line's missing Start and End from its
 // tokens. Start becomes the earliest token start. End becomes the latest token
 // end, where a token without an end contributes its start instead. Timing
 // already present on the line is left untouched, and a line without tokens is
 // returned unchanged.
 func hydrateLineTimingFromTokens(line model.Line) model.Line {
 	if len(line.Token) == 0 {
 		return line
 	}
 	var (
 		minStart, maxEnd   int64
 		haveStart, haveEnd bool
 	)
 	for _, tok := range line.Token {
 		if tok.Start != nil && (!haveStart || *tok.Start < minStart) {
 			minStart, haveStart = *tok.Start, true
 		}
 		endRef := tok.End
 		if endRef == nil {
 			endRef = tok.Start
 		}
 		if endRef != nil && (!haveEnd || *endRef > maxEnd) {
 			maxEnd, haveEnd = *endRef, true
 		}
 	}
 	if line.Start == nil && haveStart {
 		start := minStart
 		line.Start = &start
 	}
 	if line.End == nil && haveEnd {
 		end := maxEnd
 		line.End = &end
 	}
 	return line
 }
 // max returns v unless v is zero or negative, in which case it returns
 // fallback. Despite its name it is not a numeric maximum: a positive v smaller
 // than fallback is returned as-is. Within this package it shadows the Go
 // builtin of the same name.
 func max(v float64, fallback float64) float64 {
 	switch {
 	case v <= 0:
 		return fallback
 	default:
 		return v
 	}
 }
--- a/core/lyrics/ttml_test.go
+++ b/core/lyrics/ttml_test.go
@ -0,0 +1,398 @@
 package lyrics
 import (
 	"testing"
 	"github.com/navidrome/navidrome/model"
 )
 // TestParseTTML_MultiLanguageAndTiming verifies that every <div> carrying its
 // own xml:lang yields a separate lyric track, and that offset ("2s"),
 // frame-based clock ("00:00:04:15.1") and tick ("45t") time expressions are
 // converted to the expected millisecond values using the ttp:* parameters
 // declared on the root element.
 func TestParseTTML_MultiLanguageAndTiming(t *testing.T) {
 	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttp="http://www.w3.org/ns/ttml#parameter" ttp:frameRate="30" ttp:subFrameRate="2" ttp:tickRate="10">
  <body>
    <div xml:lang="eng" begin="1s">
      <p begin="2s">Line one</p>
      <p begin="00:00:04:15.1"><span>Line two</span><br/>with break</p>
    </div>
    <div xml:lang="por">
      <p begin="45t">Linha</p>
    </div>
  </body>
 </tt>`)
 	list, err := parseTTML(content)
 	if err != nil {
 		t.Fatalf("parseTTML returned error: %v", err)
 	}
 	if len(list) != 2 {
 		t.Fatalf("expected 2 lyric tracks, got %d", len(list))
 	}
 	eng := list[0]
 	if eng.Lang != "eng" {
 		t.Fatalf("expected first track language 'eng', got %q", eng.Lang)
 	}
 	if !eng.Synced {
 		t.Fatal("expected first track to be synced")
 	}
 	// Guard the index accesses below so a parser regression is reported as a
 	// test failure instead of an index-out-of-range panic.
 	if len(eng.Line) != 2 {
 		t.Fatalf("expected 2 lines in first track, got %d", len(eng.Line))
 	}
 	// 3000 ms is consistent with the <p> offset (2s) being applied on top of
 	// the enclosing <div> begin (1s).
 	assertTimedLine(t, eng.Line[0], 3000, "Line one")
 	// 4s + 15 frames at 30 fps + 1 sub-frame at 2 per frame rounds to 4517 ms.
 	assertTimedLine(t, eng.Line[1], 4517, "Line two\nwith break")
 	por := list[1]
 	if por.Lang != "por" {
 		t.Fatalf("expected second track language 'por', got %q", por.Lang)
 	}
 	if len(por.Line) != 1 {
 		t.Fatalf("expected 1 line in second track, got %d", len(por.Line))
 	}
 	// 45 ticks at tickRate 10 is 4.5 seconds.
 	assertTimedLine(t, por.Line[0], 4500, "Linha")
 }
 // TestParseTTML_UnsupportedCueSkipped verifies that a <p> whose begin uses a
 // wallclock(...) expression is dropped from the result while a sibling cue with
 // a supported time expression is still returned.
 func TestParseTTML_UnsupportedCueSkipped(t *testing.T) {
 	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml">
  <body xml:lang="eng">
    <div>
      <p begin="wallclock(2026-01-01T00:00:00Z)">Skip me</p>
      <p begin="1s">Keep me</p>
    </div>
  </body>
 </tt>`)
 	list, err := parseTTML(content)
 	if err != nil {
 		t.Fatalf("parseTTML returned error: %v", err)
 	}
 	if len(list) != 1 {
 		t.Fatalf("expected 1 lyric track, got %d", len(list))
 	}
 	// Only the cue with begin="1s" may survive.
 	if len(list[0].Line) != 1 {
 		t.Fatalf("expected 1 line in lyric track, got %d", len(list[0].Line))
 	}
 	assertTimedLine(t, list[0].Line[0], 1000, "Keep me")
 }
 // TestParseTTML_BeginEndDurWithInheritance verifies that begin offsets declared
 // on <body>, <div> and <p> accumulate, and that the language set on <body> is
 // inherited by the resulting track.
 func TestParseTTML_BeginEndDurWithInheritance(t *testing.T) {
 	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml">
  <body xml:lang="eng" begin="10s">
    <div begin="5s" dur="8s">
      <p begin="1s" dur="2s">First line</p>
      <p begin="3s" end="5s">Second line</p>
    </div>
  </body>
 </tt>`)
 	list, err := parseTTML(content)
 	if err != nil {
 		t.Fatalf("parseTTML returned error: %v", err)
 	}
 	if len(list) != 1 {
 		t.Fatalf("expected 1 lyric track, got %d", len(list))
 	}
 	if list[0].Lang != "eng" {
 		t.Fatalf("expected language 'eng', got %q", list[0].Lang)
 	}
 	if len(list[0].Line) != 2 {
 		t.Fatalf("expected 2 lines, got %d", len(list[0].Line))
 	}
 	// 10s (body) + 5s (div) + 1s (p) = 16000 ms.
 	assertTimedLine(t, list[0].Line[0], 16000, "First line")
 	// 10s (body) + 5s (div) + 3s (p) = 18000 ms.
 	assertTimedLine(t, list[0].Line[1], 18000, "Second line")
 }
 // TestParseTTML_NonStandardBareSecondOffsets verifies that time values written
 // as bare numbers without a unit suffix ("10", "0.170") are accepted and read
 // as seconds.
 func TestParseTTML_NonStandardBareSecondOffsets(t *testing.T) {
 	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml">
  <body xml:lang="eng" begin="10">
    <div>
      <p begin="0.170">First line</p>
      <p begin="3.710">Second line</p>
    </div>
  </body>
 </tt>`)
 	list, err := parseTTML(content)
 	if err != nil {
 		t.Fatalf("parseTTML returned error: %v", err)
 	}
 	if len(list) != 1 {
 		t.Fatalf("expected 1 lyric track, got %d", len(list))
 	}
 	if len(list[0].Line) != 2 {
 		t.Fatalf("expected 2 lines, got %d", len(list[0].Line))
 	}
 	// Expected starts equal the body begin (10 s) plus each <p> value.
 	assertTimedLine(t, list[0].Line[0], 10170, "First line")
 	assertTimedLine(t, list[0].Line[1], 13710, "Second line")
 }
 // TestParseTTML_WordTimingTokens verifies that timed <span> elements inside a
 // <p> are exposed as tokens with their own start/end, and that a span nested in
 // a wrapper carrying ttm:role="x-bg" yields a token with that role.
 func TestParseTTML_WordTimingTokens(t *testing.T) {
 	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttm="http://www.w3.org/ns/ttml#metadata">
  <body xml:lang="eng">
    <div>
      <p begin="00:01.000" end="00:03.000">
        <span begin="00:01.000" end="00:01.400">He</span><span begin="00:01.400" end="00:01.800">llo</span>
        <span ttm:role="x-bg"><span begin="00:02.000" end="00:02.500">echo</span></span>
      </p>
    </div>
  </body>
 </tt>`)
 	list, err := parseTTML(content)
 	if err != nil {
 		t.Fatalf("parseTTML returned error: %v", err)
 	}
 	if len(list) != 1 {
 		t.Fatalf("expected 1 lyric track, got %d", len(list))
 	}
 	if len(list[0].Line) != 1 {
 		t.Fatalf("expected 1 line, got %d", len(list[0].Line))
 	}
 	line := list[0].Line[0]
 	// Adjacent spans are joined without a separator; the span on the next
 	// source line is expected on a new text line.
 	assertTimedLine(t, line, 1000, "Hello\necho")
 	// The explicit end on <p> (3000) is expected, not the last token end (2500).
 	if line.End == nil || *line.End != 3000 {
 		t.Fatalf("expected line end 3000, got %v", line.End)
 	}
 	if len(line.Token) != 3 {
 		t.Fatalf("expected 3 timed tokens, got %d", len(line.Token))
 	}
 	assertToken(t, line.Token[0], 1000, 1400, "He", "")
 	assertToken(t, line.Token[1], 1400, 1800, "llo", "")
 	assertToken(t, line.Token[2], 2000, 2500, "echo", "x-bg")
 }
 // TestParseTTML_AmbiguousDecimalTimingPrefersAbsoluteWhenInsideParentWindow
 // verifies that bare decimal times on nested elements are taken as absolute
 // positions when they already fall inside the parent's begin/end window,
 // instead of being added to the parent's begin.
 func TestParseTTML_AmbiguousDecimalTimingPrefersAbsoluteWhenInsideParentWindow(t *testing.T) {
 	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml">
  <body xml:lang="eng">
    <div begin="37.870" end="45.570">
      <p begin="43.444" end="45.570">
        <span begin="43.444" end="43.716">go</span>
        <span begin="43.716" end="43.887">go</span>
      </p>
    </div>
  </body>
 </tt>`)
 	list, err := parseTTML(content)
 	if err != nil {
 		t.Fatalf("parseTTML returned error: %v", err)
 	}
 	if len(list) != 1 || len(list[0].Line) != 1 {
 		t.Fatalf("expected one parsed lyric line, got %#v", list)
 	}
 	line := list[0].Line[0]
 	// The line start must equal the literal <p> begin, not div begin + p begin.
 	assertTimedLine(t, line, 43444, "go\ngo")
 	if line.End == nil || *line.End != 45570 {
 		t.Fatalf("expected line end 45570, got %v", line.End)
 	}
 	if len(line.Token) != 2 {
 		t.Fatalf("expected 2 timed tokens, got %d", len(line.Token))
 	}
 	assertToken(t, line.Token[0], 43444, 43716, "go", "")
 	assertToken(t, line.Token[1], 43716, 43887, "go", "")
 }
 // TestParseTTML_UnsyncedFallback verifies the result for a document that has
 // neither timing nor language attributes: one unsynced track with the
 // placeholder language "xxx" and a line whose Start is nil.
 func TestParseTTML_UnsyncedFallback(t *testing.T) {
 	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml">
  <body>
    <div>
      <p>No timing here</p>
    </div>
  </body>
 </tt>`)
 	list, err := parseTTML(content)
 	if err != nil {
 		t.Fatalf("parseTTML returned error: %v", err)
 	}
 	if len(list) != 1 {
 		t.Fatalf("expected 1 lyric track, got %d", len(list))
 	}
 	if list[0].Lang != "xxx" {
 		t.Fatalf("expected default language 'xxx', got %q", list[0].Lang)
 	}
 	if list[0].Synced {
 		t.Fatal("expected lyric track to be unsynced")
 	}
 	if len(list[0].Line) != 1 {
 		t.Fatalf("expected 1 line, got %d", len(list[0].Line))
 	}
 	if list[0].Line[0].Start != nil {
 		t.Fatalf("expected line start to be nil, got %v", *list[0].Line[0].Start)
 	}
 	if list[0].Line[0].Value != "No timing here" {
 		t.Fatalf("expected line value %q, got %q", "No timing here", list[0].Line[0].Value)
 	}
 }
 // TestParseTTML_MetadataTracksByKey verifies that <translation> and
 // <transliteration> entries in the iTunes metadata section become separate
 // tracks of kind "translation" and "pronunciation", linked to main lines by
 // matching a <text for="..."> value against the itunes:key of a <p>. Entries
 // whose key matches no main line are expected to be dropped.
 func TestParseTTML_MetadataTracksByKey(t *testing.T) {
 	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml" xmlns:itunes="http://music.apple.com/lyric-ttml-internal">
  <head>
    <metadata>
      <iTunesMetadata xmlns="http://music.apple.com/lyric-ttml-internal">
        <translations>
          <translation xml:lang="es">
            <text for="L1">Hola</text>
            <text for="MISSING">Skip me</text>
          </translation>
        </translations>
        <transliterations>
          <transliteration xml:lang="ja-Latn">
            <text for="L2"><span begin="00:02.000" end="00:02.300" xmlns="http://www.w3.org/ns/ttml">ko</span><span begin="00:02.300" end="00:02.600" xmlns="http://www.w3.org/ns/ttml">nni</span></text>
          </transliteration>
        </transliterations>
      </iTunesMetadata>
    </metadata>
  </head>
  <body xml:lang="ja">
    <div>
      <p begin="00:01.000" end="00:01.500" itunes:key="L1">こんにちは</p>
      <p begin="00:02.000" end="00:02.700" itunes:key="L2">こんばんは</p>
    </div>
  </body>
 </tt>`)
 	list, err := parseTTML(content)
 	if err != nil {
 		t.Fatalf("parseTTML returned error: %v", err)
 	}
 	// Expected order: main track, then translation, then pronunciation.
 	if len(list) != 3 {
 		t.Fatalf("expected 3 lyric tracks, got %d", len(list))
 	}
 	main := list[0]
 	if main.Kind != "main" {
 		t.Fatalf("expected main track kind %q, got %q", "main", main.Kind)
 	}
 	if main.Lang != "ja" {
 		t.Fatalf("expected main track language %q, got %q", "ja", main.Lang)
 	}
 	if len(main.Line) != 2 {
 		t.Fatalf("expected 2 lines in main track, got %d", len(main.Line))
 	}
 	translation := list[1]
 	if translation.Kind != "translation" {
 		t.Fatalf("expected translation kind %q, got %q", "translation", translation.Kind)
 	}
 	if translation.Lang != "es" {
 		t.Fatalf("expected translation language %q, got %q", "es", translation.Lang)
 	}
 	// The entry with for="MISSING" has no matching main line and must be gone.
 	if len(translation.Line) != 1 {
 		t.Fatalf("expected 1 translation line, got %d", len(translation.Line))
 	}
 	// The untimed translation text takes start/end from main line L1.
 	assertTimedLine(t, translation.Line[0], 1000, "Hola")
 	if translation.Line[0].End == nil || *translation.Line[0].End != 1500 {
 		t.Fatalf("expected translation line end %d, got %v", 1500, translation.Line[0].End)
 	}
 	pronunciation := list[2]
 	if pronunciation.Kind != "pronunciation" {
 		t.Fatalf("expected pronunciation kind %q, got %q", "pronunciation", pronunciation.Kind)
 	}
 	// The language tag is expected lower-cased ("ja-Latn" in the document).
 	if pronunciation.Lang != "ja-latn" {
 		t.Fatalf("expected pronunciation language %q, got %q", "ja-latn", pronunciation.Lang)
 	}
 	if len(pronunciation.Line) != 1 {
 		t.Fatalf("expected 1 pronunciation line, got %d", len(pronunciation.Line))
 	}
 	assertTimedLine(t, pronunciation.Line[0], 2000, "konni")
 	// The end is the last token end (2600), not the main line end (2700).
 	if pronunciation.Line[0].End == nil || *pronunciation.Line[0].End != 2600 {
 		t.Fatalf("expected pronunciation line end %d, got %v", 2600, pronunciation.Line[0].End)
 	}
 	if len(pronunciation.Line[0].Token) != 2 {
 		t.Fatalf("expected 2 pronunciation tokens, got %d", len(pronunciation.Line[0].Token))
 	}
 	assertToken(t, pronunciation.Line[0].Token[0], 2000, 2300, "ko", "")
 	assertToken(t, pronunciation.Line[0].Token[1], 2300, 2600, "nni", "")
 }
 // TestParseTTML_PronunciationBareDecimalEndTimes verifies that bare decimal
 // begin/end values on transliteration spans (all below one minute here) come
 // out as absolute millisecond positions for both token start and token end.
 func TestParseTTML_PronunciationBareDecimalEndTimes(t *testing.T) {
 	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml" xmlns:itunes="http://music.apple.com/lyric-ttml-internal">
  <head>
    <metadata>
      <iTunesMetadata xmlns="http://music.apple.com/lyric-ttml-internal">
        <transliterations>
          <transliteration xml:lang="ja-Latn">
            <text for="L1"><span begin="2.747" end="3.018" xmlns="http://www.w3.org/ns/ttml">I</span> <span begin="3.018" end="3.179" xmlns="http://www.w3.org/ns/ttml">woke</span> <span begin="3.179" end="3.582" xmlns="http://www.w3.org/ns/ttml">up</span></text>
          </transliteration>
        </transliterations>
      </iTunesMetadata>
    </metadata>
  </head>
  <body xml:lang="ja">
    <div>
      <p begin="00:02.747" end="00:04.000" itunes:key="L1">起きた</p>
    </div>
  </body>
 </tt>`)
 	list, err := parseTTML(content)
 	if err != nil {
 		t.Fatalf("parseTTML returned error: %v", err)
 	}
 	// Locate the pronunciation track by kind rather than by position.
 	var pronunciation *model.Lyrics
 	for i := range list {
 		if list[i].Kind == "pronunciation" {
 			pronunciation = &list[i]
 			break
 		}
 	}
 	if pronunciation == nil {
 		t.Fatal("expected a pronunciation track")
 	}
 	if len(pronunciation.Line) != 1 {
 		t.Fatalf("expected 1 pronunciation line, got %d", len(pronunciation.Line))
 	}
 	line := pronunciation.Line[0]
 	// The single spaces between spans are expected to be kept in the line text.
 	assertTimedLine(t, line, 2747, "I woke up")
 	if len(line.Token) != 3 {
 		t.Fatalf("expected 3 tokens, got %d", len(line.Token))
 	}
 	assertToken(t, line.Token[0], 2747, 3018, "I", "")
 	assertToken(t, line.Token[1], 3018, 3179, "woke", "")
 	assertToken(t, line.Token[2], 3179, 3582, "up", "")
 }
 // assertTimedLine fails the test immediately unless line has a non-nil Start
 // equal to expectedStart and a Value equal to expectedValue. Checks run in that
 // order and only the first mismatch is reported.
 func assertTimedLine(t *testing.T, line model.Line, expectedStart int64, expectedValue string) {
 	t.Helper()
 	switch {
 	case line.Start == nil:
 		t.Fatal("expected line start to be set, got nil")
 	case *line.Start != expectedStart:
 		t.Fatalf("expected line start %d, got %d", expectedStart, *line.Start)
 	case line.Value != expectedValue:
 		t.Fatalf("expected line value %q, got %q", expectedValue, line.Value)
 	}
 }
 // assertToken fails the test immediately unless token has non-nil Start and End
 // matching expectedStart and expectedEnd, and Value and Role matching
 // expectedValue and expectedRole. Checks run in that order and only the first
 // mismatch is reported.
 func assertToken(t *testing.T, token model.Token, expectedStart int64, expectedEnd int64, expectedValue string, expectedRole string) {
 	t.Helper()
 	switch {
 	case token.Start == nil:
 		t.Fatal("expected token start to be set, got nil")
 	case *token.Start != expectedStart:
 		t.Fatalf("expected token start %d, got %d", expectedStart, *token.Start)
 	case token.End == nil:
 		t.Fatal("expected token end to be set, got nil")
 	case *token.End != expectedEnd:
 		t.Fatalf("expected token end %d, got %d", expectedEnd, *token.End)
 	case token.Value != expectedValue:
 		t.Fatalf("expected token value %q, got %q", expectedValue, token.Value)
 	case token.Role != expectedRole:
 		t.Fatalf("expected token role %q, got %q", expectedRole, token.Role)
 	}
 }
--- a/model/lyrics.go
+++ b/model/lyrics.go
@ -11,14 +11,24 @@ import (
 	"github.com/navidrome/navidrome/utils/str"
 )
-type Line struct {
+type Token struct {
 	// Token is one timed fragment of a lyric Line, such as a word or syllable.
 	// Start and End are optional timestamps in milliseconds; nil values are
 	// omitted when serialized.
 	Start *int64 `structs:"start,omitempty" json:"start,omitempty"`
 	End   *int64 `structs:"end,omitempty"   json:"end,omitempty"`
 	// Value is the text of the fragment and is always serialized.
 	Value string `structs:"value"           json:"value"`
 	// Role is an optional label for the fragment (for example "x-bg"); it is
 	// omitted when empty.
 	Role  string `structs:"role,omitempty"  json:"role,omitempty"`
 }
 // Line is a single lyric line. Start and End are optional timestamps in
 // milliseconds and are omitted from serialized output when nil. Token
 // optionally carries finer-grained timing for fragments of the line and is
 // omitted when empty.
 type Line struct {
 	Start *int64  `structs:"start,omitempty" json:"start,omitempty"`
 	End   *int64  `structs:"end,omitempty"   json:"end,omitempty"`
 	Value string  `structs:"value"           json:"value"`
 	Token []Token `structs:"token,omitempty" json:"token,omitempty"`
 }
 type Lyrics struct {
 	DisplayArtist string `structs:"displayArtist,omitempty" json:"displayArtist,omitempty"`
 	DisplayTitle  string `structs:"displayTitle,omitempty"  json:"displayTitle,omitempty"`
 	Kind          string `structs:"kind,omitempty"          json:"kind,omitempty"`
 	Lang          string `structs:"lang"                    json:"lang"`
 	Line          []Line `structs:"line"                    json:"line"`
 	Offset        *int64 `structs:"offset,omitempty"        json:"offset,omitempty"`
--- a/server/subsonic/helpers.go
+++ b/server/subsonic/helpers.go
@ -478,19 +478,47 @@ func mapExplicitStatus(explicitStatus string) string {
 func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics) responses.StructuredLyric {
 	lines := make([]responses.Line, len(lyrics.Line))
 	tokenLines := make([]responses.TokenLine, 0, len(lyrics.Line))
 	for i, line := range lyrics.Line {
 		lines[i] = responses.Line{
 			Start: line.Start,
 			Value: line.Value,
 		}
 		if len(line.Token) == 0 {
 			continue
 		}
 		tokens := make([]responses.LyricToken, len(line.Token))
 		for j, token := range line.Token {
 			tokens[j] = responses.LyricToken{
 				Start: token.Start,
 				End:   token.End,
 				Value: token.Value,
 				Role:  token.Role,
 			}
 		}
 		tokenLines = append(tokenLines, responses.TokenLine{
 			Index: int32(i),
 			Start: line.Start,
 			End:   line.End,
 			Value: line.Value,
 			Token: tokens,
 		})
 	}
 	kind := strings.TrimSpace(lyrics.Kind)
 	if kind == "" {
 		kind = "main"
 	}
 	structured := responses.StructuredLyric{
 		DisplayArtist: lyrics.DisplayArtist,
 		DisplayTitle:  lyrics.DisplayTitle,
 		Kind:          kind,
 		Lang:          lyrics.Lang,
 		Line:          lines,
 		TokenLine:     tokenLines,
 		Offset:        lyrics.Offset,
 		Synced:        lyrics.Synced,
 	}
--- a/server/subsonic/media_retrieval.go
+++ b/server/subsonic/media_retrieval.go
@ -98,7 +98,9 @@ func (api *Router) GetLyrics(r *http.Request) (*responses.Subsonic, error) {
 	response := newResponse()
 	lyricsResponse := responses.Lyrics{}
 	response.Lyrics = &lyricsResponse
-	mediaFiles, err := api.ds.MediaFile(r.Context()).GetAll(filter.SongsByArtistTitleWithLyricsFirst(artist, title))
+	opts := filter.SongsByArtistTitleWithLyricsFirst(artist, title)
 	opts.Max = 0
 	mediaFiles, err := api.ds.MediaFile(r.Context()).GetAll(opts)
 	if err != nil {
 		return nil, err
@ -108,25 +110,26 @@ func (api *Router) GetLyrics(r *http.Request) (*responses.Subsonic, error) {
 		return response, nil
 	}
-	structuredLyrics, err := api.lyrics.GetLyrics(r.Context(), &mediaFiles[0])
+	for i := range mediaFiles {
-	if err != nil {
+		structuredLyrics, err := api.lyrics.GetLyrics(r.Context(), &mediaFiles[i])
-		return nil, err
+		if err != nil {
 			return nil, err
 		}
 		if len(structuredLyrics) == 0 {
 			continue
 		}
 		lyricsResponse.Artist = artist
 		lyricsResponse.Title = title
 		var lyricsText strings.Builder
 		for _, line := range structuredLyrics[0].Line {
 			lyricsText.WriteString(line.Value + "\n")
 		}
 		lyricsResponse.Value = lyricsText.String()
 		break
 	}
 	if len(structuredLyrics) == 0 {
 		return response, nil
 	}
 	lyricsResponse.Artist = artist
 	lyricsResponse.Title = title
 	var lyricsText strings.Builder
 	for _, line := range structuredLyrics[0].Line {
 		lyricsText.WriteString(line.Value + "\n")
 	}
 	lyricsResponse.Value = lyricsText.String()
 	return response, nil
 }
--- a/server/subsonic/media_retrieval_test.go
+++ b/server/subsonic/media_retrieval_test.go
@ -186,6 +186,36 @@ var _ = Describe("MediaRetrievalController", func() {
 			Expect(response.Lyrics.Title).To(Equal("Never Gonna Give You Up"))
 			Expect(response.Lyrics.Value).To(Equal("We're no strangers to love\nYou know the rules and so do I\n"))
 		})
 		It("should continue searching candidates for sidecar lyrics", func() {
 			conf.Server.LyricsPriority = ".ttml,embedded"
 			r := newGetRequest("artist=Rick+Astley", "title=Never+Gonna+Give+You+Up")
 			baseTime := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC)
 			mockRepo.SetData(model.MediaFiles{
 				{
 					ID:        "1",
 					Path:      "tests/fixtures/01 Invisible (RED) Edit Version.mp3",
 					Artist:    "Rick Astley",
 					Title:     "Never Gonna Give You Up",
 					Lyrics:    "[]",
 					UpdatedAt: baseTime.Add(2 * time.Hour), // Newer, but no TTML sidecar
 				},
 				{
 					ID:        "2",
 					Path:      "tests/fixtures/test.mp3",
 					Artist:    "Rick Astley",
 					Title:     "Never Gonna Give You Up",
 					Lyrics:    "[]",
 					UpdatedAt: baseTime.Add(1 * time.Hour), // Older, but has TTML sidecar
 				},
 			})
 			response, err := router.GetLyrics(r)
 			Expect(err).ToNot(HaveOccurred())
 			Expect(response.Lyrics.Artist).To(Equal("Rick Astley"))
 			Expect(response.Lyrics.Title).To(Equal("Never Gonna Give You Up"))
 			Expect(response.Lyrics.Value).To(Equal("We're no strangers to love\nYou know the rules and so do I\n"))
 		})
 	})
 	Describe("GetLyricsBySongId", func() {
@ -202,6 +232,11 @@ var _ = Describe("MediaRetrievalController", func() {
 				Expect(realLyric.DisplayArtist).To(Equal(expectedLyric.DisplayArtist))
 				Expect(realLyric.DisplayTitle).To(Equal(expectedLyric.DisplayTitle))
 				expectedKind := expectedLyric.Kind
 				if expectedKind == "" {
 					expectedKind = "main"
 				}
 				Expect(realLyric.Kind).To(Equal(expectedKind))
 				Expect(realLyric.Lang).To(Equal(expectedLyric.Lang))
 				Expect(realLyric.Synced).To(Equal(expectedLyric.Synced))
@ -222,6 +257,40 @@ var _ = Describe("MediaRetrievalController", func() {
 						Expect(*realLine.Start).To(Equal(*expectedLine.Start))
 					}
 				}
 				Expect(realLyric.TokenLine).To(HaveLen(len(expectedLyric.TokenLine)))
 				for j, realTokenLine := range realLyric.TokenLine {
 					expectedTokenLine := expectedLyric.TokenLine[j]
 					Expect(realTokenLine.Index).To(Equal(expectedTokenLine.Index))
 					Expect(realTokenLine.Value).To(Equal(expectedTokenLine.Value))
 					if expectedTokenLine.Start == nil {
 						Expect(realTokenLine.Start).To(BeNil())
 					} else {
 						Expect(*realTokenLine.Start).To(Equal(*expectedTokenLine.Start))
 					}
 					if expectedTokenLine.End == nil {
 						Expect(realTokenLine.End).To(BeNil())
 					} else {
 						Expect(*realTokenLine.End).To(Equal(*expectedTokenLine.End))
 					}
 					Expect(realTokenLine.Token).To(HaveLen(len(expectedTokenLine.Token)))
 					for k, realToken := range realTokenLine.Token {
 						expectedToken := expectedTokenLine.Token[k]
 						Expect(realToken.Value).To(Equal(expectedToken.Value))
 						Expect(realToken.Role).To(Equal(expectedToken.Role))
 						if expectedToken.Start == nil {
 							Expect(realToken.Start).To(BeNil())
 						} else {
 							Expect(*realToken.Start).To(Equal(*expectedToken.Start))
 						}
 						if expectedToken.End == nil {
 							Expect(realToken.End).To(BeNil())
 						} else {
 							Expect(*realToken.End).To(Equal(*expectedToken.End))
 						}
 					}
 				}
 			}
 		}
@ -323,6 +392,238 @@ var _ = Describe("MediaRetrievalController", func() {
 				},
 			})
 		})
 		It("should return multilingual TTML sidecar lyrics", func() {
 			conf.Server.LyricsPriority = ".ttml,embedded"
 			r := newGetRequest("id=1")
 			mockRepo.SetData(model.MediaFiles{
 				{
 					ID:     "1",
 					Path:   "tests/fixtures/test.mp3",
 					Artist: "Rick Astley",
 					Title:  "Never Gonna Give You Up",
 					Lyrics: "[]",
 				},
 			})
 			response, err := router.GetLyricsBySongId(r)
 			Expect(err).ToNot(HaveOccurred())
 			porTime := int64(18800)
 			ttmlTime := int64(22800)
 			compareResponses(response.LyricsList, responses.LyricsList{
 				StructuredLyrics: responses.StructuredLyrics{
 					{
 						DisplayArtist: "Rick Astley",
 						DisplayTitle:  "Never Gonna Give You Up",
 						Lang:          "eng",
 						Synced:        true,
 						Line: []responses.Line{
 							{
 								Start: &times[0],
 								Value: "We're no strangers to love",
 							},
 							{
 								Start: &ttmlTime,
 								Value: "You know the rules and so do I",
 							},
 						},
 					},
 					{
 						DisplayArtist: "Rick Astley",
 						DisplayTitle:  "Never Gonna Give You Up",
 						Lang:          "por",
 						Synced:        true,
 						Line: []responses.Line{
 							{
 								Start: &porTime,
 								Value: "Nao somos estranhos ao amor",
 							},
 						},
 					},
 				},
 			})
 		})
 		It("should return metadata-linked translation and pronunciation tracks from TTML", func() {
 			conf.Server.LyricsPriority = ".ttml,embedded"
 			r := newGetRequest("id=1")
 			mockRepo.SetData(model.MediaFiles{
 				{
 					ID:     "1",
 					Path:   "tests/fixtures/test-metadata.mp3",
 					Artist: "Rick Astley",
 					Title:  "Never Gonna Give You Up",
 					Lyrics: "[]",
 				},
 			})
 			response, err := router.GetLyricsBySongId(r)
 			Expect(err).ToNot(HaveOccurred())
 			mainStartA := int64(1000)
 			mainStartB := int64(2000)
 			tokenStartA := int64(2000)
 			tokenEndA := int64(2300)
 			tokenStartB := int64(2300)
 			tokenEndB := int64(2600)
 			compareResponses(response.LyricsList, responses.LyricsList{
 				StructuredLyrics: responses.StructuredLyrics{
 					{
 						DisplayArtist: "Rick Astley",
 						DisplayTitle:  "Never Gonna Give You Up",
 						Kind:          "main",
 						Lang:          "ja",
 						Synced:        true,
 						Line: []responses.Line{
 							{
 								Start: &mainStartA,
 								Value: "こんにちは",
 							},
 							{
 								Start: &mainStartB,
 								Value: "こんばんは",
 							},
 						},
 					},
 					{
 						DisplayArtist: "Rick Astley",
 						DisplayTitle:  "Never Gonna Give You Up",
 						Kind:          "translation",
 						Lang:          "es",
 						Synced:        true,
 						Line: []responses.Line{
 							{
 								Start: &mainStartA,
 								Value: "Hola",
 							},
 						},
 					},
 					{
 						DisplayArtist: "Rick Astley",
 						DisplayTitle:  "Never Gonna Give You Up",
 						Kind:          "pronunciation",
 						Lang:          "ja-latn",
 						Synced:        true,
 						Line: []responses.Line{
 							{
 								Start: &mainStartB,
 								Value: "konni",
 							},
 						},
 						TokenLine: []responses.TokenLine{
 							{
 								Index: 0,
 								Start: &mainStartB,
 								End:   &tokenEndB,
 								Value: "konni",
 								Token: []responses.LyricToken{
 									{
 										Start: &tokenStartA,
 										End:   &tokenEndA,
 										Value: "ko",
 									},
 									{
 										Start: &tokenStartB,
 										End:   &tokenEndB,
 										Value: "nni",
 									},
 								},
 							},
 						},
 					},
 				},
 			})
 		})
 		It("should return tokenized lines for songLyrics v2 clients", func() {
 			r := newGetRequest("id=1")
 			lineStart := int64(1000)
 			lineEnd := int64(3000)
 			tokenStartA := int64(1000)
 			tokenEndA := int64(1400)
 			tokenStartB := int64(2000)
 			tokenEndB := int64(2500)
 			lyricsJson, err := json.Marshal(model.LyricList{
 				{
 					Lang:   "eng",
 					Synced: true,
 					Line: []model.Line{
 						{
 							Start: &lineStart,
 							End:   &lineEnd,
 							Value: "Hello echo",
 							Token: []model.Token{
 								{
 									Start: &tokenStartA,
 									End:   &tokenEndA,
 									Value: "Hello",
 								},
 								{
 									Start: &tokenStartB,
 									End:   &tokenEndB,
 									Value: "echo",
 									Role:  "x-bg",
 								},
 							},
 						},
 					},
 				},
 			})
 			Expect(err).ToNot(HaveOccurred())
 			mockRepo.SetData(model.MediaFiles{
 				{
 					ID:     "1",
 					Artist: "Rick Astley",
 					Title:  "Never Gonna Give You Up",
 					Lyrics: string(lyricsJson),
 				},
 			})
 			response, err := router.GetLyricsBySongId(r)
 			Expect(err).ToNot(HaveOccurred())
 			compareResponses(response.LyricsList, responses.LyricsList{
 				StructuredLyrics: responses.StructuredLyrics{
 					{
 						DisplayArtist: "Rick Astley",
 						DisplayTitle:  "Never Gonna Give You Up",
 						Lang:          "eng",
 						Synced:        true,
 						Line: []responses.Line{
 							{
 								Start: &lineStart,
 								Value: "Hello echo",
 							},
 						},
 						TokenLine: []responses.TokenLine{
 							{
 								Index: 0,
 								Start: &lineStart,
 								End:   &lineEnd,
 								Value: "Hello echo",
 								Token: []responses.LyricToken{
 									{
 										Start: &tokenStartA,
 										End:   &tokenEndA,
 										Value: "Hello",
 									},
 									{
 										Start: &tokenStartB,
 										End:   &tokenEndB,
 										Value: "echo",
 										Role:  "x-bg",
 									},
 								},
 							},
 						},
 					},
 				},
 			})
 		})
 	})
 })
--- a/server/subsonic/opensubsonic.go
+++ b/server/subsonic/opensubsonic.go
@ -11,7 +11,7 @@ func (api *Router) GetOpenSubsonicExtensions(_ *http.Request) (*responses.Subson
 	response.OpenSubsonicExtensions = &responses.OpenSubsonicExtensions{
 		{Name: "transcodeOffset", Versions: []int32{1}},
 		{Name: "formPost", Versions: []int32{1}},
-		{Name: "songLyrics", Versions: []int32{1}},
+		{Name: "songLyrics", Versions: []int32{1, 2}},
 		{Name: "indexBasedQueue", Versions: []int32{1}},
 		{Name: "transcoding", Versions: []int32{1}},
 	}
--- a/server/subsonic/opensubsonic_test.go
+++ b/server/subsonic/opensubsonic_test.go
@ -38,7 +38,7 @@ var _ = Describe("GetOpenSubsonicExtensions", func() {
 			HaveLen(5),
 			ContainElement(responses.OpenSubsonicExtension{Name: "transcodeOffset", Versions: []int32{1}}),
 			ContainElement(responses.OpenSubsonicExtension{Name: "formPost", Versions: []int32{1}}),
-			ContainElement(responses.OpenSubsonicExtension{Name: "songLyrics", Versions: []int32{1}}),
+			ContainElement(responses.OpenSubsonicExtension{Name: "songLyrics", Versions: []int32{1, 2}}),
 			ContainElement(responses.OpenSubsonicExtension{Name: "indexBasedQueue", Versions: []int32{1}}),
 			ContainElement(responses.OpenSubsonicExtension{Name: "transcoding", Versions: []int32{1}}),
 		))
--- a/server/subsonic/responses/responses.go
+++ b/server/subsonic/responses/responses.go
@ -537,13 +537,30 @@ type Line struct {
 	Value string `xml:",chardata"            json:"value"`
 }
 // LyricToken is the API representation of one timed fragment of a lyric line.
 // In XML every field is written as an attribute; Start, End and Role are
 // omitted when unset, Value is always present.
 type LyricToken struct {
 	Start *int64 `xml:"start,attr,omitempty" json:"start,omitempty"`
 	End   *int64 `xml:"end,attr,omitempty"   json:"end,omitempty"`
 	Value string `xml:"value,attr"           json:"value"`
 	Role  string `xml:"role,attr,omitempty"  json:"role,omitempty"`
 }
 // TokenLine groups the tokens that belong to one lyric line in the API
 // response.
 type TokenLine struct {
 	// Index is the position of the corresponding entry in the lyric's line list.
 	Index int32        `xml:"index,attr"                    json:"index"`
 	Start *int64       `xml:"start,attr,omitempty"         json:"start,omitempty"`
 	End   *int64       `xml:"end,attr,omitempty"           json:"end,omitempty"`
 	// Value repeats the full text of the line; omitted when empty.
 	Value string       `xml:"value,attr,omitempty"         json:"value,omitempty"`
 	// Token holds the per-fragment timing entries, serialized as child elements.
 	Token []LyricToken `xml:"token,omitempty"        json:"token,omitempty"`
 }
 type StructuredLyric struct {
-	DisplayArtist string `xml:"displayArtist,attr,omitempty" json:"displayArtist,omitempty"`
+	DisplayArtist string      `xml:"displayArtist,attr,omitempty" json:"displayArtist,omitempty"`
-	DisplayTitle  string `xml:"displayTitle,attr,omitempty"  json:"displayTitle,omitempty"`
+	DisplayTitle  string      `xml:"displayTitle,attr,omitempty"  json:"displayTitle,omitempty"`
-	Lang          string `xml:"lang,attr"                    json:"lang"`
+	Kind          string      `xml:"kind,attr,omitempty"          json:"kind,omitempty"`
-	Line          []Line `xml:"line"                         json:"line"`
+	Lang          string      `xml:"lang,attr"                    json:"lang"`
-	Offset        *int64 `xml:"offset,attr,omitempty"        json:"offset,omitempty"`
+	Line          []Line      `xml:"line"                         json:"line"`
-	Synced        bool   `xml:"synced,attr"                  json:"synced"`
+	TokenLine     []TokenLine `xml:"tokenLine,omitempty"     json:"tokenLine,omitempty"`
 	Offset        *int64      `xml:"offset,attr,omitempty"        json:"offset,omitempty"`
 	Synced        bool        `xml:"synced,attr"                  json:"synced"`
 }
 type StructuredLyrics []StructuredLyric
--- a/tests/fixtures/bom-test.ttml
+++ b/tests/fixtures/bom-test.ttml
@ -0,0 +1,2 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml"><body><div xml:lang="eng"><p begin="00:00:00.00">BOM test line</p></div></body></tt>
--- a/tests/fixtures/bom-utf16-test.ttml
+++ b/tests/fixtures/bom-utf16-test.ttml
--- a/tests/fixtures/test-metadata.ttml
+++ b/tests/fixtures/test-metadata.ttml
@ -0,0 +1,25 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml" xmlns:itunes="http://music.apple.com/lyric-ttml-internal">
  <head>
    <metadata>
      <iTunesMetadata xmlns="http://music.apple.com/lyric-ttml-internal">
        <translations>
          <translation xml:lang="es">
            <text for="L1">Hola</text>
          </translation>
        </translations>
        <transliterations>
          <transliteration xml:lang="ja-Latn">
            <text for="L2"><span begin="00:02.000" end="00:02.300" xmlns="http://www.w3.org/ns/ttml">ko</span><span begin="00:02.300" end="00:02.600" xmlns="http://www.w3.org/ns/ttml">nni</span></text>
          </transliteration>
        </transliterations>
      </iTunesMetadata>
    </metadata>
  </head>
  <body xml:lang="ja">
    <div>
      <p begin="00:01.000" end="00:01.500" itunes:key="L1">こんにちは</p>
      <p begin="00:02.000" end="00:02.700" itunes:key="L2">こんばんは</p>
    </div>
  </body>
 </tt>
--- a/tests/fixtures/test.ttml
+++ b/tests/fixtures/test.ttml
@ -0,0 +1,12 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttp="http://www.w3.org/ns/ttml#parameter" ttp:frameRate="30" ttp:subFrameRate="2" ttp:tickRate="10">
  <body>
    <div xml:lang="eng">
      <p begin="00:00:18.80">We're no strangers to love</p>
      <p begin="00:00:22:24">You know the rules and so do I</p>
    </div>
    <div xml:lang="por">
      <p begin="188t">Nao somos estranhos ao amor</p>
    </div>
  </body>
 </tt>
--- a/ui/src/actions/player.js
+++ b/ui/src/actions/player.js
@ -9,6 +9,7 @@ export const PLAYER_SET_VOLUME = 'PLAYER_SET_VOLUME'
 export const PLAYER_SET_MODE = 'PLAYER_SET_MODE'
 export const TRANSCODING_SET_PROFILE = 'TRANSCODING_SET_PROFILE'
 export const PLAYER_REFRESH_QUEUE = 'PLAYER_REFRESH_QUEUE'
 export const PLAYER_UPDATE_LYRIC = 'PLAYER_UPDATE_LYRIC'
 export const setTrack = (data) => ({
  type: PLAYER_SET_TRACK,
@ -114,3 +115,8 @@ export const refreshQueue = (resolvedUrls) => ({
  type: PLAYER_REFRESH_QUEUE,
  data: resolvedUrls,
 })
 // Action creator for PLAYER_UPDATE_LYRIC: carries the track id and the lyric
 // that should be associated with it.
 export const updateQueueLyric = (trackId, lyric) => {
  const data = { trackId, lyric }
  return { type: PLAYER_UPDATE_LYRIC, data }
 }
--- a/ui/src/audioplayer/KaraokeLyricsOverlay.jsx
+++ b/ui/src/audioplayer/KaraokeLyricsOverlay.jsx
--- a/ui/src/audioplayer/Player.jsx
+++ b/ui/src/audioplayer/Player.jsx
@ -22,6 +22,7 @@ import {
  refreshQueue,
  setPlayMode,
  setTranscodingProfile,
  updateQueueLyric,
  setVolume,
  syncQueue,
 } from '../actions'
@ -33,6 +34,25 @@ import { keyMap } from '../hotkeys'
 import keyHandlers from './keyHandlers'
 import { calculateGain } from '../utils/calculateReplayGain'
 import { detectBrowserProfile, decisionService } from '../transcode'
 import {
  getPreferredLyricLanguage,
  hasStructuredLyricContent,
  selectLyricLayers,
  structuredLyricToLrc,
 } from './lyrics'
 import KaraokeLyricsOverlay from './KaraokeLyricsOverlay'
 // Layer set used when no lyric is available: every layer slot is null.
 const emptyLyricLayers = {
  main: null,
  translation: null,
  pronunciation: null,
 }
 // Coerces a partial or missing layer set into the full
 // { main, translation, pronunciation } shape; absent or falsy layers become null.
 const normalizeLyricLayers = (layers) => {
  const { main, translation, pronunciation } = layers || {}
  return {
    main: main || null,
    translation: translation || null,
    pronunciation: pronunciation || null,
  }
 }
 const Player = () => {
  const theme = useCurrentTheme()
@ -120,6 +140,72 @@ const Player = () => {
  const gainInfo = useSelector((state) => state.replayGain)
  const [context, setContext] = useState(null)
  const [gainNode, setGainNode] = useState(null)
  const lyricCacheRef = useRef(new Map())
  const lyricRequestIdRef = useRef(0)
  const playerRef = useRef(null)
  const [karaokeVisible, setKaraokeVisible] = useState(false)
  const [selectedLyricLayers, setSelectedLyricLayers] =
    useState(emptyLyricLayers)
  const [showTranslation, setShowTranslation] = useState(false)
  const [showPronunciation, setShowPronunciation] = useState(false)
  const currentTrackId = playerState.current?.trackId
  const currentTrackIsRadio = playerState.current?.isRadio
  const selectedStructuredLyric = selectedLyricLayers.main
  const hasKaraokeLyric = hasStructuredLyricContent(selectedStructuredLyric)
  const hasTranslationLyric = hasStructuredLyricContent(
    selectedLyricLayers.translation,
  )
  const hasPronunciationLyric = hasStructuredLyricContent(
    selectedLyricLayers.pronunciation,
  )
  const applyLyricToRuntimePlayer = useCallback((trackId, lyric) => {
    if (!trackId) {
      return
    }
    const player = playerRef.current
    if (!player || typeof player.setState !== 'function') {
      return
    }
    player.setState((prevState) => {
      const prevLists = Array.isArray(prevState.audioLists)
        ? prevState.audioLists
        : []
      let changed = false
      const audioLists = prevLists.map((item) => {
        if (item.trackId !== trackId) {
          return item
        }
        if (item.lyric === lyric) {
          return item
        }
        changed = true
        return {
          ...item,
          lyric,
        }
      })
      const currentItem = audioLists.find(
        (item) => item.musicSrc === prevState.musicSrc,
      )
      const currentLyric =
        typeof currentItem?.lyric === 'string'
          ? currentItem.lyric
          : prevState.lyric
      if (!changed && currentLyric === prevState.lyric) {
        return null
      }
      return {
        audioLists,
        lyric: currentLyric,
      }
    })
  }, [])
  useEffect(() => {
    if (
@ -166,6 +252,107 @@ const Player = () => {
    return () => window.removeEventListener('beforeunload', handleBeforeUnload)
  }, [playerState, audioInstance])
  useEffect(() => {
    if (!currentTrackId || currentTrackIsRadio) {
      setSelectedLyricLayers(emptyLyricLayers)
      setShowTranslation(false)
      setShowPronunciation(false)
      setKaraokeVisible(false)
      return
    }
    const cached = lyricCacheRef.current.get(currentTrackId)
    let layers = emptyLyricLayers
    if (cached && typeof cached !== 'string') {
      if (cached.layers) {
        layers = normalizeLyricLayers(cached.layers)
      } else if (cached.structuredLyric) {
        layers = normalizeLyricLayers({
          main: cached.structuredLyric,
        })
      }
    }
    setSelectedLyricLayers(layers)
    setShowTranslation(false)
    setShowPronunciation(hasStructuredLyricContent(layers.pronunciation))
  }, [currentTrackId, currentTrackIsRadio])
  useEffect(() => {
    lyricRequestIdRef.current += 1
    const requestId = lyricRequestIdRef.current
    if (!currentTrackId || currentTrackIsRadio) {
      return
    }
    const cached = lyricCacheRef.current.get(currentTrackId)
    if (cached !== undefined) {
      const cachedLyric =
        typeof cached === 'string' ? cached : cached?.lrc || ''
      const cachedLayers =
        typeof cached === 'string'
          ? emptyLyricLayers
          : cached?.layers
            ? normalizeLyricLayers(cached.layers)
            : normalizeLyricLayers({ main: cached?.structuredLyric })
      setSelectedLyricLayers(cachedLayers)
      setShowTranslation(false)
      setShowPronunciation(
        hasStructuredLyricContent(cachedLayers.pronunciation),
      )
      if (cachedLyric) {
        dispatch(updateQueueLyric(currentTrackId, cachedLyric))
        applyLyricToRuntimePlayer(currentTrackId, cachedLyric)
      }
      return
    }
    subsonic
      .getLyricsBySongId(currentTrackId)
      .then((resp) => {
        if (lyricRequestIdRef.current !== requestId) {
          return
        }
        const structuredLyrics =
          resp?.json?.['subsonic-response']?.lyricsList?.structuredLyrics || []
        const layers = selectLyricLayers(
          structuredLyrics,
          getPreferredLyricLanguage(),
        )
        const lyric = layers.main ? structuredLyricToLrc(layers.main) : ''
        lyricCacheRef.current.set(currentTrackId, {
          lrc: lyric,
          layers,
        })
        setSelectedLyricLayers(layers)
        setShowTranslation(false)
        setShowPronunciation(hasStructuredLyricContent(layers.pronunciation))
        if (lyric !== '') {
          dispatch(updateQueueLyric(currentTrackId, lyric))
          applyLyricToRuntimePlayer(currentTrackId, lyric)
        }
      })
      .catch(() => {
        if (lyricRequestIdRef.current !== requestId) {
          return
        }
        setSelectedLyricLayers(emptyLyricLayers)
        setShowTranslation(false)
        setShowPronunciation(false)
        // Do not cache network/request failures as empty lyrics, so we can retry.
        lyricCacheRef.current.delete(currentTrackId)
      })
  }, [dispatch, currentTrackId, currentTrackIsRadio, applyLyricToRuntimePlayer])
  useEffect(() => {
    if (!hasKaraokeLyric && karaokeVisible) {
      setKaraokeVisible(false)
    }
  }, [hasKaraokeLyric, karaokeVisible])
  const defaultOptions = useMemo(
    () => ({
      theme: playerTheme,
@ -177,7 +364,7 @@ const Player = () => {
      clearPriorAudioLists: false,
      showDestroy: true,
      showDownload: false,
-      showLyric: true,
+      showLyric: false,
      showReload: false,
      toggleMode: !isDesktop,
      glassBg: false,
@ -214,12 +401,24 @@ const Player = () => {
        (playerState.clear || playerState.playIndex === 0),
      clearPriorAudioLists: playerState.clear,
      extendsContent: (
-        <PlayerToolbar id={current.trackId} isRadio={current.isRadio} />
+        <PlayerToolbar
          id={current.trackId}
          isRadio={current.isRadio}
          onToggleLyrics={() => setKaraokeVisible((visible) => !visible)}
          lyricsActive={karaokeVisible}
          lyricsDisabled={!hasKaraokeLyric}
        />
      ),
      defaultVolume: isMobilePlayer ? 1 : playerState.volume,
      showMediaSession: !current.isRadio,
    }
-  }, [playerState, defaultOptions, isMobilePlayer])
+  }, [
    playerState,
    defaultOptions,
    isMobilePlayer,
    karaokeVisible,
    hasKaraokeLyric,
  ])
  const onAudioListsChange = useCallback(
    (_, audioLists, audioInfo) => dispatch(syncQueue(audioInfo, audioLists)),
@ -391,6 +590,7 @@ const Player = () => {
  return (
    <ThemeProvider theme={createMuiTheme(theme)}>
      <ReactJkMusicPlayer
        ref={playerRef}
        {...options}
        className={classes.player}
        onAudioListsChange={onAudioListsChange}
@ -406,6 +606,28 @@ const Player = () => {
        onBeforeDestroy={onBeforeDestroy}
        getAudioInstance={setAudioInstance}
      />
      <KaraokeLyricsOverlay
        visible={karaokeVisible}
        mainLyric={selectedLyricLayers.main}
        translationLyric={selectedLyricLayers.translation}
        pronunciationLyric={selectedLyricLayers.pronunciation}
        showTranslation={showTranslation}
        showPronunciation={showPronunciation}
        translationEnabled={hasTranslationLyric}
        pronunciationEnabled={hasPronunciationLyric}
        onToggleTranslation={() =>
          setShowTranslation((previous) =>
            hasTranslationLyric ? !previous : false,
          )
        }
        onTogglePronunciation={() =>
          setShowPronunciation((previous) =>
            hasPronunciationLyric ? !previous : false,
          )
        }
        audioInstance={audioInstance}
        onClose={() => setKaraokeVisible(false)}
      />
      <GlobalHotKeys handlers={handlers} keyMap={keyMap} allowChanges />
    </ThemeProvider>
  )
--- a/ui/src/audioplayer/PlayerToolbar.jsx
+++ b/ui/src/audioplayer/PlayerToolbar.jsx
@ -4,7 +4,9 @@ import { useGetOne } from 'react-admin'
 import { GlobalHotKeys } from 'react-hotkeys'
 import IconButton from '@material-ui/core/IconButton'
 import { useMediaQuery } from '@material-ui/core'
 import Tooltip from '@material-ui/core/Tooltip'
 import { RiSaveLine } from 'react-icons/ri'
 import { RiFileMusicLine } from 'react-icons/ri'
 import { LoveButton, useToggleLove } from '../common'
 import { openSaveQueueDialog } from '../actions'
 import { keyMap } from '../hotkeys'
@ -55,7 +57,13 @@ const useStyles = makeStyles((theme) => ({
  },
 }))
-const PlayerToolbar = ({ id, isRadio }) => {
+const PlayerToolbar = ({
  id,
  isRadio,
  onToggleLyrics,
  lyricsActive = false,
  lyricsDisabled = false,
 }) => {
  const dispatch = useDispatch()
  const { data, loading } = useGetOne('song', id, { enabled: !!id && !isRadio })
  const [toggleLove, toggling] = useToggleLove('song', data)
@ -99,6 +107,25 @@ const PlayerToolbar = ({ id, isRadio }) => {
    />
  )
  const toggleLyricsButton = (
    <Tooltip title="Toggle synchronized lyrics">
      <span>
        <IconButton
          size={isDesktop ? 'small' : undefined}
          onClick={onToggleLyrics}
          disabled={!onToggleLyrics || lyricsDisabled}
          data-testid="toggle-lyrics-button"
          className={buttonClass}
          color={lyricsActive ? 'primary' : 'default'}
        >
          <RiFileMusicLine
            className={!isDesktop ? classes.mobileIcon : undefined}
          />
        </IconButton>
      </span>
    </Tooltip>
  )
  return (
    <>
      <GlobalHotKeys keyMap={keyMap} handlers={handlers} allowChanges />
@ -106,11 +133,13 @@ const PlayerToolbar = ({ id, isRadio }) => {
        <li className={`${listItemClass} item`}>
          {saveQueueButton}
          {loveButton}
          {toggleLyricsButton}
        </li>
      ) : (
        <>
          <li className={`${listItemClass} item`}>{saveQueueButton}</li>
          <li className={`${listItemClass} item`}>{loveButton}</li>
          <li className={`${listItemClass} item`}>{toggleLyricsButton}</li>
        </>
      )}
    </>
--- a/ui/src/audioplayer/PlayerToolbar.test.jsx
+++ b/ui/src/audioplayer/PlayerToolbar.test.jsx
@ -71,6 +71,7 @@ describe('<PlayerToolbar />', () => {
      // Verify both buttons are rendered
      expect(screen.getByTestId('save-queue-button')).toBeInTheDocument()
      expect(screen.getByTestId('love-button')).toBeInTheDocument()
      expect(screen.getByTestId('toggle-lyrics-button')).toBeInTheDocument()
      // Verify desktop classes are applied
      expect(listItems[0].className).toContain('toolbar')
@ -102,6 +103,14 @@ describe('<PlayerToolbar />', () => {
        type: 'OPEN_SAVE_QUEUE_DIALOG',
      })
    })
    it('triggers lyric toggle callback when lyrics button is clicked', () => {
      const onToggleLyrics = vi.fn()
      render(<PlayerToolbar id="song-1" onToggleLyrics={onToggleLyrics} />)
      fireEvent.click(screen.getByTestId('toggle-lyrics-button'))
      expect(onToggleLyrics).toHaveBeenCalledTimes(1)
    })
  })
  describe('Mobile layout', () => {
@ -114,11 +123,12 @@ describe('<PlayerToolbar />', () => {
      // Each button should be in its own list item
      const listItems = screen.getAllByRole('listitem')
-      expect(listItems).toHaveLength(2)
+      expect(listItems).toHaveLength(3)
      // Verify both buttons are rendered
      expect(screen.getByTestId('save-queue-button')).toBeInTheDocument()
      expect(screen.getByTestId('love-button')).toBeInTheDocument()
      expect(screen.getByTestId('toggle-lyrics-button')).toBeInTheDocument()
      // Verify mobile classes are applied
      expect(listItems[0].className).toContain('mobileListItem')
@ -140,6 +150,13 @@ describe('<PlayerToolbar />', () => {
      const loveButton = screen.getByTestId('love-button')
      expect(loveButton).toBeDisabled()
    })
    it('disables lyrics button when lyrics are unavailable', () => {
      render(<PlayerToolbar id="song-1" lyricsDisabled={true} />)
      const lyricsButton = screen.getByTestId('toggle-lyrics-button')
      expect(lyricsButton).toBeDisabled()
    })
  })
  describe('Common behavior', () => {
--- a/ui/src/audioplayer/lyrics.js
+++ b/ui/src/audioplayer/lyrics.js
@ -0,0 +1,617 @@
 // Canonicalizes a language tag for comparison: lower-cases it and turns every
 // underscore separator into a hyphen (e.g. "zh_Hant_TW" -> "zh-hant-tw").
 // A global regex is required here: String.prototype.replace with a string
 // pattern only replaces the first occurrence.
 // Null, undefined and empty input yield ''.
 const normalizeLanguageTag = (language) =>
  (language || '').toLowerCase().replace(/_/g, '-')
 // Tolerance, in milliseconds, added when comparing the playback position with
 // line and token boundaries.
 const KARAOKE_SWITCH_EPSILON_MS = 18
 // Recognised values of a structured lyric's `kind` field.
 const LYRIC_KIND_MAIN = 'main'
 const LYRIC_KIND_TRANSLATION = 'translation'
 const LYRIC_KIND_PRONUNCIATION = 'pronunciation'
 // Renders a value as text, prefixing a single-character result with '0'.
 // Longer results are returned unchanged.
 const padTime = (value) => {
  const text = value.toString()
  if (text.length !== 1) {
    return text
  }
  return `0${text}`
 }
 // Converts a raw time value to a finite number, or null when the value is
 // missing or not numeric.
 // Null, undefined and blank strings are rejected explicitly: Number(null) and
 // Number('') both evaluate to 0, which would turn a missing time into
 // timestamp 0 and stop `toTime(x) ?? fallback` chains from reaching the fallback.
 const toTime = (value) => {
  if (value == null) {
    return null
  }
  if (typeof value === 'string' && value.trim() === '') {
    return null
  }
  const numeric = Number(value)
  return Number.isFinite(numeric) ? numeric : null
 }
 // Sort comparator for times that may be null/undefined: missing values order
 // after present ones, two missing values compare equal, and two present
 // values compare numerically.
 const compareNullableTime = (a, b) => {
  const aMissing = a == null
  const bMissing = b == null
  if (aMissing && bMissing) {
    return 0
  }
  if (aMissing || bMissing) {
    return aMissing ? 1 : -1
  }
  return a - b
 }
 // Returns a new array of token copies ordered by start, then by end, then by
 // original position. The temporary `order` field used for the final
 // tie-break is stripped from the returned tokens.
 const sortTokensByStart = (tokens) => {
  const byStartEndThenPosition = (left, right) => {
    const startOrder = compareNullableTime(left.start, right.start)
    if (startOrder !== 0) {
      return startOrder
    }
    const endOrder = compareNullableTime(left.end, right.end)
    return endOrder !== 0 ? endOrder : left.order - right.order
  }
  const decorated = tokens.map((token, order) => ({ ...token, order }))
  decorated.sort(byStartEndThenPosition)
  return decorated.map(({ order, ...token }) => token)
 }
 // True when two non-empty language tags are equal or one is the other
 // extended by a '-' subtag (e.g. 'pt' and 'pt-br').
 const languageMatch = (candidate, preferred) => {
  if (!(candidate && preferred)) {
    return false
  }
  if (candidate === preferred) {
    return true
  }
  const candidateExtendsPreferred = candidate.startsWith(`${preferred}-`)
  return candidateExtendsPreferred || preferred.startsWith(`${candidate}-`)
 }
 // True when the lyric is flagged as synced and at least one of its lines
 // carries a usable numeric start time.
 // Always returns a strict boolean. Null line entries are tolerated, and a
 // null/empty start is treated as "no timing" rather than being coerced to 0
 // by Number().
 const hasTimedLines = (lyric) =>
  Boolean(
    lyric &&
      lyric.synced &&
      Array.isArray(lyric.line) &&
      lyric.line.some(
        (line) =>
          line?.start != null &&
          line.start !== '' &&
          Number.isFinite(Number(line.start)),
      ),
  )
 // Normalizes one raw token into { start, end, value, role }.
 // Returns null for a missing token or one whose value is not a string or is
 // whitespace-only. start/end go through toTime; a non-string role becomes ''.
 const normalizeToken = (token) => {
  const text = token && typeof token.value === 'string' ? token.value : ''
  if (text.trim() === '') {
    return null
  }
  const role = typeof token.role === 'string' ? token.role : ''
  return {
    start: toTime(token.start),
    end: toTime(token.end),
    value: text,
    role,
  }
 }
 // Normalizes one raw token line into { index, start, end, value, tokens }.
 // `index` falls back to `fallbackIndex` when the declared index is not a
 // finite number; tokens are normalized, invalid ones are dropped and the
 // rest are sorted with sortTokensByStart.
 const normalizeTokenLine = (tokenLine, fallbackIndex) => {
  const declaredIndex = Number(tokenLine?.index)
  const rawTokens = Array.isArray(tokenLine?.token) ? tokenLine.token : []
  const usableTokens = rawTokens.map(normalizeToken).filter(Boolean)
  return {
    index: Number.isFinite(declaredIndex) ? declaredIndex : fallbackIndex,
    start: toTime(tokenLine?.start),
    end: toTime(tokenLine?.end),
    value: typeof tokenLine?.value === 'string' ? tokenLine.value : '',
    tokens: sortTokensByStart(usableTokens),
  }
 }
 // Maps a raw `kind` value to one of the LYRIC_KIND_* constants.
 // Matching is case-insensitive and ignores surrounding whitespace; anything
 // that is not translation or pronunciation is treated as main.
 const normalizeLyricKind = (kind) => {
  const requested = (kind || '').toLowerCase().trim()
  if (
    requested === LYRIC_KIND_TRANSLATION ||
    requested === LYRIC_KIND_PRONUNCIATION
  ) {
    return requested
  }
  return LYRIC_KIND_MAIN
 }
 // Picks one lyric from `lyrics` by language. Candidates are tried in order:
 // the full preferred tag, its primary subtag, then 'en'. When none match,
 // the first lyric is returned. Returns null for a missing or empty list.
 const pickLyricByLanguage = (lyrics, preferredLanguage) => {
  if (!Array.isArray(lyrics) || lyrics.length === 0) {
    return null
  }
  const preferred = normalizeLanguageTag(preferredLanguage)
  const wantedTags = [preferred, preferred.split('-')[0], 'en']
  for (const wanted of wantedTags) {
    const hit = lyrics.find((lyric) =>
      languageMatch(normalizeLanguageTag(lyric.lang), wanted),
    )
    if (hit) {
      return hit
    }
  }
  return lyrics[0]
 }
 // Returns the { start, end } window of lines[index]. When the line has no
 // end of its own, the start of the following line is used instead.
 // A missing line yields { start: null, end: null }.
 const lineTimeWindow = (lines, index) => {
  const current = lines[index]
  if (!current) {
    return { start: null, end: null }
  }
  const ownEnd = toTime(current.end)
  return {
    start: toTime(current.start),
    end: ownEnd ?? toTime(lines[index + 1]?.start),
  }
 }
 // Splits a line covered by a single token into evenly timed per-word tokens.
 //
 // Returns null (no synthesis) when:
 //   - the line text is blank or has fewer than two whitespace-separated chunks,
 //   - the token text, ignoring case and whitespace, neither equals the line
 //     text nor reaches 80% of its length,
 //   - no positive-length time span can be taken from the token (preferred)
 //     or the line.
 // Otherwise each chunk (word plus trailing whitespace) gets an equal slice of
 // the span and inherits the token's role.
 const buildSyntheticWordTokens = (line, token) => {
  const lineText = typeof line?.value === 'string' ? line.value : ''
  if (lineText.trim() === '') {
    return null
  }
  const words = lineText.match(/\S+\s*/g) || []
  if (words.length < 2) {
    return null
  }

  const collapse = (raw) => raw.replace(/\s+/g, ' ').trim().toLowerCase()
  const stripSpaces = (raw) => raw.replace(/\s+/g, '')
  const lineKey = collapse(lineText)
  const tokenKey = collapse(token?.value || '')
  if (!tokenKey || !lineKey) {
    return null
  }

  const lineLetters = stripSpaces(lineKey)
  const tokenLetters = stripSpaces(tokenKey)
  const coversLine =
    tokenLetters === lineLetters ||
    tokenLetters.length >= Math.floor(lineLetters.length * 0.8)
  if (!coversLine) {
    return null
  }

  // Token timing wins over line timing for both ends of the span.
  const from = toTime(token?.start) ?? toTime(line?.start)
  const to = toTime(token?.end) ?? toTime(line?.end)
  const usableSpan =
    from != null &&
    to != null &&
    Number.isFinite(from) &&
    Number.isFinite(to) &&
    to > from
  if (!usableSpan) {
    return null
  }

  const span = to - from
  const role = typeof token?.role === 'string' ? token.role : ''
  return words.map((value, position) => ({
    start: from + (span * position) / words.length,
    end: from + (span * (position + 1)) / words.length,
    value,
    role,
  }))
 }
 // True when the lyric has at least one token line containing a token whose
 // start converts to a finite number.
 export const hasTokenTiming = (structuredLyric) => {
  const tokenLines = structuredLyric?.tokenLine
  if (!Array.isArray(tokenLines)) {
    return false
  }
  const isTimed = (token) => Number.isFinite(Number(token?.start))
  return tokenLines.some(
    (entry) => Array.isArray(entry?.token) && entry.token.some(isTimed),
  )
 }
 // True when the lyric has something to show: at least one line with
 // non-blank text, or token timing as reported by hasTokenTiming.
 export const hasStructuredLyricContent = (structuredLyric) => {
  if (!structuredLyric) {
    return false
  }
  const lines = structuredLyric.line
  const hasText =
    Array.isArray(lines) &&
    lines.some(
      (line) => typeof line?.value === 'string' && line.value.trim() !== '',
    )
  return hasText || hasTokenTiming(structuredLyric)
 }
 // Resolves the language used to pick lyric tracks, in order of preference:
 //   1. the 'locale' entry in localStorage,
 //   2. navigator.language,
 //   3. 'en'.
 // Storage access is guarded because reading window.localStorage (or calling
 // getItem) can throw when the browser blocks storage; in that case the
 // browser language is used instead of letting the error escape.
 export const getPreferredLyricLanguage = () => {
  try {
    if (typeof window !== 'undefined' && window.localStorage) {
      const stored = window.localStorage.getItem('locale')
      if (stored) {
        return stored
      }
    }
  } catch (error) {
    // Storage unavailable: fall through to the browser language.
  }
  if (typeof navigator !== 'undefined' && navigator.language) {
    return navigator.language
  }
  return 'en'
 }
 // Chooses the main, translation and pronunciation lyrics to display.
 // Only lyrics accepted by hasTimedLines are considered; they are grouped by
 // normalized kind and one per group is picked by language. When no lyric is
 // of kind main, every timed lyric is a candidate for the main layer.
 // Returns all-null layers for non-array input or when nothing is timed.
 export const selectLyricLayers = (structuredLyrics, preferredLanguage) => {
  const noLayers = () => ({
    main: null,
    translation: null,
    pronunciation: null,
  })
  if (!Array.isArray(structuredLyrics)) {
    return noLayers()
  }
  const timed = structuredLyrics.filter(hasTimedLines)
  if (timed.length === 0) {
    return noLayers()
  }
  const byKind = {
    [LYRIC_KIND_MAIN]: [],
    [LYRIC_KIND_TRANSLATION]: [],
    [LYRIC_KIND_PRONUNCIATION]: [],
  }
  timed.forEach((lyric) => {
    byKind[normalizeLyricKind(lyric?.kind)].push(lyric)
  })
  const mains = byKind[LYRIC_KIND_MAIN]
  const pick = (candidates) => pickLyricByLanguage(candidates, preferredLanguage)
  return {
    main: pick(mains.length ? mains : timed),
    translation: pick(byKind[LYRIC_KIND_TRANSLATION]),
    pronunciation: pick(byKind[LYRIC_KIND_PRONUNCIATION]),
  }
 }
 // Convenience wrapper: returns only the main layer chosen by selectLyricLayers.
 export const pickStructuredLyric = (structuredLyrics, preferredLanguage) => {
  const { main } = selectLyricLayers(structuredLyrics, preferredLanguage)
  return main
 }
 // Serializes a structured lyric into LRC text, one "[mm:ss.xx] text" row per
 // timed line (start is in milliseconds, output precision is centiseconds).
 //
 // Lines whose start is not a finite, non-negative number are skipped, as are
 // null/undefined line entries. Returns '' when there is no line array.
 // The minutes field is NOT wrapped at 60: LRC timestamps have no hours
 // field, so a line at 1h01m01s is written as [61:01.xx], not [01:01.xx].
 export const structuredLyricToLrc = (structuredLyric) => {
  if (!structuredLyric || !Array.isArray(structuredLyric.line)) {
    return ''
  }
  let lyricText = ''
  for (const line of structuredLyric.line) {
    // Number(undefined) is NaN, so a missing entry is skipped by the guard below.
    const start = Number(line?.start)
    if (!Number.isFinite(start) || start < 0) {
      continue
    }
    const totalCentiseconds = Math.floor(start / 10)
    const centiseconds = totalCentiseconds % 100
    const totalSeconds = Math.floor(totalCentiseconds / 100)
    const sec = totalSeconds % 60
    const min = Math.floor(totalSeconds / 60)
    lyricText += `[${padTime(min)}:${padTime(sec)}.${padTime(centiseconds)}] ${line.value || ''}\n`
  }
  return lyricText
 }
 // Picks the main lyric for the preferred language and serializes it to LRC.
 // Returns '' when no lyric is selected.
 export const structuredLyricsToLrc = (structuredLyrics, preferredLanguage) => {
  const chosen = pickStructuredLyric(structuredLyrics, preferredLanguage)
  return chosen ? structuredLyricToLrc(chosen) : ''
 }
 // Builds the list of display lines for a structured lyric.
 //
 // Each returned entry has the shape { index, start, end, value, tokens }.
 // When the lyric has token lines, every token line is normalized and merged
 // with the plain line at the same index; otherwise the plain lines are used
 // with an empty token list. Entries with neither text nor tokens are dropped,
 // the rest are sorted by start time, and missing end times are filled from
 // the following entry's start.
 // Returns [] for a missing lyric.
 export const buildKaraokeLines = (structuredLyric) => {
  if (!structuredLyric) {
    return []
  }
  const baseLines = Array.isArray(structuredLyric.line)
    ? structuredLyric.line
    : []
  const rawTokenLines = Array.isArray(structuredLyric.tokenLine)
    ? structuredLyric.tokenLine
    : []
  const lines =
    rawTokenLines.length > 0
      ? rawTokenLines.map((tokenLine, fallbackIndex) => {
          const normalized = normalizeTokenLine(tokenLine, fallbackIndex)
          const baseLine = baseLines[normalized.index] || {}
          const tokens = normalized.tokens
          // Last-resort timing: first token with a start / last token with an end.
          const fallbackStart =
            tokens.find((token) => token.start != null)?.start ?? null
          const fallbackEnd =
            [...tokens].reverse().find((token) => token.end != null)?.end ??
            null
          // Text preference: token line value, then plain line value, then
          // the concatenated token values.
          const value =
            normalized.value ||
            (typeof baseLine.value === 'string' ? baseLine.value : '') ||
            tokens.map((token) => token.value).join('')
          return {
            index: normalized.index,
            start: normalized.start ?? toTime(baseLine.start) ?? fallbackStart,
            end: normalized.end ?? toTime(baseLine.end) ?? fallbackEnd,
            value,
            tokens,
          }
        })
      : baseLines.map((line, index) => ({
          index,
          start: toTime(line.start),
          end: toTime(line.end),
          value: typeof line.value === 'string' ? line.value : '',
          tokens: [],
        }))
  const normalized = lines
    .filter((line) => line.value || line.tokens.length > 0)
    // Order by start time; entries without a start go last, ties keep index order.
    .sort((a, b) => {
      if (a.start == null && b.start == null) {
        return a.index - b.index
      }
      if (a.start == null) {
        return 1
      }
      if (b.start == null) {
        return -1
      }
      if (a.start !== b.start) {
        return a.start - b.start
      }
      return a.index - b.index
    })
    .map((line) => {
      const nextLine = { ...line }
      // A line carrying exactly one token may be split into per-word tokens;
      // buildSyntheticWordTokens returns null when that does not apply.
      if (nextLine.tokens.length === 1) {
        const syntheticTokens = buildSyntheticWordTokens(
          nextLine,
          nextLine.tokens[0],
        )
        if (syntheticTokens) {
          nextLine.tokens = syntheticTokens
        }
      }
      return nextLine
    })
  // Fill a missing end with the start of the following line, when known.
  for (let i = 0; i < normalized.length; i += 1) {
    if (normalized[i].end == null) {
      const nextStart = normalized[i + 1]?.start
      if (nextStart != null) {
        normalized[i].end = nextStart
      }
    }
  }
  return normalized
 }
 // Resolves the { start, end } time window of one token within a line.
 //
 // Parameters:
 //   line            - entry with `tokens`, `start` and `end`.
 //   tokenIndex      - position of the token inside line.tokens.
 //   lineEndFallback - end time to use when the line has none of its own.
 //
 // Returns { start: null, end: null } when there is no token at tokenIndex.
 // Explicit token times are used when present; otherwise values are derived
 // from neighbouring tokens or from an even split of the line window. When
 // the explicit times look collapsed (few distinct values across the tokens),
 // the even split is used for every token.
 export const resolveKaraokeTokenWindow = (
  line,
  tokenIndex,
  lineEndFallback = null,
 ) => {
  const tokens = Array.isArray(line?.tokens) ? line.tokens : []
  const token = tokens[tokenIndex]
  if (!token) {
    return { start: null, end: null }
  }
  const prevToken = tokenIndex > 0 ? tokens[tokenIndex - 1] : null
  const nextToken =
    tokenIndex + 1 < tokens.length ? tokens[tokenIndex + 1] : null
  const lineStart = toTime(line?.start)
  const lineEnd = toTime(line?.end) ?? toTime(lineEndFallback)
  const tokenCount = tokens.length
  // A usable line window needs both bounds and a positive length.
  const hasLineWindow =
    lineStart != null &&
    lineEnd != null &&
    Number.isFinite(lineStart) &&
    Number.isFinite(lineEnd) &&
    lineEnd > lineStart
  // Even split of the line window: token i spans slice [i, i + 1) of tokenCount.
  const estimatedStart =
    hasLineWindow && tokenCount > 0
      ? lineStart + ((lineEnd - lineStart) * tokenIndex) / tokenCount
      : null
  const estimatedEnd =
    hasLineWindow && tokenCount > 0
      ? lineStart + ((lineEnd - lineStart) * (tokenIndex + 1)) / tokenCount
      : null
  // Count explicit start/end values and how many distinct values they take.
  let explicitStartCount = 0
  let explicitEndCount = 0
  const uniqueStarts = new Set()
  const uniqueEnds = new Set()
  for (let i = 0; i < tokenCount; i += 1) {
    const explicitStart = toTime(tokens[i]?.start)
    if (explicitStart != null) {
      explicitStartCount += 1
      uniqueStarts.add(explicitStart)
    }
    const explicitEnd = toTime(tokens[i]?.end)
    if (explicitEnd != null) {
      explicitEndCount += 1
      uniqueEnds.add(explicitEnd)
    }
  }
  // "Collapsed" timing: more than one explicit value, but the number of
  // distinct values is at most max(1, tokenCount / 4).
  const collapsedStarts =
    explicitStartCount > 1 && uniqueStarts.size <= Math.max(1, tokenCount / 4)
  const collapsedEnds =
    explicitEndCount > 1 && uniqueEnds.size <= Math.max(1, tokenCount / 4)
  const shouldForceEstimated =
    hasLineWindow && tokenCount > 1 && (collapsedStarts || collapsedEnds)
  if (shouldForceEstimated) {
    return {
      start: estimatedStart,
      end: estimatedEnd,
    }
  }
  // Start fallback order: explicit start, previous token's end (or start),
  // even-split estimate, line start.
  const prevEnd = toTime(prevToken?.end) ?? toTime(prevToken?.start)
  let start = toTime(token.start)
  if (start == null) {
    start = prevEnd ?? estimatedStart ?? lineStart
  }
  // End fallback order: explicit end, next token's start, even-split
  // estimate, line end.
  let end = toTime(token.end)
  if (end == null) {
    const nextDirectStart = toTime(nextToken?.start)
    const nextEstimatedStart =
      hasLineWindow && tokenIndex + 1 < tokenCount
        ? lineStart + ((lineEnd - lineStart) * (tokenIndex + 1)) / tokenCount
        : null
    end = nextDirectStart ?? nextEstimatedStart ?? estimatedEnd ?? lineEnd
  }
  // A lone token with missing or near-zero-length timing takes the whole line window.
  if (
    tokenCount === 1 &&
    hasLineWindow &&
    (start == null || end == null || end <= start + 1)
  ) {
    start = lineStart
    end = lineEnd
  }
  // Never report an end earlier than the start.
  if (start != null && end != null && end < start) {
    end = start
  }
  return { start, end }
 }
 // Finds the line and token that are active at `currentTimeMs`.
 //
 // Returns { lineIndex, tokenIndex }. Both are -1 when `lines` is not a
 // non-empty array; tokenIndex is -1 when the active line has no token that
 // has started yet. A non-finite currentTimeMs is treated as 0.
 // All boundary comparisons allow KARAOKE_SWITCH_EPSILON_MS of slack.
 export const getActiveKaraokeState = (lines, currentTimeMs) => {
  if (!Array.isArray(lines) || lines.length === 0) {
    return { lineIndex: -1, tokenIndex: -1 }
  }
  const current = Number.isFinite(Number(currentTimeMs))
    ? Number(currentTimeMs)
    : 0
  // Forward pass: advance to the last line that has started (lines without a
  // start also advance); stop at the first line that starts later.
  let lineIndex = 0
  for (let i = 0; i < lines.length; i += 1) {
    const lineStart = toTime(lines[i]?.start)
    if (lineStart == null || lineStart <= current + KARAOKE_SWITCH_EPSILON_MS) {
      lineIndex = i
      continue
    }
    break
  }
  // Backward pass: from that candidate, take the nearest line whose window
  // still contains the current time. If none does, the forward-pass result stays.
  for (let i = lineIndex; i >= 0; i -= 1) {
    const lineStart = toTime(lines[i]?.start)
    const lineEnd = toTime(lines[i]?.end) ?? toTime(lines[i + 1]?.start)
    if (lineStart != null && current + KARAOKE_SWITCH_EPSILON_MS < lineStart) {
      continue
    }
    if (lineEnd == null || current <= lineEnd + KARAOKE_SWITCH_EPSILON_MS) {
      lineIndex = i
      break
    }
  }
  const activeLine = lines[lineIndex] || null
  const tokens = Array.isArray(activeLine?.tokens) ? activeLine.tokens : []
  // Token scan: keep the last token that has started, stopping as soon as the
  // current time falls inside a token's window or a later-starting token is reached.
  let tokenIndex = -1
  for (let i = 0; i < tokens.length; i += 1) {
    const { start: tokenStart, end: tokenEnd } = resolveKaraokeTokenWindow(
      activeLine,
      i,
      lines[lineIndex + 1]?.start,
    )
    if (
      tokenStart == null ||
      tokenStart <= current + KARAOKE_SWITCH_EPSILON_MS
    ) {
      tokenIndex = i
      if (tokenEnd != null && current <= tokenEnd + KARAOKE_SWITCH_EPSILON_MS) {
        break
      }
      continue
    }
    break
  }
  return { lineIndex, tokenIndex }
 }
 // Finds the line in `layerLines` (e.g. a translation or pronunciation track)
 // that best corresponds in time to mainLines[mainIndex].
 //
 // Returns the index into layerLines, or -1 when the inputs are not
 // non-empty arrays, mainIndex is out of range, the main line has no start,
 // or no layer line qualifies. The lowest score wins; the score is the
 // distance between start times plus a penalty per position of index
 // difference.
 export const findLayerLineIndexForMain = (mainLines, layerLines, mainIndex) => {
  if (
    !Array.isArray(mainLines) ||
    !Array.isArray(layerLines) ||
    mainLines.length === 0 ||
    layerLines.length === 0 ||
    mainIndex < 0 ||
    mainIndex >= mainLines.length
  ) {
    return -1
  }
  const { start: mainStart, end: mainEnd } = lineTimeWindow(
    mainLines,
    mainIndex,
  )
  if (mainStart == null) {
    return -1
  }
  const mainWindowEnd = mainEnd ?? mainStart
  const mainWindowDuration = Math.max(0, mainWindowEnd - mainStart)
  // Largest start-time distance accepted for non-overlapping lines:
  // main duration + 420, clamped to the range [550, 1400].
  const maxDelta = Math.max(550, Math.min(1400, mainWindowDuration + 420))
  let bestIdx = -1
  let bestScore = Number.POSITIVE_INFINITY
  for (let i = 0; i < layerLines.length; i += 1) {
    const { start, end } = lineTimeWindow(layerLines, i)
    // Layer lines with a full window that overlaps (or touches) the main
    // window are scored with the smaller index penalty (30).
    if (start != null && end != null) {
      const overlap = Math.min(end, mainEnd ?? end) - Math.max(start, mainStart)
      if (overlap >= 0) {
        const score = Math.abs(start - mainStart) + Math.abs(i - mainIndex) * 30
        if (score < bestScore) {
          bestScore = score
          bestIdx = i
        }
        continue
      }
    }
    // Otherwise the line qualifies only if its start is within maxDelta of
    // the main start; these use the larger index penalty (45).
    if (start != null) {
      if (Math.abs(start - mainStart) > maxDelta) {
        continue
      }
      const score = Math.abs(start - mainStart) + Math.abs(i - mainIndex) * 45
      if (score < bestScore) {
        bestScore = score
        bestIdx = i
      }
    }
  }
  return bestIdx
 }
 // Returns the matching layer line together with its index, as
 // { index, line }. `line` is null when findLayerLineIndexForMain finds no
 // match (index < 0).
 export const resolveLayerLineForMain = (mainLines, layerLines, mainIndex) => {
  const index = findLayerLineIndexForMain(mainLines, layerLines, mainIndex)
  const line = index >= 0 ? layerLines[index] : null
  return { index, line }
 }
--- a/ui/src/audioplayer/lyrics.test.js
+++ b/ui/src/audioplayer/lyrics.test.js
@ -0,0 +1,416 @@
 import {
  buildKaraokeLines,
  findLayerLineIndexForMain,
  getPreferredLyricLanguage,
  getActiveKaraokeState,
  hasStructuredLyricContent,
  pickStructuredLyric,
  resolveKaraokeTokenWindow,
  resolveLayerLineForMain,
  selectLyricLayers,
  structuredLyricToLrc,
  structuredLyricsToLrc,
 } from './lyrics'
 describe('lyrics helpers', () => {
  // Start every test from an empty localStorage so a value stored by one test
  // (such as the 'locale' key) cannot leak into another.
  beforeEach(() => {
    localStorage.clear()
  })
  // Both tracks are synced; the requested 'pt-BR' language matches the second
  // track, so that track must be picked over the English one listed first.
  it('prefers a lyric track that matches the locale', () => {
    const selected = pickStructuredLyric(
      [
        {
          lang: 'eng',
          synced: true,
          line: [{ start: 1000, value: 'English line' }],
        },
        {
          lang: 'pt-BR',
          synced: true,
          line: [{ start: 1000, value: 'Linha em portugues' }],
        },
      ],
      'pt-BR',
    )
    expect(selected.lang).toBe('pt-BR')
  })
  // No track matches 'pt-BR'; the 'eng' track is expected as the fallback even
  // though a 'deu' track is also available.
  it('falls back to english when preferred locale is not available', () => {
    const selected = pickStructuredLyric(
      [
        {
          lang: 'eng',
          synced: true,
          line: [{ start: 1000, value: 'English line' }],
        },
        {
          lang: 'deu',
          synced: true,
          line: [{ start: 1000, value: 'Deutsche Zeile' }],
        },
      ],
      'pt-BR',
    )
    expect(selected.lang).toBe('eng')
  })
  // Neither the preferred locale nor English is present, so the first synced
  // track in the list ('jpn') is the expected result.
  it('falls back to first synced track when english is missing', () => {
    const selected = pickStructuredLyric(
      [
        {
          lang: 'jpn',
          synced: true,
          line: [{ start: 1000, value: 'Nihongo' }],
        },
        {
          lang: 'deu',
          synced: true,
          line: [{ start: 1000, value: 'Deutsch' }],
        },
      ],
      'pt-BR',
    )
    expect(selected.lang).toBe('jpn')
  })
  // One track per kind ('main', 'translation', 'pronunciation'); each must end
  // up in the slot of the returned object that carries its kind's name.
  it('selects translation and pronunciation layers by kind', () => {
    const layers = selectLyricLayers(
      [
        {
          kind: 'main',
          lang: 'ja',
          synced: true,
          line: [{ start: 1000, value: 'こんにちは' }],
        },
        {
          kind: 'translation',
          lang: 'es',
          synced: true,
          line: [{ start: 1000, value: 'Hola' }],
        },
        {
          kind: 'pronunciation',
          lang: 'ja-Latn',
          synced: true,
          line: [{ start: 1000, value: 'konnichiwa' }],
        },
      ],
      'es-MX',
    )
    expect(layers.main.lang).toBe('ja')
    expect(layers.translation.lang).toBe('es')
    expect(layers.pronunciation.lang).toBe('ja-Latn')
  })
  // A track without a `kind` field must be treated as the main track, and the
  // translation and pronunciation slots must be null.
  it('treats missing kind as main for backward compatibility', () => {
    const layers = selectLyricLayers(
      [
        {
          lang: 'eng',
          synced: true,
          line: [{ start: 1000, value: 'Main' }],
        },
      ],
      'eng',
    )
    expect(layers.main.lang).toBe('eng')
    expect(layers.translation).toBeNull()
    expect(layers.pronunciation).toBeNull()
  })
  // Layer lines are in the same order as the main lines but with slightly
  // shifted start/end times; each main line must resolve to the layer line
  // whose window overlaps it.
  it('matches layer line by timing for the active main line', () => {
    const mainLines = [
      { index: 0, start: 1000, end: 1800, value: 'Line A', tokens: [] },
      { index: 1, start: 2000, end: 2800, value: 'Line B', tokens: [] },
    ]
    const layerLines = [
      { index: 0, start: 900, end: 1750, value: 'A2', tokens: [] },
      { index: 1, start: 2050, end: 2900, value: 'B2', tokens: [] },
    ]
    expect(findLayerLineIndexForMain(mainLines, layerLines, 1)).toBe(1)
    expect(resolveLayerLineForMain(mainLines, layerLines, 0).line.value).toBe(
      'A2',
    )
  })
  // The layer array is ordered C2, A2, B2, so array positions do not line up
  // with the main lines; matching must follow timing (main line 1 resolves to
  // array position 2, main line 2 resolves to 'C2').
  it('matches metadata layers by nearest timing even when indexes differ', () => {
    const mainLines = [
      { index: 0, start: 1000, end: 1800, value: 'Line A', tokens: [] },
      { index: 1, start: 2000, end: 2800, value: 'Line B', tokens: [] },
      { index: 2, start: 3000, end: 3800, value: 'Line C', tokens: [] },
    ]
    const layerLines = [
      { index: 2, start: 3020, end: 3820, value: 'C2', tokens: [] },
      { index: 0, start: 980, end: 1760, value: 'A2', tokens: [] },
      { index: 1, start: 2010, end: 2810, value: 'B2', tokens: [] },
    ]
    expect(findLayerLineIndexForMain(mainLines, layerLines, 1)).toBe(2)
    expect(resolveLayerLineForMain(mainLines, layerLines, 2).line.value).toBe(
      'C2',
    )
  })
  // The only layer line starts at 60000 ms, far from the main line at 2000 ms
  // and not overlapping it, so no match (-1 / null) is expected.
  it('returns no layer match when the nearest line is too far in time', () => {
    const mainLines = [
      { index: 0, start: 1000, end: 1800, value: 'Line A', tokens: [] },
      { index: 1, start: 2000, end: 2800, value: 'Line B', tokens: [] },
    ]
    const layerLines = [
      { index: 0, start: 60000, end: 60800, value: 'Far line', tokens: [] },
    ]
    expect(findLayerLineIndexForMain(mainLines, layerLines, 1)).toBe(-1)
    expect(resolveLayerLineForMain(mainLines, layerLines, 1).line).toBeNull()
  })
  // Each line becomes "[mm:ss.cc] text\n"; the 22801 ms start is expected to
  // render as 00:22.80, i.e. the timestamp carries two fractional digits.
  it('converts a structured lyric track to LRC', () => {
    const lrc = structuredLyricToLrc({
      lang: 'eng',
      synced: true,
      line: [
        { start: 18800, value: "We're no strangers to love" },
        { start: 22801, value: 'You know the rules and so do I' },
      ],
    })
    expect(lrc).toBe(
      "[00:18.80] We're no strangers to love\n[00:22.80] You know the rules and so do I\n",
    )
  })
  // The only track is unsynced (no start times), so the LRC output must be an
  // empty string.
  it('returns empty text when no synced lyrics are available', () => {
    const lrc = structuredLyricsToLrc(
      [{ lang: 'eng', synced: false, line: [{ value: 'Unsynced line' }] }],
      'eng',
    )
    expect(lrc).toBe('')
  })
  // A value stored under the 'locale' key in localStorage must be returned as
  // the preferred lyric language.
  it('reads preferred language from localStorage first', () => {
    localStorage.setItem('locale', 'pt-BR')
    expect(getPreferredLyricLanguage()).toBe('pt-BR')
  })
  // The API field `token` is expected to surface as `tokens` on the built
  // line, and a token without a role is expected to get role ''.
  it('builds karaoke lines from tokenLine payload', () => {
    const lines = buildKaraokeLines({
      lang: 'eng',
      synced: true,
      line: [{ start: 1000, end: 3000, value: 'Hello world' }],
      tokenLine: [
        {
          index: 0,
          start: 1000,
          end: 3000,
          value: 'Hello world',
          token: [
            { start: 1000, end: 1500, value: 'Hello' },
            { start: 2000, end: 2500, value: 'world', role: 'x-bg' },
          ],
        },
      ],
    })
    expect(lines).toEqual([
      {
        index: 0,
        start: 1000,
        end: 3000,
        value: 'Hello world',
        tokens: [
          { start: 1000, end: 1500, value: 'Hello', role: '' },
          { start: 2000, end: 2500, value: 'world', role: 'x-bg' },
        ],
      },
    ])
  })
  // Tokens arrive out of order ('world' at 2000 ms before 'Hello' at 1000 ms);
  // the built line must list them in ascending start order.
  it('sorts token timing by start to keep playback stable', () => {
    const lines = buildKaraokeLines({
      lang: 'eng',
      synced: true,
      line: [{ start: 1000, end: 3000, value: 'Hello world' }],
      tokenLine: [
        {
          index: 0,
          start: 1000,
          end: 3000,
          value: 'Hello world',
          token: [
            { start: 2000, end: 2500, value: 'world', role: '' },
            { start: 1000, end: 1500, value: 'Hello', role: '' },
          ],
        },
      ],
    })
    expect(lines[0].tokens.map((token) => token.value)).toEqual([
      'Hello',
      'world',
    ])
  })
  // A single token spanning the whole line is expected to be split at the
  // whitespace into two tokens ('Da-la-lun, ' and 'dun'), whose windows divide
  // the 1000-2000 ms line evenly at 1500 ms.
  it('splits a single full-line token into synthetic word tokens', () => {
    const lines = buildKaraokeLines({
      lang: 'ko-Latn',
      synced: true,
      line: [{ start: 1000, end: 2000, value: 'Da-la-lun, dun' }],
      tokenLine: [
        {
          index: 0,
          start: 1000,
          end: 2000,
          value: 'Da-la-lun, dun',
          token: [{ start: 1000, end: 2000, value: 'Da-la-lun, dun' }],
        },
      ],
    })
    expect(lines).toHaveLength(1)
    expect(lines[0].tokens).toHaveLength(2)
    expect(lines[0].tokens[0].value).toBe('Da-la-lun, ')
    expect(lines[0].tokens[1].value).toBe('dun')
    const firstWindow = resolveKaraokeTokenWindow(lines[0], 0)
    const secondWindow = resolveKaraokeTokenWindow(lines[0], 1)
    expect(firstWindow.start).toBeCloseTo(1000)
    expect(firstWindow.end).toBeCloseTo(1500)
    expect(secondWindow.start).toBeCloseTo(1500)
    expect(secondWindow.end).toBeCloseTo(2000)
  })
  // At 2200 ms playback is inside the first line (1000-3000) and inside its
  // second token (2000-2500), so line 0 / token 1 is expected.
  it('detects active line and token for karaoke timing', () => {
    const state = getActiveKaraokeState(
      [
        {
          index: 0,
          start: 1000,
          end: 3000,
          value: 'Hello world',
          tokens: [
            { start: 1000, end: 1500, value: 'Hello', role: '' },
            { start: 2000, end: 2500, value: 'world', role: '' },
          ],
        },
        {
          index: 1,
          start: 3500,
          end: 5000,
          value: 'Second line',
          tokens: [],
        },
      ],
      2200,
    )
    expect(state).toEqual({ lineIndex: 0, tokenIndex: 1 })
  })
  // Tokens have a start but no end: the first token's window is expected to
  // end at the next token's start, and the last token's at the line's end.
  it('resolves token window fallback boundaries from neighboring tokens', () => {
    const line = {
      start: 1000,
      end: 3000,
      value: 'Hello world',
      tokens: [
        { start: 1200, value: 'Hello', role: '' },
        { start: 1800, value: 'world', role: '' },
      ],
    }
    expect(resolveKaraokeTokenWindow(line, 0)).toEqual({
      start: 1200,
      end: 1800,
    })
    expect(resolveKaraokeTokenWindow(line, 1)).toEqual({
      start: 1800,
      end: 3000,
    })
  })
  // Tokens carry no timing at all: the 1000-2000 ms line is expected to be
  // divided into three equal consecutive windows.
  it('infers sequential token windows when token timings are missing', () => {
    const line = {
      start: 1000,
      end: 2000,
      value: 'A B C',
      tokens: [
        { value: 'A', role: '' },
        { value: 'B', role: '' },
        { value: 'C', role: '' },
      ],
    }
    const first = resolveKaraokeTokenWindow(line, 0)
    const second = resolveKaraokeTokenWindow(line, 1)
    const third = resolveKaraokeTokenWindow(line, 2)
    expect(first.start).toBeCloseTo(1000)
    expect(first.end).toBeCloseTo(1333.3333333333333)
    expect(second.start).toBeCloseTo(1333.3333333333333)
    expect(second.end).toBeCloseTo(1666.6666666666667)
    expect(third.start).toBeCloseTo(1666.6666666666667)
    expect(third.end).toBeCloseTo(2000)
  })
  // Every token repeats the full line window (1000-2000 ms), i.e. the timings
  // are "collapsed"; the same three equal consecutive windows as for missing
  // timings are expected.
  it('falls back to sequential windows when token timings are collapsed', () => {
    const line = {
      start: 1000,
      end: 2000,
      value: 'A B C',
      tokens: [
        { start: 1000, end: 2000, value: 'A', role: '' },
        { start: 1000, end: 2000, value: 'B', role: '' },
        { start: 1000, end: 2000, value: 'C', role: '' },
      ],
    }
    const first = resolveKaraokeTokenWindow(line, 0)
    const second = resolveKaraokeTokenWindow(line, 1)
    const third = resolveKaraokeTokenWindow(line, 2)
    expect(first.start).toBeCloseTo(1000)
    expect(first.end).toBeCloseTo(1333.3333333333333)
    expect(second.start).toBeCloseTo(1333.3333333333333)
    expect(second.end).toBeCloseTo(1666.6666666666667)
    expect(third.start).toBeCloseTo(1666.6666666666667)
    expect(third.end).toBeCloseTo(2000)
  })
  // At 1108 ms the first token has already ended (1100) and the second has not
  // yet started (1110); the state is expected to stay on token 0 rather than
  // jump ahead.
  it('keeps token selection stable near tight token boundaries', () => {
    const state = getActiveKaraokeState(
      [
        {
          index: 0,
          start: 1000,
          end: 2000,
          value: 'A B',
          tokens: [
            { start: 1000, end: 1100, value: 'A', role: '' },
            { start: 1110, end: 1300, value: 'B', role: '' },
          ],
        },
      ],
      1108,
    )
    expect(state).toEqual({ lineIndex: 0, tokenIndex: 0 })
  })
  // A payload that has only `tokenLine` data (no `line` array) must still
  // count as having structured lyric content.
  it('reports structured lyric content when token timing exists', () => {
    expect(
      hasStructuredLyricContent({
        tokenLine: [{ token: [{ start: 100, value: 'a' }] }],
      }),
    ).toBe(true)
  })
 })
--- a/ui/src/reducers/playerReducer.js
+++ b/ui/src/reducers/playerReducer.js
@ -7,6 +7,7 @@ import {
  PLAYER_CURRENT,
  PLAYER_PLAY_NEXT,
  PLAYER_PLAY_TRACKS,
  PLAYER_UPDATE_LYRIC,
  PLAYER_SET_TRACK,
  PLAYER_SET_VOLUME,
  PLAYER_SYNC_QUEUE,
@ -60,21 +61,25 @@ const mapToAudioLists = (item) => {
  let lyricText = ''
  if (lyrics) {
-    const structured = JSON.parse(lyrics)
+    try {
-    for (const structuredLyric of structured) {
+      const structured = JSON.parse(lyrics)
-      if (structuredLyric.synced) {
+      for (const structuredLyric of structured) {
-        for (const line of structuredLyric.line) {
+        if (structuredLyric.synced) {
-          let time = Math.floor(line.start / 10)
+          for (const line of structuredLyric.line) {
-          const ms = time % 100
+            let time = Math.floor(line.start / 10)
-          time = Math.floor(time / 100)
+            const ms = time % 100
-          const sec = time % 60
+            time = Math.floor(time / 100)
-          time = Math.floor(time / 60)
+            const sec = time % 60
-          const min = time % 60
+            time = Math.floor(time / 60)
            const min = time % 60
-          ms.toString()
+            ms.toString()
-          lyricText += `[${pad(min)}:${pad(sec)}.${pad(ms)}] ${line.value}\n`
+            lyricText += `[${pad(min)}:${pad(sec)}.${pad(ms)}] ${line.value}\n`
          }
        }
      }
    } catch {
      lyricText = ''
    }
  }
@ -206,6 +211,45 @@ const reduceMode = (state, { data: { mode } }) => {
  }
 }
 /**
  * Sets `lyric` on every queue item whose `trackId` matches the action.
  *
  * The incoming state object is returned untouched when `trackId` is falsy or
  * when no matching queue item has a different lyric. Otherwise a new state is
  * returned with a new queue; `current` is also replaced with an updated copy
  * when its `trackId` matches.
  *
  * @param {Object} state - player state; `state.queue` must be an array
  * @param {Object} action - action whose `data` carries `trackId` and `lyric`
  * @returns {Object} the same state reference, or an updated copy
  */
 const reduceUpdateLyric = (state, { data: { trackId, lyric } }) => {
  if (!trackId) {
    return state
  }

  // True for queue entries of this track that still hold a different lyric.
  const isStale = (item) => item.trackId === trackId && item.lyric !== lyric
  if (!state.queue.some(isStale)) {
    return state
  }

  const queue = state.queue.map((item) =>
    isStale(item) ? { ...item, lyric } : item,
  )

  let current = state.current
  if (current?.trackId === trackId) {
    current = { ...current, lyric }
  }

  return { ...state, queue, current }
 }
 export const playerReducer = (previousState = initialState, payload) => {
  const { type } = payload
  switch (type) {
@ -243,6 +287,8 @@ export const playerReducer = (previousState = initialState, payload) => {
          previousState.savedPlayIndex >= 0 ? previousState.savedPlayIndex : 0,
      }
    }
    case PLAYER_UPDATE_LYRIC:
      return reduceUpdateLyric(previousState, payload)
    default:
      return previousState
  }
--- a/ui/src/reducers/playerReducer.test.js
+++ b/ui/src/reducers/playerReducer.test.js
@ -1,11 +1,24 @@
-import { describe, it, expect } from 'vitest'
+import { describe, expect, it, vi } from 'vitest'
 import { playerReducer } from './playerReducer'
 import {
  PLAYER_SYNC_QUEUE,
  PLAYER_CURRENT,
  PLAYER_REFRESH_QUEUE,
  PLAYER_SET_TRACK,
  PLAYER_SYNC_QUEUE,
  PLAYER_UPDATE_LYRIC,
 } from '../actions'
 // Replace uuid with a fixed value so generated ids are deterministic in tests.
 vi.mock('uuid', () => ({
  v4: () => 'test-uuid',
 }))
 // Stub the subsonic module's default export with fixed URL builders, so the
 // reducer under test does not depend on the real implementation.
 vi.mock('../subsonic', () => ({
  default: {
    streamUrl: vi.fn((id) => `/rest/stream?id=${id}`),
    getCoverArtUrl: vi.fn(() => '/rest/getCoverArt?id=test'),
  },
 }))
 describe('playerReducer', () => {
  describe('pending track selection survives SYNC_QUEUE and premature CURRENT', () => {
    // Simulates the real sequence when clicking a new song while one is playing:
@ -54,8 +67,6 @@ describe('playerReducer', () => {
    })
    it('CURRENT for old track preserves pending playIndex', () => {
      // After SYNC_QUEUE, queue has new UUIDs. The old track's UUID (zzz)
      // is at index 2, but playIndex is 0. This is a premature callback.
      const stateAfterSync = {
        ...stateAfterPlayTracks,
        queue: [
@ -71,7 +82,7 @@ describe('playerReducer', () => {
      const result = playerReducer(stateAfterSync, action)
      expect(result.playIndex).toBe(0)
      expect(result.clear).toBe(true)
-      expect(result.savedPlayIndex).toBe(2) // preserved from before
+      expect(result.savedPlayIndex).toBe(2)
    })
    it('CURRENT for correct track consumes pending playIndex', () => {
@ -83,7 +94,6 @@ describe('playerReducer', () => {
          { trackId: 's3', uuid: 'zzz', name: 'Song 3' },
        ],
      }
      // Player switched to Song 1 (uuid 'xxx', index 0 == playIndex)
      const action = {
        type: PLAYER_CURRENT,
        data: { uuid: 'xxx', name: 'Song 1', volume: 1 },
@ -142,4 +152,80 @@ describe('playerReducer', () => {
      expect(result.playIndex).toBe(0)
    })
  })
  // The track carries a JSON string with one synced and one unsynced lyric
  // entry; only the synced line is expected in the queue item's LRC text.
  it('maps embedded synced lyrics to LRC text', () => {
    const lyrics = JSON.stringify([
      {
        lang: 'eng',
        synced: true,
        line: [{ start: 1000, value: 'Line one' }],
      },
      {
        lang: 'eng',
        synced: false,
        line: [{ value: 'Unsynced line' }],
      },
    ])
    const state = playerReducer(undefined, {
      type: PLAYER_SET_TRACK,
      data: {
        id: 'song-1',
        title: 'Test Song',
        artist: 'Test Artist',
        album: 'Test Album',
        duration: 60,
        lyrics,
      },
    })
    expect(state.queue).toHaveLength(1)
    expect(state.queue[0].lyric).toBe('[00:01.00] Line one\n')
  })
  // Queue a track without lyrics, then dispatch PLAYER_UPDATE_LYRIC for its
  // track id; the queue item must carry the new lyric text afterwards.
  it('updates queue lyric by track id', () => {
    const initial = playerReducer(undefined, {
      type: PLAYER_SET_TRACK,
      data: {
        id: 'song-1',
        title: 'Test Song',
        artist: 'Test Artist',
        album: 'Test Album',
        duration: 60,
      },
    })
    const updated = playerReducer(initial, {
      type: PLAYER_UPDATE_LYRIC,
      data: {
        trackId: 'song-1',
        lyric: '[00:01.00] Updated lyric\n',
      },
    })
    expect(updated.queue[0].lyric).toBe('[00:01.00] Updated lyric\n')
  })
  // An update for a track id that is not in the queue must return the very
  // same state object (checked by reference with toBe).
  it('returns same state when lyric update does not match any track', () => {
    const initial = playerReducer(undefined, {
      type: PLAYER_SET_TRACK,
      data: {
        id: 'song-1',
        title: 'Test Song',
        artist: 'Test Artist',
        album: 'Test Album',
        duration: 60,
      },
    })
    const updated = playerReducer(initial, {
      type: PLAYER_UPDATE_LYRIC,
      data: {
        trackId: 'missing-track',
        lyric: '[00:01.00] Updated lyric\n',
      },
    })
    expect(updated).toBe(initial)
  })
 })
--- a/ui/src/subsonic/index.js
+++ b/ui/src/subsonic/index.js
@ -120,6 +120,10 @@ const getTopSongs = (artist, count = 50) => {
  return httpClient(url('getTopSongs', null, { artist, count }))
 }
 // Builds the 'getLyricsBySongId' request URL for the given id and hands it to
 // httpClient, returning whatever httpClient returns.
 const getLyricsBySongId = (id) => httpClient(url('getLyricsBySongId', id))
 const streamUrl = (id, options) => {
  return baseUrl(
    url('stream', id, {
@ -149,4 +153,5 @@ export default {
  getArtistInfo,
  getTopSongs,
  getSimilarSongs2,
  getLyricsBySongId,
 }
--- a/ui/src/subsonic/index.test.js
+++ b/ui/src/subsonic/index.test.js
@ -1,7 +1,12 @@
 import { vi } from 'vitest'
 import { COVER_ART_SIZE } from '../consts'
 import { httpClient } from '../dataProvider'
 import subsonic from './index'
 // Replace httpClient with a mock that resolves to an empty object, so tests
 // can inspect the URL it was called with without a real request being made.
 vi.mock('../dataProvider', () => ({
  httpClient: vi.fn(() => Promise.resolve({})),
 }))
 describe('getCoverArtUrl', () => {
  beforeEach(() => {
    // Mock window.location
@ -178,3 +183,29 @@ describe('getAvatarUrl', () => {
    expect(url).toContain('username=john')
  })
 })
 // Verifies that subsonic.getLyricsBySongId issues exactly one httpClient call
 // to the getLyricsBySongId REST endpoint with the song id in the query string.
 describe('getLyricsBySongId', () => {
  beforeEach(() => {
    // Reset call counts on all mocks so toHaveBeenCalledTimes(1) is accurate.
    vi.clearAllMocks()
    // Minimal localStorage stand-in: returns fixed values for the three keys
    // below and null for anything else.
    // NOTE(review): presumably these are the credentials the subsonic URL
    // builder reads - confirm against the url helper.
    const localStorageMock = {
      getItem: vi.fn((key) => {
        const values = {
          username: 'testuser',
          'subsonic-token': 'testtoken',
          'subsonic-salt': 'testsalt',
        }
        return values[key] || null
      }),
    }
    Object.defineProperty(window, 'localStorage', { value: localStorageMock })
  })
  it('calls the getLyricsBySongId endpoint', async () => {
    await subsonic.getLyricsBySongId('song-1')
    expect(httpClient).toHaveBeenCalledTimes(1)
    // First argument of the first call is the URL string that was requested.
    const calledUrl = httpClient.mock.calls[0][0]
    expect(calledUrl).toContain('/rest/getLyricsBySongId?')
    expect(calledUrl).toContain('id=song-1')
  })
 })
		`@ -0,0 +1,2 @@`
							`<?xml version="1.0" encoding="UTF-8"?>`
							`<tt xmlns="http://www.w3.org/ns/ttml"><body><div xml:lang="eng"><p begin="00:00:00.00">BOM test line</p></div></body></tt>`