feat: add TTML lyrics support with token-level karaoke and translation/pronunciation layers

Add a full TTML (Timed Text Markup Language) sidecar lyrics parser that extracts word/syllable-level timing from <span> elements, plus translation and pronunciation (transliteration) tracks from Apple Music TTML metadata sections.

Backend changes:
- TTML parser (core/lyrics/ttml.go) with support for all TTML time formats, nested timing contexts, and bare decimal second offsets
- Translation/pronunciation tracks resolved via key-based metadata linking
- Line timing hydration from token-level start/end values
- 'kind' field added to Lyrics model and StructuredLyric API response (main/translation/pronunciation)
- 'tokenLine' array in API response for word-level timing data
- UTF-8 BOM and UTF-16 LE encoding support for TTML files
- Fix for ambiguous time resolution in pronunciation spans (pre-1-minute)

Frontend changes:
- KaraokeLyricsOverlay rewritten with scrollable multi-line layout, word-level wipe highlighting with eased alpha transitions, rAF-driven playback clock with drift correction
- Inline translation (above) and pronunciation (below) each main line, with smart filtering to hide redundant lines (same normalized text)
- TR/PR toggle buttons and layer selection via selectLyricLayers()
- Click-to-seek: click any lyric line to jump to that position
- Customization popover with font-size sliders and color presets for each line type (TR/Default/PR), persisted to localStorage
- Smooth font-size transition between active and inactive lines
- Resizable overlay height via drag handle
- lyrics.js: resolveKaraokeTokenWindow, buildSyntheticWordTokens, findLayerLineIndexForMain, token sorting, collapsed timing detection

API extension (non-breaking, additive):
- tokenLine[].token[] provides per-word start/end timing (ms)
- tokenLine[].index maps back to the corresponding line[] entry
- kind field: 'main', 'translation', 'pronunciation'
- Clients ignoring tokenLine/kind continue to work unchanged
2026-05-03 06:51:16 +00:00 · 2026-02-20 16:54:45 +02:00 · 2026-02-20 16:54:45 +02:00 · c77e0de976
commit c77e0de976
parent ccee33f474
30 changed files with 4644 additions and 59 deletions
--- a/README.md
+++ b/README.md
@ -52,6 +52,7 @@ A share of the revenue helps fund the development of Navidrome at no additional
 - **Multi-platform**, runs on macOS, Linux and Windows. **Docker** images are also provided
 - Ready to use binaries for all major platforms, including **Raspberry Pi**
 - Automatically **monitors your library** for changes, importing new files and reloading new metadata 
+ - Supports synchronized lyrics from sidecar **.lrc** and **.ttml** files (via `lyricspriority`)
 - **Themeable**, modern and responsive **Web interface** based on [Material UI](https://material-ui.com)
 - **Compatible** with all Subsonic/Madsonic/Airsonic [clients](https://www.navidrome.org/docs/overview/#apps)
 - **Transcoding** on the fly. Can be set per user/player. **Opus encoding is supported**
--- a/conf/configuration.go
+++ b/conf/configuration.go
@ -677,7 +677,7 @@ func setViperDefaults() {
 	viper.SetDefault("coverartquality", 75)
 	viper.SetDefault("artistartpriority", "artist.*, album/artist.*, external")
 	viper.SetDefault("discartpriority", "disc*.*, cd*.*, cover.*, folder.*, front.*, discsubtitle, embedded")
-	viper.SetDefault("lyricspriority", ".lrc,.txt,embedded")
+	viper.SetDefault("lyricspriority", ".lrc,.ttml,.txt,embedded")
 	viper.SetDefault("enablegravatar", false)
 	viper.SetDefault("enablefavourites", true)
 	viper.SetDefault("enablestarrating", true)
--- a/core/lyrics/lyrics_test.go
+++ b/core/lyrics/lyrics_test.go
@ -44,6 +44,35 @@ var _ = Describe("sources", func() {
 		},
 	}

+	ttmlLyrics := model.LyricList{
+		model.Lyrics{
+			Kind: "main",
+			Lang: "eng",
+			Line: []model.Line{
+				{
+					Start: gg.P(int64(18800)),
+					Value: "We're no strangers to love",
+				},
+				{
+					Start: gg.P(int64(22800)),
+					Value: "You know the rules and so do I",
+				},
+			},
+			Synced: true,
+		},
+		model.Lyrics{
+			Kind: "main",
+			Lang: "por",
+			Line: []model.Line{
+				{
+					Start: gg.P(int64(18800)),
+					Value: "Nao somos estranhos ao amor",
+				},
+			},
+			Synced: true,
+		},
+	}
+
 	unsyncedLyrics := model.LyricList{
 		model.Lyrics{
 			Lang: "xxx",
@ -80,7 +109,8 @@ var _ = Describe("sources", func() {
 	},
 		Entry("embedded > lrc > txt", "embedded,.lrc,.txt", embeddedLyrics),
 		Entry("lrc > embedded > txt", ".lrc,embedded,.txt", syncedLyrics),
-		Entry("txt > lrc > embedded", ".txt,.lrc,embedded", unsyncedLyrics))
+		Entry("txt > lrc > embedded", ".txt,.lrc,embedded", unsyncedLyrics),
+		Entry("ttml > lrc > embedded", ".ttml,.lrc,embedded", ttmlLyrics))

 	Context("Errors", func() {
 		var RegularUserContext = XContext
--- a/core/lyrics/sources.go
+++ b/core/lyrics/sources.go
@ -5,6 +5,7 @@ import (
 	"errors"
 	"os"
 	"path"
+	"strings"

 	"github.com/navidrome/navidrome/log"
 	"github.com/navidrome/navidrome/model"
@ -36,18 +37,31 @@ func fromExternalFile(ctx context.Context, mf *model.MediaFile, suffix string) (
 		return nil, err
 	}

-	lyrics, err := model.ToLyrics("xxx", string(contents))
-	if err != nil {
-		log.Error(ctx, "error parsing lyric external file", "path", externalLyric, err)
-		return nil, err
-	} else if lyrics == nil {
+	var list model.LyricList
+	if strings.EqualFold(suffix, ".ttml") {
+		list, err = parseTTML(contents)
+		if err != nil {
+			log.Error(ctx, "error parsing ttml external file", "path", externalLyric, err)
+			return nil, err
+		}
+	} else {
+		lyrics, err := model.ToLyrics("xxx", string(contents))
+		if err != nil {
+			log.Error(ctx, "error parsing lyric external file", "path", externalLyric, err)
+			return nil, err
+		}
+		if lyrics != nil {
+			list = model.LyricList{*lyrics}
+		}
+	}
+
+	if len(list) == 0 {
 		log.Trace(ctx, "empty lyrics from external file", "path", externalLyric)
 		return nil, nil
 	}

 	log.Trace(ctx, "retrieved lyrics from external file", "path", externalLyric)
-
-	return model.LyricList{*lyrics}, nil
+	return list, nil
 }

 // fromPlugin attempts to load lyrics from a plugin with the given name.
--- a/core/lyrics/sources_test.go
+++ b/core/lyrics/sources_test.go
@ -109,6 +109,41 @@ var _ = Describe("sources", func() {
 			}))
 		})

+		It("should return synchronized multilingual lyrics from a TTML file", func() {
+			mf := model.MediaFile{Path: "tests/fixtures/test.mp3"}
+			lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
+
+			Expect(err).To(BeNil())
+			Expect(lyrics).To(Equal(model.LyricList{
+				{
+					Kind: "main",
+					Lang: "eng",
+					Line: []model.Line{
+						{
+							Start: gg.P(int64(18800)),
+							Value: "We're no strangers to love",
+						},
+						{
+							Start: gg.P(int64(22800)),
+							Value: "You know the rules and so do I",
+						},
+					},
+					Synced: true,
+				},
+				{
+					Kind: "main",
+					Lang: "por",
+					Line: []model.Line{
+						{
+							Start: gg.P(int64(18800)),
+							Value: "Nao somos estranhos ao amor",
+						},
+					},
+					Synced: true,
+				},
+			}))
+		})
+
 		It("should handle LRC files with UTF-8 BOM marker (issue #4631)", func() {
 			// The function looks for <basePath-without-ext><suffix>, so we need to pass
 			// a MediaFile with .mp3 path and look for .lrc suffix
@ -142,5 +177,33 @@ var _ = Describe("sources", func() {
 			Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(22801))))
 			Expect(lyrics[0].Line[1].Value).To(Equal("You know the rules and so do I"))
 		})
+
+		It("should handle TTML files with UTF-8 BOM marker", func() {
+			mf := model.MediaFile{Path: "tests/fixtures/bom-test.mp3"}
+			lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
+
+			Expect(err).To(BeNil())
+			Expect(lyrics).To(HaveLen(1))
+			Expect(lyrics[0].Kind).To(Equal("main"))
+			Expect(lyrics[0].Synced).To(BeTrue())
+			Expect(lyrics[0].Line).To(HaveLen(1))
+			Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(0))))
+			Expect(lyrics[0].Line[0].Value).To(Equal("BOM test line"))
+		})
+
+		It("should handle UTF-16 LE encoded TTML files", func() {
+			mf := model.MediaFile{Path: "tests/fixtures/bom-utf16-test.mp3"}
+			lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
+
+			Expect(err).To(BeNil())
+			Expect(lyrics).To(HaveLen(1))
+			Expect(lyrics[0].Kind).To(Equal("main"))
+			Expect(lyrics[0].Synced).To(BeTrue())
+			Expect(lyrics[0].Line).To(HaveLen(2))
+			Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(18800))))
+			Expect(lyrics[0].Line[0].Value).To(Equal("UTF16 line one"))
+			Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(22801))))
+			Expect(lyrics[0].Line[1].Value).To(Equal("UTF16 line two"))
+		})
 	})
 })
--- a/core/lyrics/sources_ttml_test.go
+++ b/core/lyrics/sources_ttml_test.go
@ -0,0 +1,92 @@
+package lyrics
+
+import (
+	"context"
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/navidrome/navidrome/model"
+)
+
+// TestFromExternalFileTTML loads the ".ttml" sidecar for the test.mp3 fixture
+// through fromExternalFile and checks that two lyric tracks come back, the
+// first one being English with two lines and a first-line start of 18800.
+func TestFromExternalFileTTML(t *testing.T) {
+	ctx := context.Background()
+	mf := model.MediaFile{Path: fixturePath("test.mp3")}
+
+	lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
+	if err != nil {
+		t.Fatalf("fromExternalFile returned error: %v", err)
+	}
+	if len(lyrics) != 2 {
+		t.Fatalf("expected 2 lyric tracks, got %d", len(lyrics))
+	}
+	if lyrics[0].Lang != "eng" {
+		t.Fatalf("expected first language 'eng', got %q", lyrics[0].Lang)
+	}
+	if len(lyrics[0].Line) != 2 {
+		t.Fatalf("expected 2 english lines, got %d", len(lyrics[0].Line))
+	}
+
+	// Dereference before formatting: %v on a pointer prints its address,
+	// which says nothing about the timestamp that was actually parsed.
+	start := lyrics[0].Line[0].Start
+	if start == nil {
+		t.Fatal("expected first english line start to be 18800, got nil")
+	}
+	if *start != 18800 {
+		t.Fatalf("expected first english line start to be 18800, got %d", *start)
+	}
+}
+
+// TestFromExternalFileTTMLWithUTF8BOM loads the bom-test TTML fixture through
+// fromExternalFile and checks that it yields a single synced track with one
+// line starting at 0.
+func TestFromExternalFileTTMLWithUTF8BOM(t *testing.T) {
+	ctx := context.Background()
+	mf := model.MediaFile{Path: fixturePath("bom-test.ttml")}
+
+	lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
+	if err != nil {
+		t.Fatalf("fromExternalFile returned error: %v", err)
+	}
+	if len(lyrics) != 1 {
+		t.Fatalf("expected 1 lyric track, got %d", len(lyrics))
+	}
+	if !lyrics[0].Synced {
+		t.Fatal("expected BOM TTML lyrics to be synced")
+	}
+	if len(lyrics[0].Line) != 1 {
+		t.Fatalf("expected 1 lyric line, got %d", len(lyrics[0].Line))
+	}
+
+	// Dereference before formatting so a failure reports the parsed value
+	// rather than a pointer address.
+	start := lyrics[0].Line[0].Start
+	if start == nil {
+		t.Fatal("expected first line start 0, got nil")
+	}
+	if *start != 0 {
+		t.Fatalf("expected first line start 0, got %d", *start)
+	}
+}
+
+// TestFromExternalFileTTMLUTF16 loads the bom-utf16-test TTML fixture through
+// fromExternalFile and checks that it yields a single synced track with two
+// lines starting at 18800 and 22801.
+func TestFromExternalFileTTMLUTF16(t *testing.T) {
+	ctx := context.Background()
+	mf := model.MediaFile{Path: fixturePath("bom-utf16-test.ttml")}
+
+	lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
+	if err != nil {
+		t.Fatalf("fromExternalFile returned error: %v", err)
+	}
+	if len(lyrics) != 1 {
+		t.Fatalf("expected 1 lyric track, got %d", len(lyrics))
+	}
+	if !lyrics[0].Synced {
+		t.Fatal("expected UTF16 TTML lyrics to be synced")
+	}
+	if len(lyrics[0].Line) != 2 {
+		t.Fatalf("expected 2 lyric lines, got %d", len(lyrics[0].Line))
+	}
+
+	// Dereference before formatting so a failure reports the parsed value
+	// rather than a pointer address.
+	first := lyrics[0].Line[0].Start
+	if first == nil {
+		t.Fatal("expected first line start 18800, got nil")
+	}
+	if *first != 18800 {
+		t.Fatalf("expected first line start 18800, got %d", *first)
+	}
+
+	second := lyrics[0].Line[1].Start
+	if second == nil {
+		t.Fatal("expected second line start 22801, got nil")
+	}
+	if *second != 22801 {
+		t.Fatalf("expected second line start 22801, got %d", *second)
+	}
+}
+
+// fixturePath returns the path of the named file under tests/fixtures. It
+// probes the path relative to the working directory first and then relative
+// to two directories up, returning the first one that os.Stat accepts. When
+// neither exists the working-directory-relative path is returned unchanged.
+func fixturePath(name string) string {
+	local := filepath.Join("tests", "fixtures", name)
+	twoUp := filepath.Join("..", "..", "tests", "fixtures", name)
+
+	for _, probe := range [...]string{local, twoUp} {
+		if _, statErr := os.Stat(probe); statErr != nil {
+			continue
+		}
+		return probe
+	}
+	return local
+}
--- a/core/lyrics/ttml.go
+++ b/core/lyrics/ttml.go
@ -0,0 +1,886 @@
+package lyrics
+
+import (
+	"bytes"
+	"encoding/xml"
+	"errors"
+	"io"
+	"math"
+	"regexp"
+	"sort"
+	"strconv"
+	"strings"
+
+	"github.com/navidrome/navidrome/log"
+	"github.com/navidrome/navidrome/model"
+	"github.com/navidrome/navidrome/utils/str"
+)
+
+// Default timing parameters used by parseTTML until updateTimingParams
+// overrides them, followed by the values written to the Kind field of the
+// lyrics produced by this parser.
+const (
+	defaultTTMLFrameRate    = 30.0
+	defaultTTMLSubFrameRate = 1.0
+	defaultTTMLTickRate     = 1.0
+
+	ttmlLyricKindMain          = "main"
+	ttmlLyricKindTranslation   = "translation"
+	ttmlLyricKindPronunciation = "pronunciation"
+)
+
+// offsetTimeRegex matches a non-negative decimal number followed by one of the
+// unit suffixes h, m, s, ms, f or t (e.g. "1.5s", "45t").
+var offsetTimeRegex = regexp.MustCompile(`^([0-9]+(?:\.[0-9]+)?)(h|m|s|ms|f|t)$`)
+
+// xmlEncodingRegex matches an XML declaration that carries an encoding
+// attribute; parseTTML uses it to rewrite the declared encoding to UTF-8.
+var xmlEncodingRegex = regexp.MustCompile(`(?i)<\?xml([^>]*?)encoding\s*=\s*["'][^"']+["']([^>]*)\?>`)
+
+// ttmlTimeKind tells resolveTTMLTime how a parsed time value relates to its
+// base time.
+type ttmlTimeKind int
+
+const (
+	// ttmlTimeAbsolute values are used as-is.
+	ttmlTimeAbsolute ttmlTimeKind = iota
+	// ttmlTimeOffset values are added to the base time.
+	ttmlTimeOffset
+	// ttmlTimeAmbiguous values (bare decimals) may be either; resolveTTMLTime
+	// picks one using the parent timing context.
+	ttmlTimeAmbiguous
+)
+
+// ttmlTimingParams holds the rates used to convert frame ("f"), sub-frame and
+// tick ("t") based time expressions into seconds.
+type ttmlTimingParams struct {
+	frameRate    float64
+	subFrameRate float64
+	tickRate     float64
+}
+
+// ttmlTimingContext is the state passed from an element to its descendants:
+// language, accumulated role, and the resolved begin/end times in
+// milliseconds. The has* flags record whether the matching value is set, and
+// invalid marks a context whose own timing attributes could not be parsed.
+type ttmlTimingContext struct {
+	lang     string
+	role     string
+	begin    int64
+	hasBegin bool
+	end      int64
+	hasEnd   bool
+	invalid  bool
+}
+
+// ttmlLineRef remembers a keyed main line together with its position in
+// document order, so metadata lines can be linked and sorted against it.
+type ttmlLineRef struct {
+	order int
+	line  model.Line
+}
+
+// ttmlMetadataEntry is one translation/transliteration line as read from the
+// document: the key of the main line it refers to, the parsed line, and the
+// sequence number assigned by addMetadataEntry.
+type ttmlMetadataEntry struct {
+	key  string
+	line model.Line
+	seq  int
+}
+
+// ttmlResolvedMetadataLine is a metadata line after it has been matched to a
+// main line; order and seq are the sort keys used by buildMetadataLyrics.
+type ttmlResolvedMetadataLine struct {
+	order int
+	seq   int
+	line  model.Line
+}
+
+// ttmlParser carries the XML decoder and everything collected while walking
+// the document.
+type ttmlParser struct {
+	decoder *xml.Decoder
+	params  ttmlTimingParams
+
+	// Main lines grouped by language; mainLangOrder keeps first-seen order.
+	mainLangOrder   []string
+	mainLinesByLang map[string][]model.Line
+
+	// Main lines that carry a key attribute, indexed by that key, plus the
+	// running counter used to number main lines in document order.
+	mainLineRefsByKey map[string]ttmlLineRef
+	mainLineOrder     int
+
+	// Translation entries grouped by language, in first-seen language order.
+	translationLangOrder   []string
+	translationEntriesByLg map[string][]ttmlMetadataEntry
+
+	// Pronunciation entries grouped by language, in first-seen language order.
+	pronunciationLangOrder   []string
+	pronunciationEntriesByLg map[string][]ttmlMetadataEntry
+
+	// metadataSeq is the next sequence number handed out by addMetadataEntry.
+	metadataSeq int
+}
+
+// parseTTML parses a TTML document and returns the lyric tracks found in it:
+// main tracks first, then translation tracks, then pronunciation tracks.
+//
+// Any encoding named in the XML declaration is rewritten to UTF-8 before
+// decoding. A decoder error other than io.EOF aborts parsing and is returned
+// as-is with a nil list.
+func parseTTML(contents []byte) (model.LyricList, error) {
+	normalized := xmlEncodingRegex.ReplaceAll(contents, []byte(`<?xml$1encoding="UTF-8"$2?>`))
+
+	parser := &ttmlParser{
+		decoder: xml.NewDecoder(bytes.NewReader(normalized)),
+		params: ttmlTimingParams{
+			frameRate:    defaultTTMLFrameRate,
+			subFrameRate: defaultTTMLSubFrameRate,
+			tickRate:     defaultTTMLTickRate,
+		},
+		mainLinesByLang:          make(map[string][]model.Line),
+		mainLineRefsByKey:        make(map[string]ttmlLineRef),
+		translationEntriesByLg:   make(map[string][]ttmlMetadataEntry),
+		pronunciationEntriesByLg: make(map[string][]ttmlMetadataEntry),
+	}
+
+	// Top-level elements start from an untimed context with the "unknown"
+	// language code.
+	rootCtx := ttmlTimingContext{lang: "xxx"}
+
+	for {
+		tok, err := parser.decoder.Token()
+		switch {
+		case errors.Is(err, io.EOF):
+			return parser.toLyricList(), nil
+		case err != nil:
+			return nil, err
+		}
+
+		// Only start elements matter at the top level; everything inside
+		// them is consumed by parseElement.
+		if el, isStart := tok.(xml.StartElement); isStart {
+			if err := parser.parseElement(el, rootCtx); err != nil {
+				return nil, err
+			}
+		}
+	}
+}
+
+// parseElement handles one element whose start tag has already been read from
+// the decoder and consumes tokens up to its matching end tag.
+//
+// Dispatch is by lower-cased local name:
+//   - "tt" first updates the timing parameters from its attributes and is then
+//     traversed like any other container;
+//   - "translation" and "transliteration" are handed to parseMetadataTrack;
+//   - "p" is parsed as a main lyric line and stored via addMainLine, unless its
+//     timing is invalid or its text is empty;
+//   - anything else is a container whose child elements are parsed recursively.
+//
+// Decoder errors (including io.EOF before the end tag) are returned unchanged.
+func (p *ttmlParser) parseElement(start xml.StartElement, parent ttmlTimingContext) error {
+	local := strings.ToLower(start.Name.Local)
+	if local == "tt" {
+		p.updateTimingParams(start.Attr)
+	}
+
+	switch local {
+	case "translation":
+		return p.parseMetadataTrack(start, parent, ttmlLyricKindTranslation)
+	case "transliteration":
+		return p.parseMetadataTrack(start, parent, ttmlLyricKindPronunciation)
+	}
+
+	ctx := p.childContext(start.Attr, parent)
+	if local == "p" {
+		// The paragraph is always consumed, even when it ends up discarded,
+		// so the decoder stays positioned after its end tag.
+		lineText, tokens, err := p.parseParagraph(ctx)
+		if err != nil {
+			return err
+		}
+		if ctx.invalid || lineText == "" {
+			return nil
+		}
+
+		parsedLine := model.Line{Value: lineText}
+		if ctx.hasBegin {
+			startMs := ctx.begin
+			parsedLine.Start = &startMs
+		}
+		if ctx.hasEnd {
+			endMs := ctx.end
+			parsedLine.End = &endMs
+		}
+		if len(tokens) > 0 {
+			parsedLine.Token = tokens
+		}
+		// Fill in a missing line start/end from the token timings.
+		parsedLine = hydrateLineTimingFromTokens(parsedLine)
+
+		// The key (if any) is what metadata tracks use to refer to this line.
+		lineKey, _ := attrValue(start.Attr, "key")
+		p.addMainLine(ctx.lang, lineKey, parsedLine)
+		return nil
+	}
+
+	for {
+		token, err := p.decoder.Token()
+		if err != nil {
+			return err
+		}
+
+		switch t := token.(type) {
+		case xml.StartElement:
+			nextParent := ctx
+			if ctx.invalid {
+				// Best effort: ignore invalid timing in container elements, and
+				// continue traversing descendants with parent context.
+				nextParent = parent
+			}
+			if err := p.parseElement(t, nextParent); err != nil {
+				return err
+			}
+		case xml.EndElement:
+			// Nested elements are consumed by the recursive call above, so an
+			// end tag with this name closes the current element.
+			if strings.EqualFold(t.Name.Local, start.Name.Local) {
+				return nil
+			}
+		}
+	}
+}
+
+// parseMetadataTrack consumes a translation or transliteration element (its
+// start tag has already been read) and records every usable <text> child as a
+// metadata entry of the given kind, under the track's normalized language.
+// Other child elements are traversed with parseElement. Decoder errors are
+// returned unchanged.
+func (p *ttmlParser) parseMetadataTrack(start xml.StartElement, parent ttmlTimingContext, kind string) error {
+	ctx := p.childContext(start.Attr, parent)
+	lang := normalizeTTMLLang(ctx.lang)
+
+	for {
+		token, err := p.decoder.Token()
+		if err != nil {
+			return err
+		}
+
+		switch t := token.(type) {
+		case xml.StartElement:
+			if strings.EqualFold(t.Name.Local, "text") {
+				// ok is false when the entry has no usable key, timing or
+				// content; such entries are silently dropped.
+				entry, ok, err := p.parseMetadataText(t, ctx)
+				if err != nil {
+					return err
+				}
+				if ok {
+					p.addMetadataEntry(kind, lang, entry)
+				}
+				continue
+			}
+
+			// Same best-effort rule as parseElement: a track element with
+			// invalid timing does not poison its descendants.
+			nextParent := ctx
+			if ctx.invalid {
+				nextParent = parent
+			}
+			if err := p.parseElement(t, nextParent); err != nil {
+				return err
+			}
+		case xml.EndElement:
+			if strings.EqualFold(t.Name.Local, start.Name.Local) {
+				return nil
+			}
+		}
+	}
+}
+
+// parseMetadataText consumes one <text> element of a metadata track and turns
+// it into a ttmlMetadataEntry keyed by its "for" attribute.
+//
+// The boolean result is false (with a nil error) when the entry must be
+// skipped: the "for" attribute is missing or blank, the element's timing is
+// invalid, or it has neither text nor tokens. The returned entry has no seq
+// set; addMetadataEntry assigns it.
+func (p *ttmlParser) parseMetadataText(start xml.StartElement, parent ttmlTimingContext) (ttmlMetadataEntry, bool, error) {
+	forKey, hasFor := attrValue(start.Attr, "for")
+	forKey = strings.TrimSpace(forKey)
+
+	// Consume the element before any validation so the decoder is always left
+	// after the closing tag, even for entries that get skipped.
+	value, tokens, err := p.parseInlineElement(start, parent)
+	if err != nil {
+		return ttmlMetadataEntry{}, false, err
+	}
+	if !hasFor || forKey == "" {
+		return ttmlMetadataEntry{}, false, nil
+	}
+
+	ctx := p.childContext(start.Attr, parent)
+	if ctx.invalid {
+		return ttmlMetadataEntry{}, false, nil
+	}
+
+	line := model.Line{Value: sanitizeTTMLText(value)}
+	if ctx.hasBegin {
+		startMs := ctx.begin
+		line.Start = &startMs
+	}
+	if ctx.hasEnd {
+		endMs := ctx.end
+		line.End = &endMs
+	}
+	if len(tokens) > 0 {
+		line.Token = tokens
+	}
+	// Fill in a missing line start/end from the token timings.
+	line = hydrateLineTimingFromTokens(line)
+
+	if line.Value == "" && len(line.Token) == 0 {
+		return ttmlMetadataEntry{}, false, nil
+	}
+
+	return ttmlMetadataEntry{key: forKey, line: line}, true, nil
+}
+
+// parseParagraph consumes the content of a <p> element whose start tag has
+// already been read. It returns the sanitized text of the paragraph and the
+// timed tokens collected from its inline children. parent is the timing
+// context that inline children resolve their own timing against.
+//
+// A decoder error yields an empty string, nil tokens and the error.
+func (p *ttmlParser) parseParagraph(parent ttmlTimingContext) (string, []model.Token, error) {
+	var (
+		raw   strings.Builder
+		timed []model.Token
+	)
+
+	for {
+		tok, err := p.decoder.Token()
+		if err != nil {
+			return "", nil, err
+		}
+
+		switch el := tok.(type) {
+		case xml.CharData:
+			raw.WriteString(string(el))
+		case xml.StartElement:
+			childText, childTokens, err := p.parseInlineElement(el, parent)
+			if err != nil {
+				return "", nil, err
+			}
+			raw.WriteString(childText)
+			timed = append(timed, childTokens...)
+		case xml.EndElement:
+			// Inline children consume their own end tags, so the only end
+			// tag that matters here is the paragraph's.
+			if !strings.EqualFold(el.Name.Local, "p") {
+				continue
+			}
+			return sanitizeTTMLText(raw.String()), timed, nil
+		}
+	}
+}
+
+// parseInlineElement consumes an inline element (start tag already read) and
+// returns its raw, unsanitized text together with the timed tokens found in
+// it.
+//
+// A <br> yields "\n" without reading further tokens. For any other element the
+// text of all descendants is concatenated and their tokens are collected. A
+// <span> additionally becomes a token itself when it carries its own
+// begin/end/dur attribute, its timing is valid, its sanitized text is
+// non-empty and none of its descendants already produced a token.
+func (p *ttmlParser) parseInlineElement(start xml.StartElement, parent ttmlTimingContext) (string, []model.Token, error) {
+	local := strings.ToLower(start.Name.Local)
+	if local == "br" {
+		return "\n", nil, nil
+	}
+
+	ctx := p.childContext(start.Attr, parent)
+	// Only elements that declare timing themselves may become tokens; timing
+	// merely inherited from the parent does not count.
+	_, hasBegin := attrValue(start.Attr, "begin")
+	_, hasEnd := attrValue(start.Attr, "end")
+	_, hasDur := attrValue(start.Attr, "dur")
+	hasOwnTiming := hasBegin || hasEnd || hasDur
+
+	var text strings.Builder
+	var tokens []model.Token
+
+	for {
+		token, err := p.decoder.Token()
+		if err != nil {
+			return "", nil, err
+		}
+
+		switch t := token.(type) {
+		case xml.StartElement:
+			value, inlineTokens, err := p.parseInlineElement(t, ctx)
+			if err != nil {
+				return "", nil, err
+			}
+			text.WriteString(value)
+			tokens = append(tokens, inlineTokens...)
+		case xml.EndElement:
+			if !strings.EqualFold(t.Name.Local, start.Name.Local) {
+				continue
+			}
+
+			value := text.String()
+			tokenText := sanitizeTTMLText(value)
+			// len(tokens) == 0 keeps a wrapping span from duplicating the
+			// tokens already produced by its nested spans.
+			if local == "span" && hasOwnTiming && !ctx.invalid && tokenText != "" && len(tokens) == 0 {
+				parsedToken := model.Token{
+					Value: tokenText,
+					Role:  ctx.role,
+				}
+				if ctx.hasBegin {
+					startMs := ctx.begin
+					parsedToken.Start = &startMs
+				}
+				if ctx.hasEnd {
+					endMs := ctx.end
+					parsedToken.End = &endMs
+				}
+				tokens = append(tokens, parsedToken)
+			}
+
+			// The raw text is returned so the caller can sanitize the
+			// assembled line as a whole.
+			return value, tokens, nil
+		case xml.CharData:
+			text.WriteString(string(t))
+		}
+	}
+}
+
+// toLyricList assembles the parser's collected data into the final list: one
+// "main" track per language in first-seen order (languages without lines are
+// omitted), followed by the translation tracks and then the pronunciation
+// tracks.
+func (p *ttmlParser) toLyricList() model.LyricList {
+	capacity := len(p.mainLangOrder) + len(p.translationLangOrder) + len(p.pronunciationLangOrder)
+	out := make(model.LyricList, 0, capacity)
+
+	for _, lang := range p.mainLangOrder {
+		if lines := p.mainLinesByLang[lang]; len(lines) > 0 {
+			out = append(out, model.Lyrics{
+				Kind:   ttmlLyricKindMain,
+				Lang:   lang,
+				Line:   lines,
+				Synced: linesAreSynced(lines),
+			})
+		}
+	}
+
+	translations := p.buildMetadataLyrics(ttmlLyricKindTranslation, p.translationLangOrder, p.translationEntriesByLg)
+	out = append(out, translations...)
+
+	pronunciations := p.buildMetadataLyrics(ttmlLyricKindPronunciation, p.pronunciationLangOrder, p.pronunciationEntriesByLg)
+	return append(out, pronunciations...)
+}
+
+// buildMetadataLyrics turns the collected metadata entries of one kind
+// (translation or pronunciation) into lyric tracks, one per language in
+// langOrder.
+//
+// Within a language only the first entry for each key is used. Entries whose
+// key does not match a keyed main line are skipped with a warning. A missing
+// start or end is copied from the referenced main line, and the resulting
+// lines are ordered by the main line's document order, then by the order in
+// which the entries were read. Languages that end up without lines produce no
+// track.
+func (p *ttmlParser) buildMetadataLyrics(kind string, langOrder []string, entriesByLang map[string][]ttmlMetadataEntry) model.LyricList {
+	res := make(model.LyricList, 0, len(langOrder))
+
+	for _, lang := range langOrder {
+		entries := entriesByLang[lang]
+		if len(entries) == 0 {
+			continue
+		}
+
+		seenKeys := make(map[string]struct{}, len(entries))
+		resolved := make([]ttmlResolvedMetadataLine, 0, len(entries))
+		for _, entry := range entries {
+			// First entry per key wins; the key is marked as seen even if
+			// that first entry is later skipped.
+			if _, exists := seenKeys[entry.key]; exists {
+				continue
+			}
+			seenKeys[entry.key] = struct{}{}
+
+			ref, ok := p.mainLineRefsByKey[entry.key]
+			if !ok {
+				log.Warn("Skipping TTML metadata line without matching key", "kind", kind, "lang", lang, "key", entry.key)
+				continue
+			}
+
+			// Inherit timing from the main line, copying the values so the
+			// two lines never share a pointer.
+			line := entry.line
+			if line.Start == nil && ref.line.Start != nil {
+				startMs := *ref.line.Start
+				line.Start = &startMs
+			}
+			if line.End == nil && ref.line.End != nil {
+				endMs := *ref.line.End
+				line.End = &endMs
+			}
+			line = hydrateLineTimingFromTokens(line)
+
+			if line.Value == "" && len(line.Token) == 0 {
+				continue
+			}
+
+			resolved = append(resolved, ttmlResolvedMetadataLine{
+				order: ref.order,
+				seq:   entry.seq,
+				line:  line,
+			})
+		}
+
+		if len(resolved) == 0 {
+			continue
+		}
+
+		sort.SliceStable(resolved, func(i, j int) bool {
+			if resolved[i].order != resolved[j].order {
+				return resolved[i].order < resolved[j].order
+			}
+			return resolved[i].seq < resolved[j].seq
+		})
+
+		lines := make([]model.Line, len(resolved))
+		for i := range resolved {
+			lines[i] = resolved[i].line
+		}
+
+		res = append(res, model.Lyrics{
+			Kind:   kind,
+			Lang:   lang,
+			Line:   lines,
+			Synced: linesAreSynced(lines),
+		})
+	}
+
+	return res
+}
+
+// addMainLine appends line to the main track of the given language,
+// registering the language on first use. If lineKey is non-blank and not yet
+// registered, the line is also indexed by that key together with its position
+// in document order. The document-order counter advances for every line,
+// keyed or not.
+func (p *ttmlParser) addMainLine(lang string, lineKey string, line model.Line) {
+	normalized := normalizeTTMLLang(lang)
+	existing, known := p.mainLinesByLang[normalized]
+	if !known {
+		p.mainLangOrder = append(p.mainLangOrder, normalized)
+	}
+	p.mainLinesByLang[normalized] = append(existing, line)
+
+	position := p.mainLineOrder
+	p.mainLineOrder++
+
+	key := strings.TrimSpace(lineKey)
+	if key == "" {
+		return
+	}
+	// The first line registered under a key keeps it.
+	if _, taken := p.mainLineRefsByKey[key]; taken {
+		return
+	}
+	p.mainLineRefsByKey[key] = ttmlLineRef{order: position, line: line}
+}
+
+// addMetadataEntry stamps entry with the next sequence number and stores it
+// under the normalized language in the translation or pronunciation
+// collection, registering the language on first use. Any other kind consumes
+// a sequence number but stores nothing.
+func (p *ttmlParser) addMetadataEntry(kind string, lang string, entry ttmlMetadataEntry) {
+	lang = normalizeTTMLLang(lang)
+	entry.seq = p.metadataSeq
+	p.metadataSeq++
+
+	var (
+		langOrder *[]string
+		byLang    map[string][]ttmlMetadataEntry
+	)
+	switch kind {
+	case ttmlLyricKindTranslation:
+		langOrder, byLang = &p.translationLangOrder, p.translationEntriesByLg
+	case ttmlLyricKindPronunciation:
+		langOrder, byLang = &p.pronunciationLangOrder, p.pronunciationEntriesByLg
+	default:
+		return
+	}
+
+	if _, known := byLang[lang]; !known {
+		*langOrder = append(*langOrder, lang)
+	}
+	byLang[lang] = append(byLang[lang], entry)
+}
+
+// childContext derives the timing context of an element from its attributes
+// and the context of its parent.
+//
+// The language is replaced by the element's lang attribute when present, and
+// its role tokens are merged into the inherited role. begin, end and dur are
+// parsed with the parser's timing parameters; if any of them cannot be parsed
+// the returned context has invalid set and its timing fields are not
+// meaningful. Otherwise:
+//   - begin is resolved against the parent's begin (0 if the parent has none),
+//     or inherited when the attribute is absent;
+//   - end is resolved against the element's own resolved begin;
+//   - dur yields begin+dur and wins over end when it finishes earlier;
+//   - with neither end nor dur, the parent's end is inherited.
+func (p *ttmlParser) childContext(attrs []xml.Attr, parent ttmlTimingContext) ttmlTimingContext {
+	ctx := parent
+
+	if lang, ok := attrValue(attrs, "lang"); ok {
+		ctx.lang = normalizeTTMLLang(lang)
+	}
+	if role, ok := attrValue(attrs, "role"); ok {
+		ctx.role = mergeTTMLRoles(ctx.role, role)
+	}
+
+	beginExpr, hasBegin := attrValue(attrs, "begin")
+	endExpr, hasEnd := attrValue(attrs, "end")
+	durExpr, hasDur := attrValue(attrs, "dur")
+
+	if hasBegin {
+		begin, kind, ok := parseTTMLTimeExpression(beginExpr, p.params)
+		if !ok {
+			ctx.invalid = true
+			return ctx
+		}
+
+		base := int64(0)
+		if parent.hasBegin {
+			base = parent.begin
+		}
+		ctx.begin = resolveTTMLTime(begin, kind, base, parent)
+		ctx.hasBegin = true
+	} else {
+		ctx.begin = parent.begin
+		ctx.hasBegin = parent.hasBegin
+	}
+
+	var calculatedEnd int64
+	calculatedHasEnd := false
+
+	if hasEnd {
+		end, kind, ok := parseTTMLTimeExpression(endExpr, p.params)
+		if !ok {
+			ctx.invalid = true
+			return ctx
+		}
+
+		// ctx.begin already equals parent.begin when the element has no
+		// begin of its own, so it is the base in every case.
+		calculatedEnd = resolveTTMLTime(end, kind, ctx.begin, parent)
+		calculatedHasEnd = true
+	}
+
+	if hasDur {
+		dur, ok := parseTTMLDurationExpression(durExpr, p.params)
+		if !ok {
+			ctx.invalid = true
+			return ctx
+		}
+		if ctx.hasBegin {
+			// When both end and dur are given, the earlier of the two wins.
+			durEnd := ctx.begin + dur
+			if !calculatedHasEnd || durEnd < calculatedEnd {
+				calculatedEnd = durEnd
+				calculatedHasEnd = true
+			}
+		}
+	}
+
+	if !calculatedHasEnd && parent.hasEnd {
+		calculatedEnd = parent.end
+		calculatedHasEnd = true
+	}
+
+	ctx.end = calculatedEnd
+	ctx.hasEnd = calculatedHasEnd
+	return ctx
+}
+
+// mergeTTMLRoles appends the whitespace-separated role tokens in added to
+// inherited, skipping tokens that are already present. Comparison is per
+// token, so "x-bg" is not treated as present just because "x-bgvocal" is.
+func mergeTTMLRoles(inherited string, added string) string {
+	added = strings.TrimSpace(added)
+	if added == "" {
+		return inherited
+	}
+	if inherited == "" {
+		return added
+	}
+
+	known := strings.Fields(inherited)
+	for _, candidate := range strings.Fields(added) {
+		duplicate := false
+		for _, existing := range known {
+			if existing == candidate {
+				duplicate = true
+				break
+			}
+		}
+		if duplicate {
+			continue
+		}
+		known = append(known, candidate)
+		inherited += " " + candidate
+	}
+	return inherited
+}
+
+// updateTimingParams refreshes the parser's frame, sub-frame and tick rates
+// from the given attributes. Each of frameRate, subFrameRate and tickRate
+// replaces the current value only when it parses as a number greater than
+// zero. frameRateMultiplier ("numerator denominator") scales the frame rate
+// when both parts parse and the denominator is positive. Each result is then
+// passed through max with the matching default as fallback.
+func (p *ttmlParser) updateTimingParams(attrs []xml.Attr) {
+	// positiveAttr returns the attribute's value when it is a number > 0,
+	// otherwise the current value.
+	positiveAttr := func(name string, current float64) float64 {
+		raw, found := attrValue(attrs, name)
+		if !found {
+			return current
+		}
+		if parsed, err := strconv.ParseFloat(raw, 64); err == nil && parsed > 0 {
+			return parsed
+		}
+		return current
+	}
+
+	frameRate := positiveAttr("frameRate", p.params.frameRate)
+	if raw, found := attrValue(attrs, "frameRateMultiplier"); found {
+		if fields := strings.Fields(raw); len(fields) == 2 {
+			numerator, numErr := strconv.ParseFloat(fields[0], 64)
+			denominator, denErr := strconv.ParseFloat(fields[1], 64)
+			if numErr == nil && denErr == nil && denominator > 0 {
+				frameRate *= numerator / denominator
+			}
+		}
+	}
+
+	subFrameRate := positiveAttr("subFrameRate", p.params.subFrameRate)
+	tickRate := positiveAttr("tickRate", p.params.tickRate)
+
+	p.params = ttmlTimingParams{
+		frameRate:    max(frameRate, defaultTTMLFrameRate),
+		subFrameRate: max(subFrameRate, defaultTTMLSubFrameRate),
+		tickRate:     max(tickRate, defaultTTMLTickRate),
+	}
+}
+
+// parseTTMLDurationExpression parses a dur attribute value with the same
+// grammar as parseTTMLTimeExpression and returns the parsed value in
+// milliseconds; the absolute/offset classification is discarded. The boolean
+// is false when the expression cannot be parsed.
+func parseTTMLDurationExpression(expr string, params ttmlTimingParams) (int64, bool) {
+	durationMs, _, parsed := parseTTMLTimeExpression(expr, params)
+	return durationMs, parsed
+}
+
+// resolveTTMLTime converts a parsed time value into a document time in
+// milliseconds.
+//
+// Absolute values are returned as-is and offsets are added to base. Ambiguous
+// values (bare decimals) can be read either way; the parent context decides
+// through the heuristics below, with "offset" as the final fallback. An
+// unknown kind is treated as an offset.
+func resolveTTMLTime(value int64, kind ttmlTimeKind, base int64, parent ttmlTimingContext) int64 {
+	switch kind {
+	case ttmlTimeAbsolute:
+		return value
+	case ttmlTimeOffset:
+		return base + value
+	case ttmlTimeAmbiguous:
+		absolute := value
+		offset := base + value
+
+		// No parent timing context → no reference frame for offsets.
+		// Prefer absolute when offset differs (i.e., base > 0).
+		if !parent.hasBegin && !parent.hasEnd && base != 0 {
+			return absolute
+		}
+
+		// With a fully bounded parent, pick the reading that falls inside the
+		// parent's interval when exactly one of them does.
+		if parent.hasBegin && parent.hasEnd {
+			absoluteInParent := absolute >= parent.begin && absolute <= parent.end
+			offsetInParent := offset >= parent.begin && offset <= parent.end
+			if absoluteInParent && !offsetInParent {
+				return absolute
+			}
+			if offsetInParent && !absoluteInParent {
+				return offset
+			}
+		}
+
+		// Otherwise compare both readings against the parent's begin: a value
+		// before the parent's begin is taken as an offset, and a value at or
+		// after it is taken as absolute when the offset reading would be later.
+		if parent.hasBegin {
+			if absolute < parent.begin && offset >= parent.begin {
+				return offset
+			}
+			if absolute >= parent.begin && offset > absolute {
+				return absolute
+			}
+		}
+		return offset
+	default:
+		return base + value
+	}
+}
+
+// parseTTMLTimeExpression parses a TTML time expression into milliseconds and
+// reports how the value has to be resolved.
+//
+// Supported forms:
+//   - a bare non-negative decimal, read as seconds (ttmlTimeAmbiguous);
+//   - a number with unit suffix h, m, s, ms, f or t (ttmlTimeOffset), where
+//     frames and ticks are converted with params;
+//   - clock time with one or two colons (ttmlTimeAbsolute);
+//   - frame-based clock time with three colons (ttmlTimeAbsolute).
+//
+// The boolean is false for empty input, for wallclock(...) and
+// ".begin"/".end" references, and for anything else that does not parse; the
+// other results are then not meaningful.
+func parseTTMLTimeExpression(expr string, params ttmlTimingParams) (int64, ttmlTimeKind, bool) {
+	// Largest bare-seconds value whose millisecond form still fits in an
+	// int64. This also rejects +Inf, which strconv.ParseFloat accepts.
+	const maxBareSeconds = math.MaxInt64 / 1000
+
+	expr = strings.TrimSpace(expr)
+	if expr == "" {
+		return 0, ttmlTimeOffset, false
+	}
+
+	lower := strings.ToLower(expr)
+	if strings.Contains(lower, "wallclock(") ||
+		strings.Contains(lower, ".begin") ||
+		strings.Contains(lower, ".end") {
+		log.Warn("Unsupported TTML time expression", "value", expr)
+		return 0, ttmlTimeOffset, false
+	}
+
+	// Best-effort support for non-standard TTML seen in the wild where a
+	// bare decimal value is used (implicitly seconds), e.g. "0.170".
+	// The upper bound keeps the float-to-int64 conversion well defined;
+	// NaN fails the comparisons and falls through as well.
+	if value, err := strconv.ParseFloat(lower, 64); err == nil && value >= 0 && value <= maxBareSeconds {
+		return int64(math.Round(value * 1000)), ttmlTimeAmbiguous, true
+	}
+
+	if matches := offsetTimeRegex.FindStringSubmatch(lower); len(matches) == 3 {
+		value, err := strconv.ParseFloat(matches[1], 64)
+		if err != nil {
+			return 0, ttmlTimeOffset, false
+		}
+
+		unit := matches[2]
+		seconds := 0.0
+		switch unit {
+		case "h":
+			seconds = value * 60 * 60
+		case "m":
+			seconds = value * 60
+		case "s":
+			seconds = value
+		case "ms":
+			seconds = value / 1000
+		case "f":
+			seconds = value / params.frameRate
+		case "t":
+			seconds = value / params.tickRate
+		default:
+			return 0, ttmlTimeOffset, false
+		}
+
+		return int64(math.Round(seconds * 1000)), ttmlTimeOffset, true
+	}
+
+	colonCount := strings.Count(expr, ":")
+	switch colonCount {
+	case 1, 2:
+		clockMs, ok := parseTTMLClockTime(expr)
+		if !ok {
+			return 0, ttmlTimeAbsolute, false
+		}
+		return clockMs, ttmlTimeAbsolute, true
+	case 3:
+		framesMs, ok := parseTTMLFrameTime(expr, params)
+		if !ok {
+			return 0, ttmlTimeAbsolute, false
+		}
+		return framesMs, ttmlTimeAbsolute, true
+	default:
+		log.Warn("Unsupported TTML time expression", "value", expr)
+		return 0, ttmlTimeOffset, false
+	}
+}
+
+// parseTTMLClockTime parses "mm:ss[.fff]" or "hh:mm:ss[.fff]" into
+// milliseconds. Hours and minutes must be non-negative integers and seconds a
+// finite, non-negative number. The boolean is false for any other input,
+// including negative components, which would otherwise yield a timestamp
+// before the start of the media.
+func parseTTMLClockTime(value string) (int64, bool) {
+	parts := strings.Split(value, ":")
+	if len(parts) != 2 && len(parts) != 3 {
+		return 0, false
+	}
+
+	hours := int64(0)
+	minutesIdx := 0
+	if len(parts) == 3 {
+		h, err := strconv.ParseInt(parts[0], 10, 64)
+		if err != nil || h < 0 {
+			return 0, false
+		}
+		hours = h
+		minutesIdx = 1
+	}
+
+	minutes, err := strconv.ParseInt(parts[minutesIdx], 10, 64)
+	if err != nil || minutes < 0 {
+		return 0, false
+	}
+
+	// strconv.ParseFloat accepts "inf" and "nan" without error, so those
+	// have to be rejected explicitly.
+	seconds, err := strconv.ParseFloat(parts[minutesIdx+1], 64)
+	if err != nil || math.IsNaN(seconds) || math.IsInf(seconds, 0) || seconds < 0 {
+		return 0, false
+	}
+
+	totalSeconds := float64(hours*60*60+minutes*60) + seconds
+	return int64(math.Round(totalSeconds * 1000)), true
+}
+
+// parseTTMLFrameTime parses "hh:mm:ss:ff[.sf]" into milliseconds, where ff is
+// a frame count converted with params.frameRate and sf a sub-frame count
+// converted with params.subFrameRate*params.frameRate. Every component must
+// be non-negative and the frame/sub-frame counts finite; the boolean is false
+// otherwise.
+func parseTTMLFrameTime(value string, params ttmlTimingParams) (int64, bool) {
+	parts := strings.Split(value, ":")
+	if len(parts) != 4 {
+		return 0, false
+	}
+
+	hours, err := strconv.ParseInt(parts[0], 10, 64)
+	if err != nil || hours < 0 {
+		return 0, false
+	}
+
+	minutes, err := strconv.ParseInt(parts[1], 10, 64)
+	if err != nil || minutes < 0 {
+		return 0, false
+	}
+
+	seconds, err := strconv.ParseInt(parts[2], 10, 64)
+	if err != nil || seconds < 0 {
+		return 0, false
+	}
+
+	// strconv.ParseFloat accepts "inf" and "nan" without error, so the
+	// frame counts are checked for being finite and non-negative.
+	frameParts := strings.SplitN(parts[3], ".", 2)
+	frames, err := strconv.ParseFloat(frameParts[0], 64)
+	if err != nil || math.IsNaN(frames) || math.IsInf(frames, 0) || frames < 0 {
+		return 0, false
+	}
+
+	subFrames := 0.0
+	if len(frameParts) == 2 {
+		subFrames, err = strconv.ParseFloat(frameParts[1], 64)
+		if err != nil || math.IsNaN(subFrames) || math.IsInf(subFrames, 0) || subFrames < 0 {
+			return 0, false
+		}
+	}
+
+	totalSeconds := float64(hours*60*60 + minutes*60 + seconds)
+	totalSeconds += frames / params.frameRate
+	totalSeconds += subFrames / (params.subFrameRate * params.frameRate)
+
+	return int64(math.Round(totalSeconds * 1000)), true
+}
+
+// attrValue looks up the first attribute whose local name equals key, ignoring
+// case and namespace, and returns its value with surrounding whitespace
+// removed. The boolean reports whether such an attribute exists.
+func attrValue(attrs []xml.Attr, key string) (string, bool) {
+	for i := range attrs {
+		if !strings.EqualFold(attrs[i].Name.Local, key) {
+			continue
+		}
+		return strings.TrimSpace(attrs[i].Value), true
+	}
+	return "", false
+}
+
+// normalizeTTMLLang trims and lower-cases a language tag. A blank tag becomes
+// "xxx", the placeholder this parser uses when no language is known.
+func normalizeTTMLLang(lang string) string {
+	if cleaned := strings.ToLower(strings.TrimSpace(lang)); cleaned != "" {
+		return cleaned
+	}
+	return "xxx"
+}
+
+// sanitizeTTMLText cleans text extracted from a TTML element: it runs
+// str.SanitizeText, converts "\r\n" and "\r" line endings to "\n", trims every
+// line, and finally trims the result as a whole.
+func sanitizeTTMLText(raw string) string {
+	cleaned := str.SanitizeText(raw)
+	cleaned = strings.ReplaceAll(strings.ReplaceAll(cleaned, "\r\n", "\n"), "\r", "\n")
+
+	parts := strings.Split(cleaned, "\n")
+	trimmed := make([]string, 0, len(parts))
+	for _, part := range parts {
+		trimmed = append(trimmed, strings.TrimSpace(part))
+	}
+	return strings.TrimSpace(strings.Join(trimmed, "\n"))
+}
+
+// linesAreSynced reports whether at least one line, or one token of any line,
+// carries a start time.
+func linesAreSynced(lines []model.Line) bool {
+	for _, line := range lines {
+		if line.Start != nil {
+			return true
+		}
+		for _, tok := range line.Token {
+			if tok.Start != nil {
+				return true
+			}
+		}
+	}
+	return false
+}
+
+// hydrateLineTimingFromTokens fills in a line's missing Start and End from its
+// tokens and returns the updated line. Start becomes the earliest token start;
+// End becomes the latest token end, where a token without an end contributes
+// its start instead. Values already set on the line are never overwritten,
+// and a line without tokens is returned unchanged.
+func hydrateLineTimingFromTokens(line model.Line) model.Line {
+	if len(line.Token) == 0 {
+		return line
+	}
+
+	var (
+		minStart, maxEnd   int64
+		haveStart, haveEnd bool
+	)
+	for _, tok := range line.Token {
+		if tok.Start != nil && (!haveStart || *tok.Start < minStart) {
+			minStart, haveStart = *tok.Start, true
+		}
+
+		end := tok.End
+		if end == nil {
+			end = tok.Start
+		}
+		if end != nil && (!haveEnd || *end > maxEnd) {
+			maxEnd, haveEnd = *end, true
+		}
+	}
+
+	// The line gets pointers to fresh copies, never to a token's own values.
+	if line.Start == nil && haveStart {
+		line.Start = &minStart
+	}
+	if line.End == nil && haveEnd {
+		line.End = &maxEnd
+	}
+	return line
+}
+
+// max returns v unless v is zero or negative, in which case it returns
+// fallback.
+//
+// NOTE(review): despite its name this is not a maximum, and inside this
+// package it shadows the max builtin available since Go 1.21; max(5, 30)
+// yields 5 here.
+func max(v float64, fallback float64) float64 {
+	if !(v <= 0) {
+		return v
+	}
+	return fallback
+}
--- a/core/lyrics/ttml_test.go
+++ b/core/lyrics/ttml_test.go
@ -0,0 +1,398 @@
+package lyrics
+
+import (
+	"testing"
+
+	"github.com/navidrome/navidrome/model"
+)
+
+// TestParseTTML_MultiLanguageAndTiming checks that sibling <div> elements with
+// different xml:lang values become separate tracks, and that offset-time,
+// frame-based clock-time and tick-based expressions are converted to
+// milliseconds using the ttp:* parameters declared on <tt>.
+func TestParseTTML_MultiLanguageAndTiming(t *testing.T) {
+	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttp="http://www.w3.org/ns/ttml#parameter" ttp:frameRate="30" ttp:subFrameRate="2" ttp:tickRate="10">
+  <body>
+    <div xml:lang="eng" begin="1s">
+      <p begin="2s">Line one</p>
+      <p begin="00:00:04:15.1"><span>Line two</span><br/>with break</p>
+    </div>
+    <div xml:lang="por">
+      <p begin="45t">Linha</p>
+    </div>
+  </body>
+</tt>`)
+
+	list, err := parseTTML(content)
+	if err != nil {
+		t.Fatalf("parseTTML returned error: %v", err)
+	}
+	if len(list) != 2 {
+		t.Fatalf("expected 2 lyric tracks, got %d", len(list))
+	}
+
+	eng := list[0]
+	if eng.Lang != "eng" {
+		t.Fatalf("expected first track language 'eng', got %q", eng.Lang)
+	}
+	if !eng.Synced {
+		t.Fatal("expected first track to be synced")
+	}
+	// Guard the indexing below so a parser regression fails with a message
+	// instead of an index-out-of-range panic.
+	if len(eng.Line) != 2 {
+		t.Fatalf("expected 2 lines in first track, got %d", len(eng.Line))
+	}
+	// 1s from the enclosing div plus 2s from the paragraph.
+	assertTimedLine(t, eng.Line[0], 3000, "Line one")
+	// 4s + 15 frames at 30 fps + 1 sub-frame at 2 per frame, rounded to ms.
+	// The expected value shows the div's 1s offset is not added to this
+	// clock-time expression.
+	assertTimedLine(t, eng.Line[1], 4517, "Line two\nwith break")
+
+	por := list[1]
+	if por.Lang != "por" {
+		t.Fatalf("expected second track language 'por', got %q", por.Lang)
+	}
+	if len(por.Line) != 1 {
+		t.Fatalf("expected 1 line in second track, got %d", len(por.Line))
+	}
+	// 45 ticks at a tick rate of 10 per second.
+	assertTimedLine(t, por.Line[0], 4500, "Linha")
+}
+
+// TestParseTTML_UnsupportedCueSkipped checks that a paragraph whose begin uses
+// a wallclock(...) expression is dropped while the remaining paragraph in the
+// same track is still returned.
+func TestParseTTML_UnsupportedCueSkipped(t *testing.T) {
+	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml">
+  <body xml:lang="eng">
+    <div>
+      <p begin="wallclock(2026-01-01T00:00:00Z)">Skip me</p>
+      <p begin="1s">Keep me</p>
+    </div>
+  </body>
+</tt>`)
+
+	list, err := parseTTML(content)
+	if err != nil {
+		t.Fatalf("parseTTML returned error: %v", err)
+	}
+	if len(list) != 1 {
+		t.Fatalf("expected 1 lyric track, got %d", len(list))
+	}
+	// Only the "1s" paragraph should survive.
+	if len(list[0].Line) != 1 {
+		t.Fatalf("expected 1 line in lyric track, got %d", len(list[0].Line))
+	}
+	assertTimedLine(t, list[0].Line[0], 1000, "Keep me")
+}
+
+// TestParseTTML_BeginEndDurWithInheritance checks that offset-time begin values
+// on <body>, <div> and <p> accumulate, and that the language declared on
+// <body> is inherited by the track.
+func TestParseTTML_BeginEndDurWithInheritance(t *testing.T) {
+	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml">
+  <body xml:lang="eng" begin="10s">
+    <div begin="5s" dur="8s">
+      <p begin="1s" dur="2s">First line</p>
+      <p begin="3s" end="5s">Second line</p>
+    </div>
+  </body>
+</tt>`)
+
+	list, err := parseTTML(content)
+	if err != nil {
+		t.Fatalf("parseTTML returned error: %v", err)
+	}
+	if len(list) != 1 {
+		t.Fatalf("expected 1 lyric track, got %d", len(list))
+	}
+	if list[0].Lang != "eng" {
+		t.Fatalf("expected language 'eng', got %q", list[0].Lang)
+	}
+	if len(list[0].Line) != 2 {
+		t.Fatalf("expected 2 lines, got %d", len(list[0].Line))
+	}
+	// 10s (body) + 5s (div) + 1s (p).
+	assertTimedLine(t, list[0].Line[0], 16000, "First line")
+	// 10s (body) + 5s (div) + 3s (p).
+	assertTimedLine(t, list[0].Line[1], 18000, "Second line")
+}
+
+// TestParseTTML_NonStandardBareSecondOffsets checks that time values written as
+// plain numbers without a unit suffix are read as seconds and still combine
+// with the offset declared on <body>.
+func TestParseTTML_NonStandardBareSecondOffsets(t *testing.T) {
+	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml">
+  <body xml:lang="eng" begin="10">
+    <div>
+      <p begin="0.170">First line</p>
+      <p begin="3.710">Second line</p>
+    </div>
+  </body>
+</tt>`)
+
+	list, err := parseTTML(content)
+	if err != nil {
+		t.Fatalf("parseTTML returned error: %v", err)
+	}
+	if len(list) != 1 {
+		t.Fatalf("expected 1 lyric track, got %d", len(list))
+	}
+	if len(list[0].Line) != 2 {
+		t.Fatalf("expected 2 lines, got %d", len(list[0].Line))
+	}
+	// 10 (body) + 0.170 and 10 (body) + 3.710, in milliseconds.
+	assertTimedLine(t, list[0].Line[0], 10170, "First line")
+	assertTimedLine(t, list[0].Line[1], 13710, "Second line")
+}
+
+// TestParseTTML_WordTimingTokens checks that timed <span> children of a <p>
+// are returned as tokens with start/end times, and that a ttm:role on a
+// wrapping span is reported as the role of the nested timed span.
+func TestParseTTML_WordTimingTokens(t *testing.T) {
+	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttm="http://www.w3.org/ns/ttml#metadata">
+  <body xml:lang="eng">
+    <div>
+      <p begin="00:01.000" end="00:03.000">
+        <span begin="00:01.000" end="00:01.400">He</span><span begin="00:01.400" end="00:01.800">llo</span>
+        <span ttm:role="x-bg"><span begin="00:02.000" end="00:02.500">echo</span></span>
+      </p>
+    </div>
+  </body>
+</tt>`)
+
+	list, err := parseTTML(content)
+	if err != nil {
+		t.Fatalf("parseTTML returned error: %v", err)
+	}
+	if len(list) != 1 {
+		t.Fatalf("expected 1 lyric track, got %d", len(list))
+	}
+	if len(list[0].Line) != 1 {
+		t.Fatalf("expected 1 line, got %d", len(list[0].Line))
+	}
+
+	line := list[0].Line[0]
+	// Adjacent spans join without a separator; the span on the next source
+	// line is separated by a newline in the line value.
+	assertTimedLine(t, line, 1000, "Hello\necho")
+	// The paragraph's own end (3000) is expected, not the last token's end (2500).
+	if line.End == nil || *line.End != 3000 {
+		t.Fatalf("expected line end 3000, got %v", line.End)
+	}
+	if len(line.Token) != 3 {
+		t.Fatalf("expected 3 timed tokens, got %d", len(line.Token))
+	}
+
+	assertToken(t, line.Token[0], 1000, 1400, "He", "")
+	assertToken(t, line.Token[1], 1400, 1800, "llo", "")
+	assertToken(t, line.Token[2], 2000, 2500, "echo", "x-bg")
+}
+
+// TestParseTTML_AmbiguousDecimalTimingPrefersAbsoluteWhenInsideParentWindow
+// checks that bare decimal times which already fall inside the parent's
+// begin/end window are kept as absolute positions rather than being added to
+// the parent's begin.
+func TestParseTTML_AmbiguousDecimalTimingPrefersAbsoluteWhenInsideParentWindow(t *testing.T) {
+	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml">
+  <body xml:lang="eng">
+    <div begin="37.870" end="45.570">
+      <p begin="43.444" end="45.570">
+        <span begin="43.444" end="43.716">go</span>
+        <span begin="43.716" end="43.887">go</span>
+      </p>
+    </div>
+  </body>
+</tt>`)
+
+	list, err := parseTTML(content)
+	if err != nil {
+		t.Fatalf("parseTTML returned error: %v", err)
+	}
+	if len(list) != 1 || len(list[0].Line) != 1 {
+		t.Fatalf("expected one parsed lyric line, got %#v", list)
+	}
+
+	line := list[0].Line[0]
+	// 43444, not 37870 + 43444: the paragraph time is taken as absolute.
+	assertTimedLine(t, line, 43444, "go\ngo")
+	if line.End == nil || *line.End != 45570 {
+		t.Fatalf("expected line end 45570, got %v", line.End)
+	}
+	if len(line.Token) != 2 {
+		t.Fatalf("expected 2 timed tokens, got %d", len(line.Token))
+	}
+	assertToken(t, line.Token[0], 43444, 43716, "go", "")
+	assertToken(t, line.Token[1], 43716, 43887, "go", "")
+}
+
+// TestParseTTML_UnsyncedFallback checks that a document without any timing or
+// language information yields a single unsynced track with the default
+// language "xxx" and a line whose start is nil.
+func TestParseTTML_UnsyncedFallback(t *testing.T) {
+	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml">
+  <body>
+    <div>
+      <p>No timing here</p>
+    </div>
+  </body>
+</tt>`)
+
+	list, err := parseTTML(content)
+	if err != nil {
+		t.Fatalf("parseTTML returned error: %v", err)
+	}
+	if len(list) != 1 {
+		t.Fatalf("expected 1 lyric track, got %d", len(list))
+	}
+	if list[0].Lang != "xxx" {
+		t.Fatalf("expected default language 'xxx', got %q", list[0].Lang)
+	}
+	if list[0].Synced {
+		t.Fatal("expected lyric track to be unsynced")
+	}
+	if len(list[0].Line) != 1 {
+		t.Fatalf("expected 1 line, got %d", len(list[0].Line))
+	}
+	if list[0].Line[0].Start != nil {
+		t.Fatalf("expected line start to be nil, got %v", *list[0].Line[0].Start)
+	}
+	if list[0].Line[0].Value != "No timing here" {
+		t.Fatalf("expected line value %q, got %q", "No timing here", list[0].Line[0].Value)
+	}
+}
+
+// TestParseTTML_MetadataTracksByKey checks that <translation> and
+// <transliteration> entries in the iTunes metadata section are returned as
+// separate "translation" and "pronunciation" tracks after the "main" track,
+// that each <text for="..."> is linked to the body paragraph with the matching
+// itunes:key, and that entries referencing an unknown key are dropped.
+func TestParseTTML_MetadataTracksByKey(t *testing.T) {
+	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:itunes="http://music.apple.com/lyric-ttml-internal">
+  <head>
+    <metadata>
+      <iTunesMetadata xmlns="http://music.apple.com/lyric-ttml-internal">
+        <translations>
+          <translation xml:lang="es">
+            <text for="L1">Hola</text>
+            <text for="MISSING">Skip me</text>
+          </translation>
+        </translations>
+        <transliterations>
+          <transliteration xml:lang="ja-Latn">
+            <text for="L2"><span begin="00:02.000" end="00:02.300" xmlns="http://www.w3.org/ns/ttml">ko</span><span begin="00:02.300" end="00:02.600" xmlns="http://www.w3.org/ns/ttml">nni</span></text>
+          </transliteration>
+        </transliterations>
+      </iTunesMetadata>
+    </metadata>
+  </head>
+  <body xml:lang="ja">
+    <div>
+      <p begin="00:01.000" end="00:01.500" itunes:key="L1">こんにちは</p>
+      <p begin="00:02.000" end="00:02.700" itunes:key="L2">こんばんは</p>
+    </div>
+  </body>
+</tt>`)
+
+	list, err := parseTTML(content)
+	if err != nil {
+		t.Fatalf("parseTTML returned error: %v", err)
+	}
+	if len(list) != 3 {
+		t.Fatalf("expected 3 lyric tracks, got %d", len(list))
+	}
+
+	main := list[0]
+	if main.Kind != "main" {
+		t.Fatalf("expected main track kind %q, got %q", "main", main.Kind)
+	}
+	if main.Lang != "ja" {
+		t.Fatalf("expected main track language %q, got %q", "ja", main.Lang)
+	}
+	if len(main.Line) != 2 {
+		t.Fatalf("expected 2 lines in main track, got %d", len(main.Line))
+	}
+
+	translation := list[1]
+	if translation.Kind != "translation" {
+		t.Fatalf("expected translation kind %q, got %q", "translation", translation.Kind)
+	}
+	if translation.Lang != "es" {
+		t.Fatalf("expected translation language %q, got %q", "es", translation.Lang)
+	}
+	// The entry keyed "MISSING" has no matching paragraph and must not appear.
+	if len(translation.Line) != 1 {
+		t.Fatalf("expected 1 translation line, got %d", len(translation.Line))
+	}
+	// The untimed translation takes begin/end from main paragraph L1.
+	assertTimedLine(t, translation.Line[0], 1000, "Hola")
+	if translation.Line[0].End == nil || *translation.Line[0].End != 1500 {
+		t.Fatalf("expected translation line end %d, got %v", 1500, translation.Line[0].End)
+	}
+
+	pronunciation := list[2]
+	if pronunciation.Kind != "pronunciation" {
+		t.Fatalf("expected pronunciation kind %q, got %q", "pronunciation", pronunciation.Kind)
+	}
+	// The language tag is expected in lower case.
+	if pronunciation.Lang != "ja-latn" {
+		t.Fatalf("expected pronunciation language %q, got %q", "ja-latn", pronunciation.Lang)
+	}
+	if len(pronunciation.Line) != 1 {
+		t.Fatalf("expected 1 pronunciation line, got %d", len(pronunciation.Line))
+	}
+	assertTimedLine(t, pronunciation.Line[0], 2000, "konni")
+	// End comes from the last token (2600), not from main paragraph L2 (2700).
+	if pronunciation.Line[0].End == nil || *pronunciation.Line[0].End != 2600 {
+		t.Fatalf("expected pronunciation line end %d, got %v", 2600, pronunciation.Line[0].End)
+	}
+	if len(pronunciation.Line[0].Token) != 2 {
+		t.Fatalf("expected 2 pronunciation tokens, got %d", len(pronunciation.Line[0].Token))
+	}
+	assertToken(t, pronunciation.Line[0].Token[0], 2000, 2300, "ko", "")
+	assertToken(t, pronunciation.Line[0].Token[1], 2300, 2600, "nni", "")
+}
+
+// TestParseTTML_PronunciationBareDecimalEndTimes checks that transliteration
+// spans timed with bare decimal seconds (below one minute) resolve to the
+// absolute millisecond positions written in the file, for both begin and end.
+func TestParseTTML_PronunciationBareDecimalEndTimes(t *testing.T) {
+	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:itunes="http://music.apple.com/lyric-ttml-internal">
+  <head>
+    <metadata>
+      <iTunesMetadata xmlns="http://music.apple.com/lyric-ttml-internal">
+        <transliterations>
+          <transliteration xml:lang="ja-Latn">
+            <text for="L1"><span begin="2.747" end="3.018" xmlns="http://www.w3.org/ns/ttml">I</span> <span begin="3.018" end="3.179" xmlns="http://www.w3.org/ns/ttml">woke</span> <span begin="3.179" end="3.582" xmlns="http://www.w3.org/ns/ttml">up</span></text>
+          </transliteration>
+        </transliterations>
+      </iTunesMetadata>
+    </metadata>
+  </head>
+  <body xml:lang="ja">
+    <div>
+      <p begin="00:02.747" end="00:04.000" itunes:key="L1">起きた</p>
+    </div>
+  </body>
+</tt>`)
+
+	list, err := parseTTML(content)
+	if err != nil {
+		t.Fatalf("parseTTML returned error: %v", err)
+	}
+
+	// Locate the pronunciation track by kind rather than by position.
+	var pronunciation *model.Lyrics
+	for i := range list {
+		if list[i].Kind == "pronunciation" {
+			pronunciation = &list[i]
+			break
+		}
+	}
+	if pronunciation == nil {
+		t.Fatal("expected a pronunciation track")
+	}
+	if len(pronunciation.Line) != 1 {
+		t.Fatalf("expected 1 pronunciation line, got %d", len(pronunciation.Line))
+	}
+
+	line := pronunciation.Line[0]
+	// Spaces between the spans are preserved in the line value.
+	assertTimedLine(t, line, 2747, "I woke up")
+	if len(line.Token) != 3 {
+		t.Fatalf("expected 3 tokens, got %d", len(line.Token))
+	}
+	assertToken(t, line.Token[0], 2747, 3018, "I", "")
+	assertToken(t, line.Token[1], 3018, 3179, "woke", "")
+	assertToken(t, line.Token[2], 3179, 3582, "up", "")
+}
+
+// assertTimedLine fails the test immediately unless line has a non-nil start
+// equal to expectedStart and a value equal to expectedValue. Checks run in
+// that order and only the first mismatch is reported.
+func assertTimedLine(t *testing.T, line model.Line, expectedStart int64, expectedValue string) {
+	t.Helper()
+
+	switch {
+	case line.Start == nil:
+		t.Fatal("expected line start to be set, got nil")
+	case *line.Start != expectedStart:
+		t.Fatalf("expected line start %d, got %d", expectedStart, *line.Start)
+	case line.Value != expectedValue:
+		t.Fatalf("expected line value %q, got %q", expectedValue, line.Value)
+	}
+}
+
+// assertToken fails the test immediately unless token has non-nil start and
+// end times equal to expectedStart and expectedEnd, and its value and role
+// equal expectedValue and expectedRole. Checks run in that order and only the
+// first mismatch is reported.
+func assertToken(t *testing.T, token model.Token, expectedStart int64, expectedEnd int64, expectedValue string, expectedRole string) {
+	t.Helper()
+
+	switch {
+	case token.Start == nil:
+		t.Fatal("expected token start to be set, got nil")
+	case *token.Start != expectedStart:
+		t.Fatalf("expected token start %d, got %d", expectedStart, *token.Start)
+	case token.End == nil:
+		t.Fatal("expected token end to be set, got nil")
+	case *token.End != expectedEnd:
+		t.Fatalf("expected token end %d, got %d", expectedEnd, *token.End)
+	case token.Value != expectedValue:
+		t.Fatalf("expected token value %q, got %q", expectedValue, token.Value)
+	case token.Role != expectedRole:
+		t.Fatalf("expected token role %q, got %q", expectedRole, token.Role)
+	}
+}
--- a/model/lyrics.go
+++ b/model/lyrics.go
@ -11,14 +11,24 @@ import (
 	"github.com/navidrome/navidrome/utils/str"
 )

-type Line struct {
+// Token is one timed fragment (such as a word or syllable) of a lyric Line.
+// Start and End are optional times in milliseconds. Role is an optional tag
+// taken from the lyric source (for example "x-bg") and is omitted when empty.
+type Token struct {
 	Start *int64 `structs:"start,omitempty" json:"start,omitempty"`
+	End   *int64 `structs:"end,omitempty"   json:"end,omitempty"`
 	Value string `structs:"value"           json:"value"`
+	Role  string `structs:"role,omitempty"  json:"role,omitempty"`
+}
+
+// Line is a single lyric line. Start and End are optional times in
+// milliseconds (nil when the line is untimed), Value is the full line text and
+// Token optionally carries finer-grained timing for fragments of the line.
+type Line struct {
+	Start *int64  `structs:"start,omitempty" json:"start,omitempty"`
+	End   *int64  `structs:"end,omitempty"   json:"end,omitempty"`
+	Value string  `structs:"value"           json:"value"`
+	Token []Token `structs:"token,omitempty" json:"token,omitempty"`
 }

 type Lyrics struct {
 	DisplayArtist string `structs:"displayArtist,omitempty" json:"displayArtist,omitempty"`
 	DisplayTitle  string `structs:"displayTitle,omitempty"  json:"displayTitle,omitempty"`
+	Kind          string `structs:"kind,omitempty"          json:"kind,omitempty"`
 	Lang          string `structs:"lang"                    json:"lang"`
 	Line          []Line `structs:"line"                    json:"line"`
 	Offset        *int64 `structs:"offset,omitempty"        json:"offset,omitempty"`
--- a/server/subsonic/helpers.go
+++ b/server/subsonic/helpers.go
@ -478,19 +478,47 @@ func mapExplicitStatus(explicitStatus string) string {

 func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics) responses.StructuredLyric {
 	lines := make([]responses.Line, len(lyrics.Line))
+	tokenLines := make([]responses.TokenLine, 0, len(lyrics.Line))

 	for i, line := range lyrics.Line {
 		lines[i] = responses.Line{
 			Start: line.Start,
 			Value: line.Value,
 		}
+		if len(line.Token) == 0 {
+			continue
+		}
+
+		tokens := make([]responses.LyricToken, len(line.Token))
+		for j, token := range line.Token {
+			tokens[j] = responses.LyricToken{
+				Start: token.Start,
+				End:   token.End,
+				Value: token.Value,
+				Role:  token.Role,
+			}
+		}
+		tokenLines = append(tokenLines, responses.TokenLine{
+			Index: int32(i),
+			Start: line.Start,
+			End:   line.End,
+			Value: line.Value,
+			Token: tokens,
+		})
+	}
+
+	kind := strings.TrimSpace(lyrics.Kind)
+	if kind == "" {
+		kind = "main"
 	}

 	structured := responses.StructuredLyric{
 		DisplayArtist: lyrics.DisplayArtist,
 		DisplayTitle:  lyrics.DisplayTitle,
+		Kind:          kind,
 		Lang:          lyrics.Lang,
 		Line:          lines,
+		TokenLine:     tokenLines,
 		Offset:        lyrics.Offset,
 		Synced:        lyrics.Synced,
 	}
--- a/server/subsonic/media_retrieval.go
+++ b/server/subsonic/media_retrieval.go
@ -98,7 +98,9 @@ func (api *Router) GetLyrics(r *http.Request) (*responses.Subsonic, error) {
 	response := newResponse()
 	lyricsResponse := responses.Lyrics{}
 	response.Lyrics = &lyricsResponse
-	mediaFiles, err := api.ds.MediaFile(r.Context()).GetAll(filter.SongsByArtistTitleWithLyricsFirst(artist, title))
+	opts := filter.SongsByArtistTitleWithLyricsFirst(artist, title)
+	opts.Max = 0
+	mediaFiles, err := api.ds.MediaFile(r.Context()).GetAll(opts)

 	if err != nil {
 		return nil, err
@ -108,25 +110,26 @@ func (api *Router) GetLyrics(r *http.Request) (*responses.Subsonic, error) {
 		return response, nil
 	}

-	structuredLyrics, err := api.lyrics.GetLyrics(r.Context(), &mediaFiles[0])
-	if err != nil {
-		return nil, err
+	for i := range mediaFiles {
+		structuredLyrics, err := api.lyrics.GetLyrics(r.Context(), &mediaFiles[i])
+		if err != nil {
+			return nil, err
+		}
+		if len(structuredLyrics) == 0 {
+			continue
+		}
+
+		lyricsResponse.Artist = artist
+		lyricsResponse.Title = title
+
+		var lyricsText strings.Builder
+		for _, line := range structuredLyrics[0].Line {
+			lyricsText.WriteString(line.Value + "\n")
+		}
+		lyricsResponse.Value = lyricsText.String()
+		break
 	}

-	if len(structuredLyrics) == 0 {
-		return response, nil
-	}
-
-	lyricsResponse.Artist = artist
-	lyricsResponse.Title = title
-
-	var lyricsText strings.Builder
-	for _, line := range structuredLyrics[0].Line {
-		lyricsText.WriteString(line.Value + "\n")
-	}
-
-	lyricsResponse.Value = lyricsText.String()
-
 	return response, nil
 }

--- a/server/subsonic/media_retrieval_test.go
+++ b/server/subsonic/media_retrieval_test.go
@ -186,6 +186,36 @@ var _ = Describe("MediaRetrievalController", func() {
 			Expect(response.Lyrics.Title).To(Equal("Never Gonna Give You Up"))
 			Expect(response.Lyrics.Value).To(Equal("We're no strangers to love\nYou know the rules and so do I\n"))
 		})
+
+		It("should continue searching candidates for sidecar lyrics", func() {
+			conf.Server.LyricsPriority = ".ttml,embedded"
+			r := newGetRequest("artist=Rick+Astley", "title=Never+Gonna+Give+You+Up")
+			baseTime := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC)
+			mockRepo.SetData(model.MediaFiles{
+				{
+					ID:        "1",
+					Path:      "tests/fixtures/01 Invisible (RED) Edit Version.mp3",
+					Artist:    "Rick Astley",
+					Title:     "Never Gonna Give You Up",
+					Lyrics:    "[]",
+					UpdatedAt: baseTime.Add(2 * time.Hour), // Newer, but no TTML sidecar
+				},
+				{
+					ID:        "2",
+					Path:      "tests/fixtures/test.mp3",
+					Artist:    "Rick Astley",
+					Title:     "Never Gonna Give You Up",
+					Lyrics:    "[]",
+					UpdatedAt: baseTime.Add(1 * time.Hour), // Older, but has TTML sidecar
+				},
+			})
+
+			response, err := router.GetLyrics(r)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(response.Lyrics.Artist).To(Equal("Rick Astley"))
+			Expect(response.Lyrics.Title).To(Equal("Never Gonna Give You Up"))
+			Expect(response.Lyrics.Value).To(Equal("We're no strangers to love\nYou know the rules and so do I\n"))
+		})
 	})

 	Describe("GetLyricsBySongId", func() {
@ -202,6 +232,11 @@ var _ = Describe("MediaRetrievalController", func() {

 				Expect(realLyric.DisplayArtist).To(Equal(expectedLyric.DisplayArtist))
 				Expect(realLyric.DisplayTitle).To(Equal(expectedLyric.DisplayTitle))
+				expectedKind := expectedLyric.Kind
+				if expectedKind == "" {
+					expectedKind = "main"
+				}
+				Expect(realLyric.Kind).To(Equal(expectedKind))
 				Expect(realLyric.Lang).To(Equal(expectedLyric.Lang))
 				Expect(realLyric.Synced).To(Equal(expectedLyric.Synced))

@ -222,6 +257,40 @@ var _ = Describe("MediaRetrievalController", func() {
 						Expect(*realLine.Start).To(Equal(*expectedLine.Start))
 					}
 				}
+
+				Expect(realLyric.TokenLine).To(HaveLen(len(expectedLyric.TokenLine)))
+				for j, realTokenLine := range realLyric.TokenLine {
+					expectedTokenLine := expectedLyric.TokenLine[j]
+					Expect(realTokenLine.Index).To(Equal(expectedTokenLine.Index))
+					Expect(realTokenLine.Value).To(Equal(expectedTokenLine.Value))
+					if expectedTokenLine.Start == nil {
+						Expect(realTokenLine.Start).To(BeNil())
+					} else {
+						Expect(*realTokenLine.Start).To(Equal(*expectedTokenLine.Start))
+					}
+					if expectedTokenLine.End == nil {
+						Expect(realTokenLine.End).To(BeNil())
+					} else {
+						Expect(*realTokenLine.End).To(Equal(*expectedTokenLine.End))
+					}
+
+					Expect(realTokenLine.Token).To(HaveLen(len(expectedTokenLine.Token)))
+					for k, realToken := range realTokenLine.Token {
+						expectedToken := expectedTokenLine.Token[k]
+						Expect(realToken.Value).To(Equal(expectedToken.Value))
+						Expect(realToken.Role).To(Equal(expectedToken.Role))
+						if expectedToken.Start == nil {
+							Expect(realToken.Start).To(BeNil())
+						} else {
+							Expect(*realToken.Start).To(Equal(*expectedToken.Start))
+						}
+						if expectedToken.End == nil {
+							Expect(realToken.End).To(BeNil())
+						} else {
+							Expect(*realToken.End).To(Equal(*expectedToken.End))
+						}
+					}
+				}
 			}
 		}

@ -323,6 +392,238 @@ var _ = Describe("MediaRetrievalController", func() {
 				},
 			})
 		})
+
+		It("should return multilingual TTML sidecar lyrics", func() {
+			conf.Server.LyricsPriority = ".ttml,embedded"
+			r := newGetRequest("id=1")
+
+			mockRepo.SetData(model.MediaFiles{
+				{
+					ID:     "1",
+					Path:   "tests/fixtures/test.mp3",
+					Artist: "Rick Astley",
+					Title:  "Never Gonna Give You Up",
+					Lyrics: "[]",
+				},
+			})
+
+			response, err := router.GetLyricsBySongId(r)
+			Expect(err).ToNot(HaveOccurred())
+
+			porTime := int64(18800)
+			ttmlTime := int64(22800)
+			compareResponses(response.LyricsList, responses.LyricsList{
+				StructuredLyrics: responses.StructuredLyrics{
+					{
+						DisplayArtist: "Rick Astley",
+						DisplayTitle:  "Never Gonna Give You Up",
+						Lang:          "eng",
+						Synced:        true,
+						Line: []responses.Line{
+							{
+								Start: &times[0],
+								Value: "We're no strangers to love",
+							},
+							{
+								Start: &ttmlTime,
+								Value: "You know the rules and so do I",
+							},
+						},
+					},
+					{
+						DisplayArtist: "Rick Astley",
+						DisplayTitle:  "Never Gonna Give You Up",
+						Lang:          "por",
+						Synced:        true,
+						Line: []responses.Line{
+							{
+								Start: &porTime,
+								Value: "Nao somos estranhos ao amor",
+							},
+						},
+					},
+				},
+			})
+		})
+
+		It("should return metadata-linked translation and pronunciation tracks from TTML", func() {
+			conf.Server.LyricsPriority = ".ttml,embedded"
+			r := newGetRequest("id=1")
+
+			mockRepo.SetData(model.MediaFiles{
+				{
+					ID:     "1",
+					Path:   "tests/fixtures/test-metadata.mp3",
+					Artist: "Rick Astley",
+					Title:  "Never Gonna Give You Up",
+					Lyrics: "[]",
+				},
+			})
+
+			response, err := router.GetLyricsBySongId(r)
+			Expect(err).ToNot(HaveOccurred())
+
+			mainStartA := int64(1000)
+			mainStartB := int64(2000)
+			tokenStartA := int64(2000)
+			tokenEndA := int64(2300)
+			tokenStartB := int64(2300)
+			tokenEndB := int64(2600)
+			compareResponses(response.LyricsList, responses.LyricsList{
+				StructuredLyrics: responses.StructuredLyrics{
+					{
+						DisplayArtist: "Rick Astley",
+						DisplayTitle:  "Never Gonna Give You Up",
+						Kind:          "main",
+						Lang:          "ja",
+						Synced:        true,
+						Line: []responses.Line{
+							{
+								Start: &mainStartA,
+								Value: "こんにちは",
+							},
+							{
+								Start: &mainStartB,
+								Value: "こんばんは",
+							},
+						},
+					},
+					{
+						DisplayArtist: "Rick Astley",
+						DisplayTitle:  "Never Gonna Give You Up",
+						Kind:          "translation",
+						Lang:          "es",
+						Synced:        true,
+						Line: []responses.Line{
+							{
+								Start: &mainStartA,
+								Value: "Hola",
+							},
+						},
+					},
+					{
+						DisplayArtist: "Rick Astley",
+						DisplayTitle:  "Never Gonna Give You Up",
+						Kind:          "pronunciation",
+						Lang:          "ja-latn",
+						Synced:        true,
+						Line: []responses.Line{
+							{
+								Start: &mainStartB,
+								Value: "konni",
+							},
+						},
+						TokenLine: []responses.TokenLine{
+							{
+								Index: 0,
+								Start: &mainStartB,
+								End:   &tokenEndB,
+								Value: "konni",
+								Token: []responses.LyricToken{
+									{
+										Start: &tokenStartA,
+										End:   &tokenEndA,
+										Value: "ko",
+									},
+									{
+										Start: &tokenStartB,
+										End:   &tokenEndB,
+										Value: "nni",
+									},
+								},
+							},
+						},
+					},
+				},
+			})
+		})
+
+		It("should return tokenized lines for songLyrics v2 clients", func() {
+			r := newGetRequest("id=1")
+
+			lineStart := int64(1000)
+			lineEnd := int64(3000)
+			tokenStartA := int64(1000)
+			tokenEndA := int64(1400)
+			tokenStartB := int64(2000)
+			tokenEndB := int64(2500)
+			lyricsJson, err := json.Marshal(model.LyricList{
+				{
+					Lang:   "eng",
+					Synced: true,
+					Line: []model.Line{
+						{
+							Start: &lineStart,
+							End:   &lineEnd,
+							Value: "Hello echo",
+							Token: []model.Token{
+								{
+									Start: &tokenStartA,
+									End:   &tokenEndA,
+									Value: "Hello",
+								},
+								{
+									Start: &tokenStartB,
+									End:   &tokenEndB,
+									Value: "echo",
+									Role:  "x-bg",
+								},
+							},
+						},
+					},
+				},
+			})
+			Expect(err).ToNot(HaveOccurred())
+
+			mockRepo.SetData(model.MediaFiles{
+				{
+					ID:     "1",
+					Artist: "Rick Astley",
+					Title:  "Never Gonna Give You Up",
+					Lyrics: string(lyricsJson),
+				},
+			})
+
+			response, err := router.GetLyricsBySongId(r)
+			Expect(err).ToNot(HaveOccurred())
+			compareResponses(response.LyricsList, responses.LyricsList{
+				StructuredLyrics: responses.StructuredLyrics{
+					{
+						DisplayArtist: "Rick Astley",
+						DisplayTitle:  "Never Gonna Give You Up",
+						Lang:          "eng",
+						Synced:        true,
+						Line: []responses.Line{
+							{
+								Start: &lineStart,
+								Value: "Hello echo",
+							},
+						},
+						TokenLine: []responses.TokenLine{
+							{
+								Index: 0,
+								Start: &lineStart,
+								End:   &lineEnd,
+								Value: "Hello echo",
+								Token: []responses.LyricToken{
+									{
+										Start: &tokenStartA,
+										End:   &tokenEndA,
+										Value: "Hello",
+									},
+									{
+										Start: &tokenStartB,
+										End:   &tokenEndB,
+										Value: "echo",
+										Role:  "x-bg",
+									},
+								},
+							},
+						},
+					},
+				},
+			})
+		})
 	})
 })

--- a/server/subsonic/opensubsonic.go
+++ b/server/subsonic/opensubsonic.go
@ -11,7 +11,7 @@ func (api *Router) GetOpenSubsonicExtensions(_ *http.Request) (*responses.Subson
 	response.OpenSubsonicExtensions = &responses.OpenSubsonicExtensions{
 		{Name: "transcodeOffset", Versions: []int32{1}},
 		{Name: "formPost", Versions: []int32{1}},
-		{Name: "songLyrics", Versions: []int32{1}},
+		{Name: "songLyrics", Versions: []int32{1, 2}},
 		{Name: "indexBasedQueue", Versions: []int32{1}},
 		{Name: "transcoding", Versions: []int32{1}},
 	}
--- a/server/subsonic/opensubsonic_test.go
+++ b/server/subsonic/opensubsonic_test.go
@ -38,7 +38,7 @@ var _ = Describe("GetOpenSubsonicExtensions", func() {
 			HaveLen(5),
 			ContainElement(responses.OpenSubsonicExtension{Name: "transcodeOffset", Versions: []int32{1}}),
 			ContainElement(responses.OpenSubsonicExtension{Name: "formPost", Versions: []int32{1}}),
-			ContainElement(responses.OpenSubsonicExtension{Name: "songLyrics", Versions: []int32{1}}),
+			ContainElement(responses.OpenSubsonicExtension{Name: "songLyrics", Versions: []int32{1, 2}}),
 			ContainElement(responses.OpenSubsonicExtension{Name: "indexBasedQueue", Versions: []int32{1}}),
 			ContainElement(responses.OpenSubsonicExtension{Name: "transcoding", Versions: []int32{1}}),
 		))
--- a/server/subsonic/responses/responses.go
+++ b/server/subsonic/responses/responses.go
@ -537,13 +537,30 @@ type Line struct {
 	Value string `xml:",chardata"            json:"value"`
 }

+// LyricToken is the API representation of one timed fragment of a lyric line.
+// In XML every field is emitted as an attribute; Start, End and Role are
+// omitted when unset.
+type LyricToken struct {
+	Start *int64 `xml:"start,attr,omitempty" json:"start,omitempty"`
+	End   *int64 `xml:"end,attr,omitempty"   json:"end,omitempty"`
+	Value string `xml:"value,attr"           json:"value"`
+	Role  string `xml:"role,attr,omitempty"  json:"role,omitempty"`
+}
+
+// TokenLine carries the token-level timing for one lyric line. Index is the
+// position of the corresponding entry in the enclosing StructuredLyric's Line
+// slice; Start, End and Value describe the line as a whole and Token lists
+// its timed fragments.
+type TokenLine struct {
+	Index int32        `xml:"index,attr"                    json:"index"`
+	Start *int64       `xml:"start,attr,omitempty"         json:"start,omitempty"`
+	End   *int64       `xml:"end,attr,omitempty"           json:"end,omitempty"`
+	Value string       `xml:"value,attr,omitempty"         json:"value,omitempty"`
+	Token []LyricToken `xml:"token,omitempty"        json:"token,omitempty"`
+}
+
+// StructuredLyric is one lyric track in the API response. Kind identifies the
+// track type ("main", "translation" or "pronunciation") and TokenLine holds
+// optional per-token timing for entries of Line; both are omitted when empty.
 type StructuredLyric struct {
-	DisplayArtist string `xml:"displayArtist,attr,omitempty" json:"displayArtist,omitempty"`
-	DisplayTitle  string `xml:"displayTitle,attr,omitempty"  json:"displayTitle,omitempty"`
-	Lang          string `xml:"lang,attr"                    json:"lang"`
-	Line          []Line `xml:"line"                         json:"line"`
-	Offset        *int64 `xml:"offset,attr,omitempty"        json:"offset,omitempty"`
-	Synced        bool   `xml:"synced,attr"                  json:"synced"`
+	DisplayArtist string      `xml:"displayArtist,attr,omitempty" json:"displayArtist,omitempty"`
+	DisplayTitle  string      `xml:"displayTitle,attr,omitempty"  json:"displayTitle,omitempty"`
+	Kind          string      `xml:"kind,attr,omitempty"          json:"kind,omitempty"`
+	Lang          string      `xml:"lang,attr"                    json:"lang"`
+	Line          []Line      `xml:"line"                         json:"line"`
+	TokenLine     []TokenLine `xml:"tokenLine,omitempty"     json:"tokenLine,omitempty"`
+	Offset        *int64      `xml:"offset,attr,omitempty"        json:"offset,omitempty"`
+	Synced        bool        `xml:"synced,attr"                  json:"synced"`
 }

 type StructuredLyrics []StructuredLyric
--- a/tests/fixtures/bom-test.ttml
+++ b/tests/fixtures/bom-test.ttml
@ -0,0 +1,2 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml"><body><div xml:lang="eng"><p begin="00:00:00.00">BOM test line</p></div></body></tt>
--- a/tests/fixtures/bom-utf16-test.ttml
+++ b/tests/fixtures/bom-utf16-test.ttml
--- a/tests/fixtures/test-metadata.ttml
+++ b/tests/fixtures/test-metadata.ttml
@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:itunes="http://music.apple.com/lyric-ttml-internal">
+  <head>
+    <metadata>
+      <iTunesMetadata xmlns="http://music.apple.com/lyric-ttml-internal">
+        <translations>
+          <translation xml:lang="es">
+            <text for="L1">Hola</text>
+          </translation>
+        </translations>
+        <transliterations>
+          <transliteration xml:lang="ja-Latn">
+            <text for="L2"><span begin="00:02.000" end="00:02.300" xmlns="http://www.w3.org/ns/ttml">ko</span><span begin="00:02.300" end="00:02.600" xmlns="http://www.w3.org/ns/ttml">nni</span></text>
+          </transliteration>
+        </transliterations>
+      </iTunesMetadata>
+    </metadata>
+  </head>
+  <body xml:lang="ja">
+    <div>
+      <p begin="00:01.000" end="00:01.500" itunes:key="L1">こんにちは</p>
+      <p begin="00:02.000" end="00:02.700" itunes:key="L2">こんばんは</p>
+    </div>
+  </body>
+</tt>
--- a/tests/fixtures/test.ttml
+++ b/tests/fixtures/test.ttml
@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttp="http://www.w3.org/ns/ttml#parameter" ttp:frameRate="30" ttp:subFrameRate="2" ttp:tickRate="10">
+  <body>
+    <div xml:lang="eng">
+      <p begin="00:00:18.80">We're no strangers to love</p>
+      <p begin="00:00:22:24">You know the rules and so do I</p>
+    </div>
+    <div xml:lang="por">
+      <p begin="188t">Nao somos estranhos ao amor</p>
+    </div>
+  </body>
+</tt>
--- a/ui/src/actions/player.js
+++ b/ui/src/actions/player.js
@ -9,6 +9,7 @@ export const PLAYER_SET_VOLUME = 'PLAYER_SET_VOLUME'
 export const PLAYER_SET_MODE = 'PLAYER_SET_MODE'
 export const TRANSCODING_SET_PROFILE = 'TRANSCODING_SET_PROFILE'
 export const PLAYER_REFRESH_QUEUE = 'PLAYER_REFRESH_QUEUE'
+export const PLAYER_UPDATE_LYRIC = 'PLAYER_UPDATE_LYRIC'

 export const setTrack = (data) => ({
  type: PLAYER_SET_TRACK,
@ -114,3 +115,8 @@ export const refreshQueue = (resolvedUrls) => ({
  type: PLAYER_REFRESH_QUEUE,
  data: resolvedUrls,
 })
+
+// Builds a PLAYER_UPDATE_LYRIC action whose payload pairs a track id with
+// the lyric to attach to it.
+export const updateQueueLyric = (trackId, lyric) => {
+  const data = { trackId, lyric }
+  return { type: PLAYER_UPDATE_LYRIC, data }
+}
--- a/ui/src/audioplayer/KaraokeLyricsOverlay.jsx
+++ b/ui/src/audioplayer/KaraokeLyricsOverlay.jsx
--- a/ui/src/audioplayer/Player.jsx
+++ b/ui/src/audioplayer/Player.jsx
@ -22,6 +22,7 @@ import {
  refreshQueue,
  setPlayMode,
  setTranscodingProfile,
+  updateQueueLyric,
  setVolume,
  syncQueue,
 } from '../actions'
@ -33,6 +34,25 @@ import { keyMap } from '../hotkeys'
 import keyHandlers from './keyHandlers'
 import { calculateGain } from '../utils/calculateReplayGain'
 import { detectBrowserProfile, decisionService } from '../transcode'
+import {
+  getPreferredLyricLanguage,
+  hasStructuredLyricContent,
+  selectLyricLayers,
+  structuredLyricToLrc,
+} from './lyrics'
+import KaraokeLyricsOverlay from './KaraokeLyricsOverlay'
+
+// Layer set with nothing selected for any lyric kind. Used as the initial
+// state and whenever the selection is reset.
+const emptyLyricLayers = {
+  main: null,
+  translation: null,
+  pronunciation: null,
+}
+
+// Coerces a possibly missing or partial layers object into the full
+// { main, translation, pronunciation } shape, using null for falsy entries.
+const normalizeLyricLayers = (layers) => {
+  const main = layers?.main || null
+  const translation = layers?.translation || null
+  const pronunciation = layers?.pronunciation || null
+  return { main, translation, pronunciation }
+}

 const Player = () => {
  const theme = useCurrentTheme()
@ -120,6 +140,72 @@ const Player = () => {
  const gainInfo = useSelector((state) => state.replayGain)
  const [context, setContext] = useState(null)
  const [gainNode, setGainNode] = useState(null)
+  const lyricCacheRef = useRef(new Map())
+  const lyricRequestIdRef = useRef(0)
+  const playerRef = useRef(null)
+  const [karaokeVisible, setKaraokeVisible] = useState(false)
+  const [selectedLyricLayers, setSelectedLyricLayers] =
+    useState(emptyLyricLayers)
+  const [showTranslation, setShowTranslation] = useState(false)
+  const [showPronunciation, setShowPronunciation] = useState(false)
+  const currentTrackId = playerState.current?.trackId
+  const currentTrackIsRadio = playerState.current?.isRadio
+  const selectedStructuredLyric = selectedLyricLayers.main
+  const hasKaraokeLyric = hasStructuredLyricContent(selectedStructuredLyric)
+  const hasTranslationLyric = hasStructuredLyricContent(
+    selectedLyricLayers.translation,
+  )
+  const hasPronunciationLyric = hasStructuredLyricContent(
+    selectedLyricLayers.pronunciation,
+  )
+
+  // Writes `lyric` into the mounted player instance (reached through
+  // playerRef) for every audioLists entry whose trackId matches, and keeps the
+  // player's top-level `lyric` in sync with the entry currently playing.
+  // No-op when trackId is empty or the ref does not expose setState.
+  const applyLyricToRuntimePlayer = useCallback((trackId, lyric) => {
+    if (!trackId) {
+      return
+    }
+
+    const player = playerRef.current
+    if (!player || typeof player.setState !== 'function') {
+      return
+    }
+
+    player.setState((prevState) => {
+      const prevLists = Array.isArray(prevState.audioLists)
+        ? prevState.audioLists
+        : []
+      // Tracks whether any entry actually received a different lyric.
+      let changed = false
+      const audioLists = prevLists.map((item) => {
+        if (item.trackId !== trackId) {
+          return item
+        }
+        if (item.lyric === lyric) {
+          return item
+        }
+        changed = true
+        return {
+          ...item,
+          lyric,
+        }
+      })
+
+      // The playing entry is identified by its musicSrc.
+      const currentItem = audioLists.find(
+        (item) => item.musicSrc === prevState.musicSrc,
+      )
+      const currentLyric =
+        typeof currentItem?.lyric === 'string'
+          ? currentItem.lyric
+          : prevState.lyric
+
+      // Nothing changed: return null from the updater
+      // (NOTE(review): presumably skips the update/re-render — confirm).
+      if (!changed && currentLyric === prevState.lyric) {
+        return null
+      }
+
+      return {
+        audioLists,
+        lyric: currentLyric,
+      }
+    })
+  }, [])

  useEffect(() => {
    if (
@ -166,6 +252,107 @@ const Player = () => {
    return () => window.removeEventListener('beforeunload', handleBeforeUnload)
  }, [playerState, audioInstance])

+  // On track change: reset the lyric layer selection and the TR/PR toggles.
+  // With no track (or a radio stream) everything is cleared and the overlay is
+  // closed; otherwise layers are restored from the per-track cache if present.
+  useEffect(() => {
+    if (!currentTrackId || currentTrackIsRadio) {
+      setSelectedLyricLayers(emptyLyricLayers)
+      setShowTranslation(false)
+      setShowPronunciation(false)
+      setKaraokeVisible(false)
+      return
+    }
+
+    const cached = lyricCacheRef.current.get(currentTrackId)
+    let layers = emptyLyricLayers
+    if (cached && typeof cached !== 'string') {
+      if (cached.layers) {
+        layers = normalizeLyricLayers(cached.layers)
+      } else if (cached.structuredLyric) {
+        layers = normalizeLyricLayers({
+          main: cached.structuredLyric,
+        })
+      }
+    }
+    setSelectedLyricLayers(layers)
+    setShowTranslation(false)
+    // Pronunciation is shown by default whenever the track has one.
+    setShowPronunciation(hasStructuredLyricContent(layers.pronunciation))
+  }, [currentTrackId, currentTrackIsRadio])
+
+  // Loads lyrics for the current track: serves them from lyricCacheRef when
+  // available, otherwise fetches structured lyrics, selects the layers,
+  // caches the result, and pushes the LRC text to both the redux queue and the
+  // mounted player.
+  useEffect(() => {
+    // Every run bumps the request id, so responses belonging to an earlier
+    // run (previous track) can be recognised and ignored below.
+    lyricRequestIdRef.current += 1
+    const requestId = lyricRequestIdRef.current
+
+    if (!currentTrackId || currentTrackIsRadio) {
+      return
+    }
+
+    const cached = lyricCacheRef.current.get(currentTrackId)
+    if (cached !== undefined) {
+      // Cache entries may be a plain LRC string or an object carrying
+      // `lrc` plus either `layers` or a single `structuredLyric`.
+      const cachedLyric =
+        typeof cached === 'string' ? cached : cached?.lrc || ''
+      const cachedLayers =
+        typeof cached === 'string'
+          ? emptyLyricLayers
+          : cached?.layers
+            ? normalizeLyricLayers(cached.layers)
+            : normalizeLyricLayers({ main: cached?.structuredLyric })
+
+      setSelectedLyricLayers(cachedLayers)
+      setShowTranslation(false)
+      setShowPronunciation(
+        hasStructuredLyricContent(cachedLayers.pronunciation),
+      )
+      if (cachedLyric) {
+        dispatch(updateQueueLyric(currentTrackId, cachedLyric))
+        applyLyricToRuntimePlayer(currentTrackId, cachedLyric)
+      }
+      return
+    }
+
+    subsonic
+      .getLyricsBySongId(currentTrackId)
+      .then((resp) => {
+        // Stale response: a newer run of this effect has started.
+        if (lyricRequestIdRef.current !== requestId) {
+          return
+        }
+
+        const structuredLyrics =
+          resp?.json?.['subsonic-response']?.lyricsList?.structuredLyrics || []
+        const layers = selectLyricLayers(
+          structuredLyrics,
+          getPreferredLyricLanguage(),
+        )
+        const lyric = layers.main ? structuredLyricToLrc(layers.main) : ''
+        // Successful responses are cached even when they contain no lyrics,
+        // so the same track is not requested again.
+        lyricCacheRef.current.set(currentTrackId, {
+          lrc: lyric,
+          layers,
+        })
+        setSelectedLyricLayers(layers)
+        setShowTranslation(false)
+        setShowPronunciation(hasStructuredLyricContent(layers.pronunciation))
+
+        if (lyric !== '') {
+          dispatch(updateQueueLyric(currentTrackId, lyric))
+          applyLyricToRuntimePlayer(currentTrackId, lyric)
+        }
+      })
+      .catch(() => {
+        if (lyricRequestIdRef.current !== requestId) {
+          return
+        }
+        setSelectedLyricLayers(emptyLyricLayers)
+        setShowTranslation(false)
+        setShowPronunciation(false)
+        // Do not cache network/request failures as empty lyrics, so we can retry.
+        lyricCacheRef.current.delete(currentTrackId)
+      })
+  }, [dispatch, currentTrackId, currentTrackIsRadio, applyLyricToRuntimePlayer])
+
+  // Close the karaoke overlay as soon as there is no main lyric left to show.
+  useEffect(() => {
+    const overlayHasNothingToShow = karaokeVisible && !hasKaraokeLyric
+    if (overlayHasNothingToShow) {
+      setKaraokeVisible(false)
+    }
+  }, [hasKaraokeLyric, karaokeVisible])
+
  const defaultOptions = useMemo(
    () => ({
      theme: playerTheme,
@ -177,7 +364,7 @@ const Player = () => {
      clearPriorAudioLists: false,
      showDestroy: true,
      showDownload: false,
-      showLyric: true,
+      showLyric: false,
      showReload: false,
      toggleMode: !isDesktop,
      glassBg: false,
@ -214,12 +401,24 @@ const Player = () => {
        (playerState.clear || playerState.playIndex === 0),
      clearPriorAudioLists: playerState.clear,
      extendsContent: (
-        <PlayerToolbar id={current.trackId} isRadio={current.isRadio} />
+        <PlayerToolbar
+          id={current.trackId}
+          isRadio={current.isRadio}
+          onToggleLyrics={() => setKaraokeVisible((visible) => !visible)}
+          lyricsActive={karaokeVisible}
+          lyricsDisabled={!hasKaraokeLyric}
+        />
      ),
      defaultVolume: isMobilePlayer ? 1 : playerState.volume,
      showMediaSession: !current.isRadio,
    }
-  }, [playerState, defaultOptions, isMobilePlayer])
+  }, [
+    playerState,
+    defaultOptions,
+    isMobilePlayer,
+    karaokeVisible,
+    hasKaraokeLyric,
+  ])

  const onAudioListsChange = useCallback(
    (_, audioLists, audioInfo) => dispatch(syncQueue(audioInfo, audioLists)),
@ -391,6 +590,7 @@ const Player = () => {
  return (
    <ThemeProvider theme={createMuiTheme(theme)}>
      <ReactJkMusicPlayer
+        ref={playerRef}
        {...options}
        className={classes.player}
        onAudioListsChange={onAudioListsChange}
@ -406,6 +606,28 @@ const Player = () => {
        onBeforeDestroy={onBeforeDestroy}
        getAudioInstance={setAudioInstance}
      />
+      <KaraokeLyricsOverlay
+        visible={karaokeVisible}
+        mainLyric={selectedLyricLayers.main}
+        translationLyric={selectedLyricLayers.translation}
+        pronunciationLyric={selectedLyricLayers.pronunciation}
+        showTranslation={showTranslation}
+        showPronunciation={showPronunciation}
+        translationEnabled={hasTranslationLyric}
+        pronunciationEnabled={hasPronunciationLyric}
+        onToggleTranslation={() =>
+          setShowTranslation((previous) =>
+            hasTranslationLyric ? !previous : false,
+          )
+        }
+        onTogglePronunciation={() =>
+          setShowPronunciation((previous) =>
+            hasPronunciationLyric ? !previous : false,
+          )
+        }
+        audioInstance={audioInstance}
+        onClose={() => setKaraokeVisible(false)}
+      />
      <GlobalHotKeys handlers={handlers} keyMap={keyMap} allowChanges />
    </ThemeProvider>
  )
--- a/ui/src/audioplayer/PlayerToolbar.jsx
+++ b/ui/src/audioplayer/PlayerToolbar.jsx
@ -4,7 +4,9 @@ import { useGetOne } from 'react-admin'
 import { GlobalHotKeys } from 'react-hotkeys'
 import IconButton from '@material-ui/core/IconButton'
 import { useMediaQuery } from '@material-ui/core'
+import Tooltip from '@material-ui/core/Tooltip'
 import { RiSaveLine } from 'react-icons/ri'
+import { RiFileMusicLine } from 'react-icons/ri'
 import { LoveButton, useToggleLove } from '../common'
 import { openSaveQueueDialog } from '../actions'
 import { keyMap } from '../hotkeys'
@ -55,7 +57,13 @@ const useStyles = makeStyles((theme) => ({
  },
 }))

-const PlayerToolbar = ({ id, isRadio }) => {
+const PlayerToolbar = ({
+  id,
+  isRadio,
+  onToggleLyrics,
+  lyricsActive = false,
+  lyricsDisabled = false,
+}) => {
  const dispatch = useDispatch()
  const { data, loading } = useGetOne('song', id, { enabled: !!id && !isRadio })
  const [toggleLove, toggling] = useToggleLove('song', data)
@ -99,6 +107,25 @@ const PlayerToolbar = ({ id, isRadio }) => {
    />
  )

+  // Toolbar button that toggles the synchronized-lyrics overlay. It is
+  // disabled when no toggle handler is supplied or when lyricsDisabled is set,
+  // and uses the primary colour while lyricsActive is true.
+  // NOTE(review): the <span> wrapper is presumably there so the Tooltip keeps
+  // working while the button is disabled — confirm against the MUI docs.
+  const toggleLyricsButton = (
+    <Tooltip title="Toggle synchronized lyrics">
+      <span>
+        <IconButton
+          size={isDesktop ? 'small' : undefined}
+          onClick={onToggleLyrics}
+          disabled={!onToggleLyrics || lyricsDisabled}
+          data-testid="toggle-lyrics-button"
+          className={buttonClass}
+          color={lyricsActive ? 'primary' : 'default'}
+        >
+          <RiFileMusicLine
+            className={!isDesktop ? classes.mobileIcon : undefined}
+          />
+        </IconButton>
+      </span>
+    </Tooltip>
+  )
+
  return (
    <>
      <GlobalHotKeys keyMap={keyMap} handlers={handlers} allowChanges />
@ -106,11 +133,13 @@ const PlayerToolbar = ({ id, isRadio }) => {
        <li className={`${listItemClass} item`}>
          {saveQueueButton}
          {loveButton}
+          {toggleLyricsButton}
        </li>
      ) : (
        <>
          <li className={`${listItemClass} item`}>{saveQueueButton}</li>
          <li className={`${listItemClass} item`}>{loveButton}</li>
+          <li className={`${listItemClass} item`}>{toggleLyricsButton}</li>
        </>
      )}
    </>
--- a/ui/src/audioplayer/PlayerToolbar.test.jsx
+++ b/ui/src/audioplayer/PlayerToolbar.test.jsx
@ -71,6 +71,7 @@ describe('<PlayerToolbar />', () => {
-      // Verify both buttons are rendered
+      // Verify all toolbar buttons are rendered
      expect(screen.getByTestId('save-queue-button')).toBeInTheDocument()
      expect(screen.getByTestId('love-button')).toBeInTheDocument()
+      expect(screen.getByTestId('toggle-lyrics-button')).toBeInTheDocument()

      // Verify desktop classes are applied
      expect(listItems[0].className).toContain('toolbar')
@ -102,6 +103,14 @@ describe('<PlayerToolbar />', () => {
        type: 'OPEN_SAVE_QUEUE_DIALOG',
      })
    })
+
+    it('triggers lyric toggle callback when lyrics button is clicked', () => {
+      const onToggleLyrics = vi.fn()
+      render(<PlayerToolbar id="song-1" onToggleLyrics={onToggleLyrics} />)
+
+      fireEvent.click(screen.getByTestId('toggle-lyrics-button'))
+      expect(onToggleLyrics).toHaveBeenCalledTimes(1)
+    })
  })

  describe('Mobile layout', () => {
@ -114,11 +123,12 @@ describe('<PlayerToolbar />', () => {

      // Each button should be in its own list item
      const listItems = screen.getAllByRole('listitem')
-      expect(listItems).toHaveLength(2)
+      expect(listItems).toHaveLength(3)

-      // Verify both buttons are rendered
+      // Verify all toolbar buttons are rendered
      expect(screen.getByTestId('save-queue-button')).toBeInTheDocument()
      expect(screen.getByTestId('love-button')).toBeInTheDocument()
+      expect(screen.getByTestId('toggle-lyrics-button')).toBeInTheDocument()

      // Verify mobile classes are applied
      expect(listItems[0].className).toContain('mobileListItem')
@ -140,6 +150,13 @@ describe('<PlayerToolbar />', () => {
      const loveButton = screen.getByTestId('love-button')
      expect(loveButton).toBeDisabled()
    })
+
+    it('disables lyrics button when lyrics are unavailable', () => {
+      // A toggle handler is supplied on purpose: without one the button is
+      // disabled regardless of lyricsDisabled, and this test would pass
+      // without ever exercising that prop.
+      render(
+        <PlayerToolbar
+          id="song-1"
+          onToggleLyrics={vi.fn()}
+          lyricsDisabled={true}
+        />,
+      )
+
+      const lyricsButton = screen.getByTestId('toggle-lyrics-button')
+      expect(lyricsButton).toBeDisabled()
+    })
  })

  describe('Common behavior', () => {
--- a/ui/src/audioplayer/lyrics.js
+++ b/ui/src/audioplayer/lyrics.js
@ -0,0 +1,617 @@
+// Lower-cases a language tag and turns every underscore separator into a
+// hyphen (e.g. "zh_Hant_TW" -> "zh-hant-tw"). Falsy input yields ''.
+// A global regex is required: String.prototype.replace with a string pattern
+// only replaces the first occurrence.
+const normalizeLanguageTag = (language) =>
+  (language || '').toLowerCase().replace(/_/g, '-')
+
+// Tolerance (ms) applied to time comparisons when resolving the active
+// line/token.
+const KARAOKE_SWITCH_EPSILON_MS = 18
+// Recognised values of a structured lyric's `kind` field.
+const LYRIC_KIND_MAIN = 'main'
+const LYRIC_KIND_TRANSLATION = 'translation'
+const LYRIC_KIND_PRONUNCIATION = 'pronunciation'
+
+// Renders a value as text, prefixing a single-character result with '0'.
+const padTime = (value) => {
+  const digits = value.toString()
+  if (digits.length !== 1) {
+    return digits
+  }
+  return '0' + digits
+}
+
+// Converts a raw time value to a finite number, or null when it is absent
+// or not numeric.
+// null, undefined and blank strings are rejected up front: Number() would
+// coerce them to 0, turning "no timing" into "timestamp 0" and defeating
+// every `?? fallback` / `== null` check made on the result.
+const toTime = (value) => {
+  if (value == null || (typeof value === 'string' && value.trim() === '')) {
+    return null
+  }
+  const numeric = Number(value)
+  return Number.isFinite(numeric) ? numeric : null
+}
+
+// Comparator for times that may be null/undefined: two missing values are
+// equal, a missing value sorts after a present one, otherwise ascending.
+const compareNullableTime = (a, b) => {
+  const missingA = a == null
+  const missingB = b == null
+  if (missingA || missingB) {
+    if (missingA && missingB) {
+      return 0
+    }
+    return missingA ? 1 : -1
+  }
+  return a - b
+}
+
+// Returns a sorted copy of `tokens`: by start, then by end (missing times
+// last), with the original position as the final tie-breaker. The temporary
+// `order` field is stripped from the result.
+const sortTokensByStart = (tokens) => {
+  const decorated = tokens.map((token, order) => ({ ...token, order }))
+  decorated.sort((left, right) => {
+    for (const key of ['start', 'end']) {
+      const diff = compareNullableTime(left[key], right[key])
+      if (diff !== 0) {
+        return diff
+      }
+    }
+    return left.order - right.order
+  })
+  return decorated.map(({ order, ...token }) => token)
+}
+
+// True when two (already normalized) language tags are equal, or when one is
+// the other followed by a "-subtag" suffix. Empty/missing tags never match.
+const languageMatch = (candidate, preferred) => {
+  if (!candidate || !preferred) {
+    return false
+  }
+  if (candidate === preferred) {
+    return true
+  }
+  if (candidate.startsWith(`${preferred}-`)) {
+    return true
+  }
+  return preferred.startsWith(`${candidate}-`)
+}
+
+// Truthy when the lyric is flagged as synced and at least one of its lines
+// has a start that converts to a finite number. Falsy inputs and a falsy
+// `synced` flag are returned unchanged.
+const hasTimedLines = (lyric) => {
+  if (!lyric) {
+    return lyric
+  }
+  if (!lyric.synced) {
+    return lyric.synced
+  }
+  if (!Array.isArray(lyric.line)) {
+    return false
+  }
+  return lyric.line.some((entry) => Number.isFinite(Number(entry.start)))
+}
+
+// Normalizes a raw token into { start, end, value, role }.
+// Returns null for a missing token or one whose text is not a string or is
+// blank after trimming.
+const normalizeToken = (token) => {
+  const text = typeof token?.value === 'string' ? token.value : ''
+  if (!token || text.trim() === '') {
+    return null
+  }
+  const role = typeof token.role === 'string' ? token.role : ''
+  return {
+    start: toTime(token.start),
+    end: toTime(token.end),
+    value: text,
+    role,
+  }
+}
+
+// Normalizes a raw tokenLine entry into { index, start, end, value, tokens }.
+// `index` falls back to `fallbackIndex` when it is not numeric; `tokens` holds
+// the valid normalized tokens in sorted order.
+const normalizeTokenLine = (tokenLine, fallbackIndex) => {
+  const rawIndex = Number(tokenLine?.index)
+  const rawTokens = Array.isArray(tokenLine?.token) ? tokenLine.token : []
+
+  return {
+    index: Number.isFinite(rawIndex) ? rawIndex : fallbackIndex,
+    start: toTime(tokenLine?.start),
+    end: toTime(tokenLine?.end),
+    value: typeof tokenLine?.value === 'string' ? tokenLine.value : '',
+    tokens: sortTokensByStart(rawTokens.map(normalizeToken).filter(Boolean)),
+  }
+}
+
+// Maps a raw `kind` value (case- and whitespace-insensitive) onto one of the
+// known kinds; anything that is not translation/pronunciation is main.
+const normalizeLyricKind = (kind) => {
+  const normalized = (kind || '').toLowerCase().trim()
+  const isSecondaryLayer =
+    normalized === LYRIC_KIND_TRANSLATION ||
+    normalized === LYRIC_KIND_PRONUNCIATION
+  return isSecondaryLayer ? normalized : LYRIC_KIND_MAIN
+}
+
+// Picks one lyric from `lyrics` by language, trying in order: the preferred
+// tag, its base language (text before the first '-'), English ('en'), and
+// finally the first entry. Returns null for an empty or non-array list.
+// NOTE(review): languageMatch only accepts equality or a "-subtag" prefix, so
+// three-letter codes such as 'eng' or 'por' never match 'en' / 'pt' and such
+// tracks are only ever chosen through the first-entry fallback.
+const pickLyricByLanguage = (lyrics, preferredLanguage) => {
+  if (!Array.isArray(lyrics) || lyrics.length === 0) {
+    return null
+  }
+
+  const preferred = normalizeLanguageTag(preferredLanguage)
+  const preferredBase = preferred.split('-')[0]
+
+  return (
+    lyrics.find((lyric) =>
+      languageMatch(normalizeLanguageTag(lyric.lang), preferred),
+    ) ||
+    lyrics.find((lyric) =>
+      languageMatch(normalizeLanguageTag(lyric.lang), preferredBase),
+    ) ||
+    lyrics.find((lyric) =>
+      languageMatch(normalizeLanguageTag(lyric.lang), 'en'),
+    ) ||
+    lyrics[0]
+  )
+}
+
+// Returns the { start, end } window of lines[index]. When the line has no end
+// of its own, the start of the following line is used instead. A missing
+// line yields { start: null, end: null }.
+const lineTimeWindow = (lines, index) => {
+  const current = lines[index]
+  if (current) {
+    return {
+      start: toTime(current.start),
+      end: toTime(current.end) ?? toTime(lines[index + 1]?.start),
+    }
+  }
+  return { start: null, end: null }
+}
+
+// Splits a line that is covered by one token into evenly timed per-word
+// tokens. Returns null (caller keeps the original token) when:
+//   - the line text is blank or has fewer than two whitespace-delimited chunks,
+//   - the token text, ignoring whitespace, is shorter than ~80% of the line's,
+//   - no window with a positive duration can be derived.
+const buildSyntheticWordTokens = (line, token) => {
+  const text = typeof line?.value === 'string' ? line.value : ''
+  if (!text.trim()) {
+    return null
+  }
+
+  // Each chunk is a run of non-space characters plus its trailing whitespace,
+  // so joining the chunks reproduces the text after any leading whitespace.
+  const chunks = text.match(/\S+\s*/g) || []
+  if (chunks.length < 2) {
+    return null
+  }
+
+  const normalizedLine = text.replace(/\s+/g, ' ').trim().toLowerCase()
+  const normalizedTokenValue = (token?.value || '')
+    .replace(/\s+/g, ' ')
+    .trim()
+    .toLowerCase()
+  if (!normalizedTokenValue || !normalizedLine) {
+    return null
+  }
+
+  // Compare with all whitespace removed; the 80% rule is a length check only.
+  const compressedLine = normalizedLine.replace(/\s+/g, '')
+  const compressedToken = normalizedTokenValue.replace(/\s+/g, '')
+  const tokenLooksLikeWholeLine =
+    compressedToken === compressedLine ||
+    compressedToken.length >= Math.floor(compressedLine.length * 0.8)
+  if (!tokenLooksLikeWholeLine) {
+    return null
+  }
+
+  const tokenStart = toTime(token?.start)
+  const tokenEnd = toTime(token?.end)
+  const lineStart = toTime(line?.start)
+  const lineEnd = toTime(line?.end)
+
+  // Token timing wins; line timing is the fallback for each bound.
+  const baseStart = tokenStart ?? lineStart
+  const baseEnd = tokenEnd ?? lineEnd
+  if (
+    baseStart == null ||
+    baseEnd == null ||
+    !Number.isFinite(baseStart) ||
+    !Number.isFinite(baseEnd) ||
+    baseEnd <= baseStart
+  ) {
+    return null
+  }
+
+  // Every chunk gets an equal share of the window, regardless of its length.
+  const duration = baseEnd - baseStart
+  return chunks.map((chunk, idx) => ({
+    start: baseStart + (duration * idx) / chunks.length,
+    end: baseStart + (duration * (idx + 1)) / chunks.length,
+    value: chunk,
+    role: typeof token?.role === 'string' ? token.role : '',
+  }))
+}
+
+// True when at least one token in any tokenLine of the lyric has a start
+// that converts to a finite number.
+export const hasTokenTiming = (structuredLyric) => {
+  const tokenLines = structuredLyric?.tokenLine
+  if (!structuredLyric || !Array.isArray(tokenLines)) {
+    return false
+  }
+  return tokenLines.some((tokenLine) => {
+    const tokens = tokenLine?.token
+    return (
+      Array.isArray(tokens) &&
+      tokens.some((token) => Number.isFinite(Number(token?.start)))
+    )
+  })
+}
+
+// True when the lyric has something to display: at least one line with
+// non-blank text, or token-level timing.
+export const hasStructuredLyricContent = (structuredLyric) => {
+  if (!structuredLyric) {
+    return false
+  }
+  const lines = structuredLyric.line
+  const hasText =
+    Array.isArray(lines) &&
+    lines.some(
+      (line) => typeof line?.value === 'string' && line.value.trim() !== '',
+    )
+  return hasText || hasTokenTiming(structuredLyric)
+}
+
+// Resolves the language used to choose lyric tracks: the 'locale' entry in
+// localStorage if set, else navigator.language, else 'en'.
+export const getPreferredLyricLanguage = () => {
+  const storage = typeof window !== 'undefined' ? window.localStorage : null
+  const stored = storage ? storage.getItem('locale') : null
+  if (stored) {
+    return stored
+  }
+
+  const browserLanguage =
+    typeof navigator !== 'undefined' ? navigator.language : null
+  return browserLanguage || 'en'
+}
+
+// Chooses the main, translation and pronunciation tracks from a list of
+// structured lyrics. Only synced lyrics with timed lines are considered, and
+// each layer is picked by language preference. When no lyric is of kind
+// "main", the main layer is picked from all timed lyrics.
+// Every layer is null when the input is not an array or nothing is timed.
+export const selectLyricLayers = (structuredLyrics, preferredLanguage) => {
+  const timed = Array.isArray(structuredLyrics)
+    ? structuredLyrics.filter(hasTimedLines)
+    : []
+  if (timed.length === 0) {
+    return { main: null, translation: null, pronunciation: null }
+  }
+
+  const byKind = {
+    [LYRIC_KIND_MAIN]: [],
+    [LYRIC_KIND_TRANSLATION]: [],
+    [LYRIC_KIND_PRONUNCIATION]: [],
+  }
+  timed.forEach((lyric) => {
+    byKind[normalizeLyricKind(lyric?.kind)].push(lyric)
+  })
+
+  const pick = (candidates) =>
+    pickLyricByLanguage(candidates, preferredLanguage)
+  const mains = byKind[LYRIC_KIND_MAIN]
+
+  return {
+    main: pick(mains.length ? mains : timed),
+    translation: pick(byKind[LYRIC_KIND_TRANSLATION]),
+    pronunciation: pick(byKind[LYRIC_KIND_PRONUNCIATION]),
+  }
+}
+
+// Convenience wrapper: returns only the main layer chosen by
+// selectLyricLayers (null when there is none).
+export const pickStructuredLyric = (structuredLyrics, preferredLanguage) =>
+  selectLyricLayers(structuredLyrics, preferredLanguage).main
+
+// Serializes a structured lyric into LRC text, one "[mm:ss.xx] text" row per
+// line. `start` values are milliseconds; lines whose start is not a finite,
+// non-negative number are skipped. Returns '' when there is no line array.
+export const structuredLyricToLrc = (structuredLyric) => {
+  if (!structuredLyric || !Array.isArray(structuredLyric.line)) {
+    return ''
+  }
+
+  let lyricText = ''
+  for (const line of structuredLyric.line) {
+    const start = Number(line.start)
+    if (!Number.isFinite(start) || start < 0) {
+      continue
+    }
+
+    const totalCentis = Math.floor(start / 10)
+    const centis = totalCentis % 100
+    const totalSeconds = Math.floor(totalCentis / 100)
+    const sec = totalSeconds % 60
+    // LRC timestamps have no hours field, so minutes must keep counting past
+    // 59; wrapping them with `% 60` would send every line at or after 60:00
+    // back to the start of the track.
+    const min = Math.floor(totalSeconds / 60)
+
+    lyricText += `[${padTime(min)}:${padTime(sec)}.${padTime(centis)}] ${line.value || ''}\n`
+  }
+  return lyricText
+}
+
+// Picks the main lyric for the preferred language and serializes it to LRC;
+// returns '' when no suitable lyric exists.
+export const structuredLyricsToLrc = (structuredLyrics, preferredLanguage) => {
+  const chosen = pickStructuredLyric(structuredLyrics, preferredLanguage)
+  return chosen ? structuredLyricToLrc(chosen) : ''
+}
+
+// Builds the list of display lines ({ index, start, end, value, tokens })
+// for the karaoke view from a structured lyric.
+// When tokenLine data exists it drives the result, with each entry enriched
+// from the `line` entry it points to; otherwise plain lines are used with
+// empty token lists. Returns [] for a missing lyric.
+export const buildKaraokeLines = (structuredLyric) => {
+  if (!structuredLyric) {
+    return []
+  }
+
+  const baseLines = Array.isArray(structuredLyric.line)
+    ? structuredLyric.line
+    : []
+  const rawTokenLines = Array.isArray(structuredLyric.tokenLine)
+    ? structuredLyric.tokenLine
+    : []
+
+  const lines =
+    rawTokenLines.length > 0
+      ? rawTokenLines.map((tokenLine, fallbackIndex) => {
+          const normalized = normalizeTokenLine(tokenLine, fallbackIndex)
+          const baseLine = baseLines[normalized.index] || {}
+          const tokens = normalized.tokens
+          // Last-resort bounds: first token with a start, last token with an end.
+          const fallbackStart =
+            tokens.find((token) => token.start != null)?.start ?? null
+          const fallbackEnd =
+            [...tokens].reverse().find((token) => token.end != null)?.end ??
+            null
+          // Text precedence: tokenLine value, then base line value, then the
+          // concatenated token texts.
+          const value =
+            normalized.value ||
+            (typeof baseLine.value === 'string' ? baseLine.value : '') ||
+            tokens.map((token) => token.value).join('')
+
+          return {
+            index: normalized.index,
+            start: normalized.start ?? toTime(baseLine.start) ?? fallbackStart,
+            end: normalized.end ?? toTime(baseLine.end) ?? fallbackEnd,
+            value,
+            tokens,
+          }
+        })
+      : baseLines.map((line, index) => ({
+          index,
+          start: toTime(line.start),
+          end: toTime(line.end),
+          value: typeof line.value === 'string' ? line.value : '',
+          tokens: [],
+        }))
+
+  const normalized = lines
+    // Drop entries that have neither text nor tokens.
+    .filter((line) => line.value || line.tokens.length > 0)
+    // Order by start time (untimed lines last), ties broken by index.
+    .sort((a, b) => {
+      if (a.start == null && b.start == null) {
+        return a.index - b.index
+      }
+      if (a.start == null) {
+        return 1
+      }
+      if (b.start == null) {
+        return -1
+      }
+      if (a.start !== b.start) {
+        return a.start - b.start
+      }
+      return a.index - b.index
+    })
+    .map((line) => {
+      const nextLine = { ...line }
+      // A lone token may really span the whole line; try to split it into
+      // evenly timed words.
+      if (nextLine.tokens.length === 1) {
+        const syntheticTokens = buildSyntheticWordTokens(
+          nextLine,
+          nextLine.tokens[0],
+        )
+        if (syntheticTokens) {
+          nextLine.tokens = syntheticTokens
+        }
+      }
+      return nextLine
+    })
+
+  // Lines still lacking an end borrow the start of the line that follows.
+  for (let i = 0; i < normalized.length; i += 1) {
+    if (normalized[i].end == null) {
+      const nextStart = normalized[i + 1]?.start
+      if (nextStart != null) {
+        normalized[i].end = nextStart
+      }
+    }
+  }
+
+  return normalized
+}
+
+// Resolves the { start, end } highlight window of line.tokens[tokenIndex].
+//   line            - karaoke line with `start`, `end` and `tokens`
+//   tokenIndex      - position of the token within line.tokens
+//   lineEndFallback - end used for the line when it has none of its own
+// Explicit token timing is preferred. Missing bounds are filled from the
+// neighbouring tokens or from an even split of the line window. When the
+// explicit timings look collapsed (few distinct values across many tokens),
+// the even split is used for the whole line instead.
+// Returns { start: null, end: null } when the token does not exist.
+export const resolveKaraokeTokenWindow = (
+  line,
+  tokenIndex,
+  lineEndFallback = null,
+) => {
+  const tokens = Array.isArray(line?.tokens) ? line.tokens : []
+  const token = tokens[tokenIndex]
+  if (!token) {
+    return { start: null, end: null }
+  }
+
+  const prevToken = tokenIndex > 0 ? tokens[tokenIndex - 1] : null
+  const nextToken =
+    tokenIndex + 1 < tokens.length ? tokens[tokenIndex + 1] : null
+
+  const lineStart = toTime(line?.start)
+  const lineEnd = toTime(line?.end) ?? toTime(lineEndFallback)
+  const tokenCount = tokens.length
+  const hasLineWindow =
+    lineStart != null &&
+    lineEnd != null &&
+    Number.isFinite(lineStart) &&
+    Number.isFinite(lineEnd) &&
+    lineEnd > lineStart
+  // Even split of the line window: token i owns slice [i/n, (i+1)/n].
+  const estimatedStart =
+    hasLineWindow && tokenCount > 0
+      ? lineStart + ((lineEnd - lineStart) * tokenIndex) / tokenCount
+      : null
+  const estimatedEnd =
+    hasLineWindow && tokenCount > 0
+      ? lineStart + ((lineEnd - lineStart) * (tokenIndex + 1)) / tokenCount
+      : null
+
+  // Count how many tokens carry explicit bounds and how many distinct values
+  // those bounds take.
+  let explicitStartCount = 0
+  let explicitEndCount = 0
+  const uniqueStarts = new Set()
+  const uniqueEnds = new Set()
+
+  for (let i = 0; i < tokenCount; i += 1) {
+    const explicitStart = toTime(tokens[i]?.start)
+    if (explicitStart != null) {
+      explicitStartCount += 1
+      uniqueStarts.add(explicitStart)
+    }
+
+    const explicitEnd = toTime(tokens[i]?.end)
+    if (explicitEnd != null) {
+      explicitEndCount += 1
+      uniqueEnds.add(explicitEnd)
+    }
+  }
+
+  // "Collapsed": several explicit values, but at most max(1, n/4) distinct.
+  const collapsedStarts =
+    explicitStartCount > 1 && uniqueStarts.size <= Math.max(1, tokenCount / 4)
+  const collapsedEnds =
+    explicitEndCount > 1 && uniqueEnds.size <= Math.max(1, tokenCount / 4)
+  const shouldForceEstimated =
+    hasLineWindow && tokenCount > 1 && (collapsedStarts || collapsedEnds)
+
+  if (shouldForceEstimated) {
+    return {
+      start: estimatedStart,
+      end: estimatedEnd,
+    }
+  }
+  const prevEnd = toTime(prevToken?.end) ?? toTime(prevToken?.start)
+
+  // Start fallback chain: previous token, even split, line start.
+  let start = toTime(token.start)
+  if (start == null) {
+    start = prevEnd ?? estimatedStart ?? lineStart
+  }
+
+  // End fallback chain: next token start, even split, line end.
+  let end = toTime(token.end)
+  if (end == null) {
+    const nextDirectStart = toTime(nextToken?.start)
+    const nextEstimatedStart =
+      hasLineWindow && tokenIndex + 1 < tokenCount
+        ? lineStart + ((lineEnd - lineStart) * (tokenIndex + 1)) / tokenCount
+        : null
+    end = nextDirectStart ?? nextEstimatedStart ?? estimatedEnd ?? lineEnd
+  }
+
+  // A lone token with a missing or near-zero window spans the whole line.
+  if (
+    tokenCount === 1 &&
+    hasLineWindow &&
+    (start == null || end == null || end <= start + 1)
+  ) {
+    start = lineStart
+    end = lineEnd
+  }
+
+  // Never return an inverted window.
+  if (start != null && end != null && end < start) {
+    end = start
+  }
+
+  return { start, end }
+}
+
+// Finds the line and token that are active at `currentTimeMs`.
+// Returns { lineIndex, tokenIndex }. Both are -1 when `lines` is empty or not
+// an array; tokenIndex is -1 when no token of the active line has started.
+// A non-numeric time is treated as 0. All comparisons allow a tolerance of
+// KARAOKE_SWITCH_EPSILON_MS.
+export const getActiveKaraokeState = (lines, currentTimeMs) => {
+  if (!Array.isArray(lines) || lines.length === 0) {
+    return { lineIndex: -1, tokenIndex: -1 }
+  }
+
+  const current = Number.isFinite(Number(currentTimeMs))
+    ? Number(currentTimeMs)
+    : 0
+  // Forward scan: advance while lines have started (or have no start),
+  // stopping at the first line that starts in the future.
+  let lineIndex = 0
+  for (let i = 0; i < lines.length; i += 1) {
+    const lineStart = toTime(lines[i]?.start)
+    if (lineStart == null || lineStart <= current + KARAOKE_SWITCH_EPSILON_MS) {
+      lineIndex = i
+      continue
+    }
+    break
+  }
+
+  // Backward scan from that candidate: pick the nearest line whose window
+  // still contains the current time. If none does, the candidate is kept.
+  for (let i = lineIndex; i >= 0; i -= 1) {
+    const lineStart = toTime(lines[i]?.start)
+    const lineEnd = toTime(lines[i]?.end) ?? toTime(lines[i + 1]?.start)
+    if (lineStart != null && current + KARAOKE_SWITCH_EPSILON_MS < lineStart) {
+      continue
+    }
+    if (lineEnd == null || current <= lineEnd + KARAOKE_SWITCH_EPSILON_MS) {
+      lineIndex = i
+      break
+    }
+  }
+
+  const activeLine = lines[lineIndex] || null
+  const tokens = Array.isArray(activeLine?.tokens) ? activeLine.tokens : []
+  // Token scan: the last token that has started, stopping early at the first
+  // one whose window still contains the current time.
+  let tokenIndex = -1
+  for (let i = 0; i < tokens.length; i += 1) {
+    const { start: tokenStart, end: tokenEnd } = resolveKaraokeTokenWindow(
+      activeLine,
+      i,
+      lines[lineIndex + 1]?.start,
+    )
+    if (
+      tokenStart == null ||
+      tokenStart <= current + KARAOKE_SWITCH_EPSILON_MS
+    ) {
+      tokenIndex = i
+      if (tokenEnd != null && current <= tokenEnd + KARAOKE_SWITCH_EPSILON_MS) {
+        break
+      }
+      continue
+    }
+    break
+  }
+
+  return { lineIndex, tokenIndex }
+}
+
+// Finds the index of the layer line (translation/pronunciation) that best
+// corresponds to mainLines[mainIndex], comparing time windows.
+// Returns -1 when inputs are invalid or empty, mainIndex is out of range, the
+// main line has no start, or no layer line is close enough.
+// The lowest score wins. A score is the distance between the two starts plus
+// a penalty per position of index distance.
+export const findLayerLineIndexForMain = (mainLines, layerLines, mainIndex) => {
+  if (
+    !Array.isArray(mainLines) ||
+    !Array.isArray(layerLines) ||
+    mainLines.length === 0 ||
+    layerLines.length === 0 ||
+    mainIndex < 0 ||
+    mainIndex >= mainLines.length
+  ) {
+    return -1
+  }
+
+  const { start: mainStart, end: mainEnd } = lineTimeWindow(
+    mainLines,
+    mainIndex,
+  )
+
+  if (mainStart == null) {
+    return -1
+  }
+  const mainWindowEnd = mainEnd ?? mainStart
+  const mainWindowDuration = Math.max(0, mainWindowEnd - mainStart)
+  // Largest start distance accepted for non-overlapping candidates:
+  // the main line duration + 420, clamped to the range [550, 1400].
+  const maxDelta = Math.max(550, Math.min(1400, mainWindowDuration + 420))
+
+  let bestIdx = -1
+  let bestScore = Number.POSITIVE_INFINITY
+
+  for (let i = 0; i < layerLines.length; i += 1) {
+    const { start, end } = lineTimeWindow(layerLines, i)
+
+    // Case 1: the windows overlap (touching counts). Index penalty: 30.
+    if (start != null && end != null) {
+      const overlap = Math.min(end, mainEnd ?? end) - Math.max(start, mainStart)
+      if (overlap >= 0) {
+        const score = Math.abs(start - mainStart) + Math.abs(i - mainIndex) * 30
+        if (score < bestScore) {
+          bestScore = score
+          bestIdx = i
+        }
+        continue
+      }
+    }
+
+    // Case 2: no overlap, but the start is within maxDelta. Index penalty: 45.
+    if (start != null) {
+      if (Math.abs(start - mainStart) > maxDelta) {
+        continue
+      }
+      const score = Math.abs(start - mainStart) + Math.abs(i - mainIndex) * 45
+      if (score < bestScore) {
+        bestScore = score
+        bestIdx = i
+      }
+    }
+  }
+
+  return bestIdx
+}
+
+// Like findLayerLineIndexForMain, but returns both the matching index and
+// the layer line itself ({ index: -1, line: null } when nothing matches).
+export const resolveLayerLineForMain = (mainLines, layerLines, mainIndex) => {
+  const index = findLayerLineIndexForMain(mainLines, layerLines, mainIndex)
+  const line = index < 0 ? null : layerLines[index]
+  return { index, line }
+}
--- a/ui/src/audioplayer/lyrics.test.js
+++ b/ui/src/audioplayer/lyrics.test.js
@ -0,0 +1,416 @@
+import {
+  buildKaraokeLines,
+  findLayerLineIndexForMain,
+  getPreferredLyricLanguage,
+  getActiveKaraokeState,
+  hasStructuredLyricContent,
+  pickStructuredLyric,
+  resolveKaraokeTokenWindow,
+  resolveLayerLineForMain,
+  selectLyricLayers,
+  structuredLyricToLrc,
+  structuredLyricsToLrc,
+} from './lyrics'
+
+describe('lyrics helpers', () => {
+  beforeEach(() => {
+    localStorage.clear()
+  })
+
+  it('prefers a lyric track that matches the locale', () => {
+    const selected = pickStructuredLyric(
+      [
+        {
+          lang: 'eng',
+          synced: true,
+          line: [{ start: 1000, value: 'English line' }],
+        },
+        {
+          lang: 'pt-BR',
+          synced: true,
+          line: [{ start: 1000, value: 'Linha em portugues' }],
+        },
+      ],
+      'pt-BR',
+    )
+
+    expect(selected.lang).toBe('pt-BR')
+  })
+
+  it('falls back to english when preferred locale is not available', () => {
+    const selected = pickStructuredLyric(
+      [
+        {
+          lang: 'eng',
+          synced: true,
+          line: [{ start: 1000, value: 'English line' }],
+        },
+        {
+          lang: 'deu',
+          synced: true,
+          line: [{ start: 1000, value: 'Deutsche Zeile' }],
+        },
+      ],
+      'pt-BR',
+    )
+
+    expect(selected.lang).toBe('eng')
+  })
+
+  it('falls back to first synced track when english is missing', () => {
+    const selected = pickStructuredLyric(
+      [
+        {
+          lang: 'jpn',
+          synced: true,
+          line: [{ start: 1000, value: 'Nihongo' }],
+        },
+        {
+          lang: 'deu',
+          synced: true,
+          line: [{ start: 1000, value: 'Deutsch' }],
+        },
+      ],
+      'pt-BR',
+    )
+
+    expect(selected.lang).toBe('jpn')
+  })
+
+  it('selects translation and pronunciation layers by kind', () => {
+    const layers = selectLyricLayers(
+      [
+        {
+          kind: 'main',
+          lang: 'ja',
+          synced: true,
+          line: [{ start: 1000, value: 'こんにちは' }],
+        },
+        {
+          kind: 'translation',
+          lang: 'es',
+          synced: true,
+          line: [{ start: 1000, value: 'Hola' }],
+        },
+        {
+          kind: 'pronunciation',
+          lang: 'ja-Latn',
+          synced: true,
+          line: [{ start: 1000, value: 'konnichiwa' }],
+        },
+      ],
+      'es-MX',
+    )
+
+    expect(layers.main.lang).toBe('ja')
+    expect(layers.translation.lang).toBe('es')
+    expect(layers.pronunciation.lang).toBe('ja-Latn')
+  })
+
+  it('treats missing kind as main for backward compatibility', () => {
+    const layers = selectLyricLayers(
+      [
+        {
+          lang: 'eng',
+          synced: true,
+          line: [{ start: 1000, value: 'Main' }],
+        },
+      ],
+      'eng',
+    )
+
+    expect(layers.main.lang).toBe('eng')
+    expect(layers.translation).toBeNull()
+    expect(layers.pronunciation).toBeNull()
+  })
+
+  // The layer windows are shifted slightly relative to the main windows but
+  // each still overlaps the main line at the same position, so main line 1 is
+  // expected to pair with layer line 1 and main line 0 with layer line 0.
+  it('matches layer line by timing for the active main line', () => {
+    const mainLines = [
+      { index: 0, start: 1000, end: 1800, value: 'Line A', tokens: [] },
+      { index: 1, start: 2000, end: 2800, value: 'Line B', tokens: [] },
+    ]
+    const layerLines = [
+      { index: 0, start: 900, end: 1750, value: 'A2', tokens: [] },
+      { index: 1, start: 2050, end: 2900, value: 'B2', tokens: [] },
+    ]
+
+    expect(findLayerLineIndexForMain(mainLines, layerLines, 1)).toBe(1)
+    expect(resolveLayerLineForMain(mainLines, layerLines, 0).line.value).toBe(
+      'A2',
+    )
+  })
+
+  // layerLines is deliberately out of order: every entry's `index` field
+  // differs from its array position. The expected results are array positions
+  // (B2 sits at position 2, C2 at position 0), so a correct match has to come
+  // from the timing windows rather than from `index` or from position parity.
+  it('matches metadata layers by nearest timing even when indexes differ', () => {
+    const mainLines = [
+      { index: 0, start: 1000, end: 1800, value: 'Line A', tokens: [] },
+      { index: 1, start: 2000, end: 2800, value: 'Line B', tokens: [] },
+      { index: 2, start: 3000, end: 3800, value: 'Line C', tokens: [] },
+    ]
+    const layerLines = [
+      { index: 2, start: 3020, end: 3820, value: 'C2', tokens: [] },
+      { index: 0, start: 980, end: 1760, value: 'A2', tokens: [] },
+      { index: 1, start: 2010, end: 2810, value: 'B2', tokens: [] },
+    ]
+
+    expect(findLayerLineIndexForMain(mainLines, layerLines, 1)).toBe(2)
+    expect(resolveLayerLineForMain(mainLines, layerLines, 2).line.value).toBe(
+      'C2',
+    )
+  })
+
+  // The only layer line starts 58000 after main line 1 and does not overlap
+  // it. Being the sole candidate must not be enough: the lookup is expected to
+  // report "no match" (-1 / null line) instead of returning it.
+  it('returns no layer match when the nearest line is too far in time', () => {
+    const mainLines = [
+      { index: 0, start: 1000, end: 1800, value: 'Line A', tokens: [] },
+      { index: 1, start: 2000, end: 2800, value: 'Line B', tokens: [] },
+    ]
+    const layerLines = [
+      { index: 0, start: 60000, end: 60800, value: 'Far line', tokens: [] },
+    ]
+
+    expect(findLayerLineIndexForMain(mainLines, layerLines, 1)).toBe(-1)
+    expect(resolveLayerLineForMain(mainLines, layerLines, 1).line).toBeNull()
+  })
+
+  // `start` values are milliseconds while the expected LRC timestamps carry
+  // two fractional digits, so 22801 is expected to render as 22.80: the last
+  // millisecond digit is dropped, not rounded up. Each rendered line ends
+  // with a newline, including the final one.
+  it('converts a structured lyric track to LRC', () => {
+    const lrc = structuredLyricToLrc({
+      lang: 'eng',
+      synced: true,
+      line: [
+        { start: 18800, value: "We're no strangers to love" },
+        { start: 22801, value: 'You know the rules and so do I' },
+      ],
+    })
+
+    expect(lrc).toBe(
+      "[00:18.80] We're no strangers to love\n[00:22.80] You know the rules and so do I\n",
+    )
+  })
+
+  it('returns empty text when no synced lyrics are available', () => {
+    const lrc = structuredLyricsToLrc(
+      [{ lang: 'eng', synced: false, line: [{ value: 'Unsynced line' }] }],
+      'eng',
+    )
+
+    expect(lrc).toBe('')
+  })
+
+  it('reads preferred language from localStorage first', () => {
+    localStorage.setItem('locale', 'pt-BR')
+    expect(getPreferredLyricLanguage()).toBe('pt-BR')
+  })
+
+  it('builds karaoke lines from tokenLine payload', () => {
+    const lines = buildKaraokeLines({
+      lang: 'eng',
+      synced: true,
+      line: [{ start: 1000, end: 3000, value: 'Hello world' }],
+      tokenLine: [
+        {
+          index: 0,
+          start: 1000,
+          end: 3000,
+          value: 'Hello world',
+          token: [
+            { start: 1000, end: 1500, value: 'Hello' },
+            { start: 2000, end: 2500, value: 'world', role: 'x-bg' },
+          ],
+        },
+      ],
+    })
+
+    expect(lines).toEqual([
+      {
+        index: 0,
+        start: 1000,
+        end: 3000,
+        value: 'Hello world',
+        tokens: [
+          { start: 1000, end: 1500, value: 'Hello', role: '' },
+          { start: 2000, end: 2500, value: 'world', role: 'x-bg' },
+        ],
+      },
+    ])
+  })
+
+  it('sorts token timing by start to keep playback stable', () => {
+    const lines = buildKaraokeLines({
+      lang: 'eng',
+      synced: true,
+      line: [{ start: 1000, end: 3000, value: 'Hello world' }],
+      tokenLine: [
+        {
+          index: 0,
+          start: 1000,
+          end: 3000,
+          value: 'Hello world',
+          token: [
+            { start: 2000, end: 2500, value: 'world', role: '' },
+            { start: 1000, end: 1500, value: 'Hello', role: '' },
+          ],
+        },
+      ],
+    })
+
+    expect(lines[0].tokens.map((token) => token.value)).toEqual([
+      'Hello',
+      'world',
+    ])
+  })
+
+  // The payload has one token that spans the whole line, which gives no
+  // word-level timing. The expectation is that it is replaced by one token per
+  // whitespace-separated word: the separating space stays attached to the
+  // preceding word and the hyphenated word is not split further. The asserted
+  // windows are equal halves of the line (1000-1500, 1500-2000) even though
+  // the two words differ in length.
+  it('splits a single full-line token into synthetic word tokens', () => {
+    const lines = buildKaraokeLines({
+      lang: 'ko-Latn',
+      synced: true,
+      line: [{ start: 1000, end: 2000, value: 'Da-la-lun, dun' }],
+      tokenLine: [
+        {
+          index: 0,
+          start: 1000,
+          end: 2000,
+          value: 'Da-la-lun, dun',
+          token: [{ start: 1000, end: 2000, value: 'Da-la-lun, dun' }],
+        },
+      ],
+    })
+
+    expect(lines).toHaveLength(1)
+    expect(lines[0].tokens).toHaveLength(2)
+    expect(lines[0].tokens[0].value).toBe('Da-la-lun, ')
+    expect(lines[0].tokens[1].value).toBe('dun')
+
+    const firstWindow = resolveKaraokeTokenWindow(lines[0], 0)
+    const secondWindow = resolveKaraokeTokenWindow(lines[0], 1)
+
+    expect(firstWindow.start).toBeCloseTo(1000)
+    expect(firstWindow.end).toBeCloseTo(1500)
+    expect(secondWindow.start).toBeCloseTo(1500)
+    expect(secondWindow.end).toBeCloseTo(2000)
+  })
+
+  it('detects active line and token for karaoke timing', () => {
+    const state = getActiveKaraokeState(
+      [
+        {
+          index: 0,
+          start: 1000,
+          end: 3000,
+          value: 'Hello world',
+          tokens: [
+            { start: 1000, end: 1500, value: 'Hello', role: '' },
+            { start: 2000, end: 2500, value: 'world', role: '' },
+          ],
+        },
+        {
+          index: 1,
+          start: 3500,
+          end: 5000,
+          value: 'Second line',
+          tokens: [],
+        },
+      ],
+      2200,
+    )
+
+    expect(state).toEqual({ lineIndex: 0, tokenIndex: 1 })
+  })
+
+  // Neither token has an `end`. The expected fallback is that a token ends
+  // where the next token starts (1800) and that the last token ends at the
+  // line's own end (3000).
+  it('resolves token window fallback boundaries from neighboring tokens', () => {
+    const line = {
+      start: 1000,
+      end: 3000,
+      value: 'Hello world',
+      tokens: [
+        { start: 1200, value: 'Hello', role: '' },
+        { start: 1800, value: 'world', role: '' },
+      ],
+    }
+
+    expect(resolveKaraokeTokenWindow(line, 0)).toEqual({
+      start: 1200,
+      end: 1800,
+    })
+    expect(resolveKaraokeTokenWindow(line, 1)).toEqual({
+      start: 1800,
+      end: 3000,
+    })
+  })
+
+  // No token carries any timing, so the line's 1000-2000 span is expected to
+  // be divided into three equal, back-to-back windows. toBeCloseTo is used
+  // because the thirds are not exactly representable as floats.
+  it('infers sequential token windows when token timings are missing', () => {
+    const line = {
+      start: 1000,
+      end: 2000,
+      value: 'A B C',
+      tokens: [
+        { value: 'A', role: '' },
+        { value: 'B', role: '' },
+        { value: 'C', role: '' },
+      ],
+    }
+
+    const first = resolveKaraokeTokenWindow(line, 0)
+    const second = resolveKaraokeTokenWindow(line, 1)
+    const third = resolveKaraokeTokenWindow(line, 2)
+
+    expect(first.start).toBeCloseTo(1000)
+    expect(first.end).toBeCloseTo(1333.3333333333333)
+
+    expect(second.start).toBeCloseTo(1333.3333333333333)
+    expect(second.end).toBeCloseTo(1666.6666666666667)
+
+    expect(third.start).toBeCloseTo(1666.6666666666667)
+    expect(third.end).toBeCloseTo(2000)
+  })
+
+  // "Collapsed" here means every token reports the identical window, equal to
+  // the whole line. Such timings carry no per-token information, so the
+  // expected result is an equal three-way split of the line rather than three
+  // fully overlapping windows.
+  it('falls back to sequential windows when token timings are collapsed', () => {
+    const line = {
+      start: 1000,
+      end: 2000,
+      value: 'A B C',
+      tokens: [
+        { start: 1000, end: 2000, value: 'A', role: '' },
+        { start: 1000, end: 2000, value: 'B', role: '' },
+        { start: 1000, end: 2000, value: 'C', role: '' },
+      ],
+    }
+
+    const first = resolveKaraokeTokenWindow(line, 0)
+    const second = resolveKaraokeTokenWindow(line, 1)
+    const third = resolveKaraokeTokenWindow(line, 2)
+
+    expect(first.start).toBeCloseTo(1000)
+    expect(first.end).toBeCloseTo(1333.3333333333333)
+    expect(second.start).toBeCloseTo(1333.3333333333333)
+    expect(second.end).toBeCloseTo(1666.6666666666667)
+    expect(third.start).toBeCloseTo(1666.6666666666667)
+    expect(third.end).toBeCloseTo(2000)
+  })
+
+  // Playback position 1108 falls inside the small gap between token A's end
+  // (1100) and token B's start (1110). The expected state keeps A active
+  // rather than reporting no token or jumping ahead to B.
+  it('keeps token selection stable near tight token boundaries', () => {
+    const state = getActiveKaraokeState(
+      [
+        {
+          index: 0,
+          start: 1000,
+          end: 2000,
+          value: 'A B',
+          tokens: [
+            { start: 1000, end: 1100, value: 'A', role: '' },
+            { start: 1110, end: 1300, value: 'B', role: '' },
+          ],
+        },
+      ],
+      1108,
+    )
+
+    expect(state).toEqual({ lineIndex: 0, tokenIndex: 0 })
+  })
+
+  it('reports structured lyric content when token timing exists', () => {
+    expect(
+      hasStructuredLyricContent({
+        tokenLine: [{ token: [{ start: 100, value: 'a' }] }],
+      }),
+    ).toBe(true)
+  })
+})
--- a/ui/src/reducers/playerReducer.js
+++ b/ui/src/reducers/playerReducer.js
@ -7,6 +7,7 @@ import {
  PLAYER_CURRENT,
  PLAYER_PLAY_NEXT,
  PLAYER_PLAY_TRACKS,
+  PLAYER_UPDATE_LYRIC,
  PLAYER_SET_TRACK,
  PLAYER_SET_VOLUME,
  PLAYER_SYNC_QUEUE,
@ -60,21 +61,25 @@ const mapToAudioLists = (item) => {
  let lyricText = ''

  if (lyrics) {
-    const structured = JSON.parse(lyrics)
-    for (const structuredLyric of structured) {
-      if (structuredLyric.synced) {
-        for (const line of structuredLyric.line) {
-          let time = Math.floor(line.start / 10)
-          const ms = time % 100
-          time = Math.floor(time / 100)
-          const sec = time % 60
-          time = Math.floor(time / 60)
-          const min = time % 60
+    try {
+      const structured = JSON.parse(lyrics)
+      for (const structuredLyric of structured) {
+        if (structuredLyric.synced) {
+          for (const line of structuredLyric.line) {
+            let time = Math.floor(line.start / 10)
+            const ms = time % 100
+            time = Math.floor(time / 100)
+            const sec = time % 60
+            time = Math.floor(time / 60)
+            const min = time % 60

-          ms.toString()
-          lyricText += `[${pad(min)}:${pad(sec)}.${pad(ms)}] ${line.value}\n`
+            ms.toString()
+            lyricText += `[${pad(min)}:${pad(sec)}.${pad(ms)}] ${line.value}\n`
+          }
        }
      }
+    } catch {
+      lyricText = ''
    }
  }

@ -206,6 +211,45 @@ const reduceMode = (state, { data: { mode } }) => {
  }
 }

+const reduceUpdateLyric = (state, { data: { trackId, lyric } }) => {
+  if (!trackId) {
+    return state
+  }
+
+  let changed = false
+  const queue = state.queue.map((item) => {
+    if (item.trackId !== trackId) {
+      return item
+    }
+    if (item.lyric === lyric) {
+      return item
+    }
+    changed = true
+    return {
+      ...item,
+      lyric,
+    }
+  })
+
+  if (!changed) {
+    return state
+  }
+
+  const current =
+    state.current?.trackId === trackId
+      ? {
+          ...state.current,
+          lyric,
+        }
+      : state.current
+
+  return {
+    ...state,
+    queue,
+    current,
+  }
+}
+
 export const playerReducer = (previousState = initialState, payload) => {
  const { type } = payload
  switch (type) {
@ -243,6 +287,8 @@ export const playerReducer = (previousState = initialState, payload) => {
          previousState.savedPlayIndex >= 0 ? previousState.savedPlayIndex : 0,
      }
    }
+    case PLAYER_UPDATE_LYRIC:
+      return reduceUpdateLyric(previousState, payload)
    default:
      return previousState
  }
--- a/ui/src/reducers/playerReducer.test.js
+++ b/ui/src/reducers/playerReducer.test.js
@ -1,11 +1,24 @@
-import { describe, it, expect } from 'vitest'
+import { describe, expect, it, vi } from 'vitest'
 import { playerReducer } from './playerReducer'
 import {
-  PLAYER_SYNC_QUEUE,
  PLAYER_CURRENT,
  PLAYER_REFRESH_QUEUE,
+  PLAYER_SET_TRACK,
+  PLAYER_SYNC_QUEUE,
+  PLAYER_UPDATE_LYRIC,
 } from '../actions'

+// Pin uuid.v4 to a constant so any id generated while reducing is predictable.
+vi.mock('uuid', () => ({
+  v4: () => 'test-uuid',
+}))
+
+// Replace the subsonic module's default export with stubbed URL builders that
+// return fixed-shape strings.
+// NOTE(review): presumably needed because PLAYER_SET_TRACK builds stream and
+// cover-art URLs through this module — confirm against playerReducer.js.
+vi.mock('../subsonic', () => ({
+  default: {
+    streamUrl: vi.fn((id) => `/rest/stream?id=${id}`),
+    getCoverArtUrl: vi.fn(() => '/rest/getCoverArt?id=test'),
+  },
+}))
+
 describe('playerReducer', () => {
  describe('pending track selection survives SYNC_QUEUE and premature CURRENT', () => {
    // Simulates the real sequence when clicking a new song while one is playing:
@ -54,8 +67,6 @@ describe('playerReducer', () => {
    })

    it('CURRENT for old track preserves pending playIndex', () => {
-      // After SYNC_QUEUE, queue has new UUIDs. The old track's UUID (zzz)
-      // is at index 2, but playIndex is 0. This is a premature callback.
      const stateAfterSync = {
        ...stateAfterPlayTracks,
        queue: [
@ -71,7 +82,7 @@ describe('playerReducer', () => {
      const result = playerReducer(stateAfterSync, action)
      expect(result.playIndex).toBe(0)
      expect(result.clear).toBe(true)
-      expect(result.savedPlayIndex).toBe(2) // preserved from before
+      expect(result.savedPlayIndex).toBe(2)
    })

    it('CURRENT for correct track consumes pending playIndex', () => {
@ -83,7 +94,6 @@ describe('playerReducer', () => {
          { trackId: 's3', uuid: 'zzz', name: 'Song 3' },
        ],
      }
-      // Player switched to Song 1 (uuid 'xxx', index 0 == playIndex)
      const action = {
        type: PLAYER_CURRENT,
        data: { uuid: 'xxx', name: 'Song 1', volume: 1 },
@ -142,4 +152,80 @@ describe('playerReducer', () => {
      expect(result.playIndex).toBe(0)
    })
  })
+
+  // `lyrics` is handed to the reducer as a JSON string containing one synced
+  // and one unsynced track. Only the synced track is expected to show up in
+  // the resulting LRC text; the unsynced line must not appear at all.
+  it('maps embedded synced lyrics to LRC text', () => {
+    const lyrics = JSON.stringify([
+      {
+        lang: 'eng',
+        synced: true,
+        line: [{ start: 1000, value: 'Line one' }],
+      },
+      {
+        lang: 'eng',
+        synced: false,
+        line: [{ value: 'Unsynced line' }],
+      },
+    ])
+
+    const state = playerReducer(undefined, {
+      type: PLAYER_SET_TRACK,
+      data: {
+        id: 'song-1',
+        title: 'Test Song',
+        artist: 'Test Artist',
+        album: 'Test Album',
+        duration: 60,
+        lyrics,
+      },
+    })
+
+    expect(state.queue).toHaveLength(1)
+    expect(state.queue[0].lyric).toBe('[00:01.00] Line one\n')
+  })
+
+  // Seeds the queue through PLAYER_SET_TRACK (no lyrics supplied), then checks
+  // that PLAYER_UPDATE_LYRIC writes the new text onto the queued entry.
+  // NOTE(review): relies on the song `id` being stored as the queue entry's
+  // `trackId` — that mapping is not visible in this test file.
+  it('updates queue lyric by track id', () => {
+    const initial = playerReducer(undefined, {
+      type: PLAYER_SET_TRACK,
+      data: {
+        id: 'song-1',
+        title: 'Test Song',
+        artist: 'Test Artist',
+        album: 'Test Album',
+        duration: 60,
+      },
+    })
+
+    const updated = playerReducer(initial, {
+      type: PLAYER_UPDATE_LYRIC,
+      data: {
+        trackId: 'song-1',
+        lyric: '[00:01.00] Updated lyric\n',
+      },
+    })
+
+    expect(updated.queue[0].lyric).toBe('[00:01.00] Updated lyric\n')
+  })
+
+  // An update for a track that is not in the queue must be a true no-op:
+  // toBe checks reference identity, so the reducer has to return the very same
+  // state object rather than an equal copy.
+  it('returns same state when lyric update does not match any track', () => {
+    const initial = playerReducer(undefined, {
+      type: PLAYER_SET_TRACK,
+      data: {
+        id: 'song-1',
+        title: 'Test Song',
+        artist: 'Test Artist',
+        album: 'Test Album',
+        duration: 60,
+      },
+    })
+
+    const updated = playerReducer(initial, {
+      type: PLAYER_UPDATE_LYRIC,
+      data: {
+        trackId: 'missing-track',
+        lyric: '[00:01.00] Updated lyric\n',
+      },
+    })
+
+    expect(updated).toBe(initial)
+  })
 })
--- a/ui/src/subsonic/index.js
+++ b/ui/src/subsonic/index.js
@ -120,6 +120,10 @@ const getTopSongs = (artist, count = 50) => {
  return httpClient(url('getTopSongs', null, { artist, count }))
 }

+const getLyricsBySongId = (id) => {
+  return httpClient(url('getLyricsBySongId', id))
+}
+
 const streamUrl = (id, options) => {
  return baseUrl(
    url('stream', id, {
@ -149,4 +153,5 @@ export default {
  getArtistInfo,
  getTopSongs,
  getSimilarSongs2,
+  getLyricsBySongId,
 }
--- a/ui/src/subsonic/index.test.js
+++ b/ui/src/subsonic/index.test.js
@ -1,7 +1,12 @@
 import { vi } from 'vitest'
 import { COVER_ART_SIZE } from '../consts'
+import { httpClient } from '../dataProvider'
 import subsonic from './index'

+// Swap httpClient for a mock that resolves to an empty object, so tests can
+// inspect the URL it was called with instead of performing a request.
+vi.mock('../dataProvider', () => ({
+  httpClient: vi.fn(() => Promise.resolve({})),
+}))
+
 describe('getCoverArtUrl', () => {
  beforeEach(() => {
    // Mock window.location
@ -178,3 +183,29 @@ describe('getAvatarUrl', () => {
    expect(url).toContain('username=john')
  })
 })
+
+// Verifies that subsonic.getLyricsBySongId makes exactly one httpClient call
+// whose URL targets the getLyricsBySongId REST endpoint and carries the song
+// id as a query parameter.
+describe('getLyricsBySongId', () => {
+  beforeEach(() => {
+    // Reset recorded calls so the toHaveBeenCalledTimes(1) assertion counts
+    // only the call made by the test itself.
+    vi.clearAllMocks()
+    // Minimal localStorage stand-in: only getItem exists, returning fixed
+    // credentials for three known keys and null for any other key.
+    // NOTE(review): presumably the URL builder reads these keys — confirm.
+    const localStorageMock = {
+      getItem: vi.fn((key) => {
+        const values = {
+          username: 'testuser',
+          'subsonic-token': 'testtoken',
+          'subsonic-salt': 'testsalt',
+        }
+        return values[key] || null
+      }),
+    }
+    Object.defineProperty(window, 'localStorage', { value: localStorageMock })
+  })
+
+  it('calls the getLyricsBySongId endpoint', async () => {
+    await subsonic.getLyricsBySongId('song-1')
+
+    expect(httpClient).toHaveBeenCalledTimes(1)
+    // First positional argument of the first recorded call is the request URL.
+    const calledUrl = httpClient.mock.calls[0][0]
+    expect(calledUrl).toContain('/rest/getLyricsBySongId?')
+    expect(calledUrl).toContain('id=song-1')
+  })
+})