From 9dcec350566ce3fc8f3dc2b7d36420ec1527bfab Mon Sep 17 00:00:00 2001
From: ranokay <github@ranokay.com>
Date: Fri, 20 Feb 2026 16:54:45 +0200
Subject: [PATCH 01/14] feat: add TTML lyrics support with token-level karaoke
 and translation/pronunciation layers

Add a full TTML (Timed Text Markup Language) sidecar lyrics parser that extracts
word/syllable-level timing from <span> elements, plus translation and pronunciation
(transliteration) tracks from Apple Music TTML metadata sections.

Backend changes:
- TTML parser (core/lyrics/ttml.go) with support for all TTML time formats,
  nested timing contexts, and bare decimal second offsets
- Translation/pronunciation tracks resolved via key-based metadata linking
- Line timing hydration from token-level start/end values
- 'kind' field added to Lyrics model and StructuredLyric API response
  (main/translation/pronunciation)
- 'tokenLine' array in API response for word-level timing data
- UTF-8 BOM and UTF-16 LE encoding support for TTML files
- Fix for ambiguous time resolution in pronunciation spans (pre-1-minute)

Frontend changes:
- KaraokeLyricsOverlay rewritten with scrollable multi-line layout,
  word-level wipe highlighting with eased alpha transitions,
  rAF-driven playback clock with drift correction
- Inline translation (above) and pronunciation (below) for each main line,
  with smart filtering to hide redundant lines (same normalized text)
- TR/PR toggle buttons and layer selection via selectLyricLayers()
- Click-to-seek: click any lyric line to jump to that position
- Customization popover with font-size sliders and color presets
  for each line type (TR/Default/PR), persisted to localStorage
- Smooth font-size transition between active and inactive lines
- Resizable overlay height via drag handle
- lyrics.js: resolveKaraokeTokenWindow, buildSyntheticWordTokens,
  findLayerLineIndexForMain, token sorting, collapsed timing detection

API extension (non-breaking, additive):
- tokenLine[].token[] provides per-word start/end timing (ms)
- tokenLine[].index maps back to the corresponding line[] entry
- kind field: 'main', 'translation', 'pronunciation'
- Clients ignoring tokenLine/kind continue to work unchanged
---
 README.md                                   |    1 +
 conf/configuration.go                       |    2 +-
 core/lyrics/lyrics_test.go                  |   32 +-
 core/lyrics/sources.go                      |   28 +-
 core/lyrics/sources_test.go                 |   63 +
 core/lyrics/sources_ttml_test.go            |   92 ++
 core/lyrics/ttml.go                         |  886 +++++++++++++
 core/lyrics/ttml_test.go                    |  398 ++++++
 model/lyrics.go                             |   12 +-
 server/subsonic/helpers.go                  |   28 +
 server/subsonic/media_retrieval.go          |   39 +-
 server/subsonic/media_retrieval_test.go     |  301 +++++
 server/subsonic/opensubsonic.go             |    2 +-
 server/subsonic/opensubsonic_test.go        |    2 +-
 server/subsonic/responses/responses.go      |   29 +-
 tests/fixtures/bom-test.ttml                |    2 +
 tests/fixtures/bom-utf16-test.ttml          |  Bin 0 -> 414 bytes
 tests/fixtures/test-metadata.ttml           |   25 +
 tests/fixtures/test.ttml                    |   12 +
 ui/src/actions/player.js                    |    6 +
 ui/src/audioplayer/KaraokeLyricsOverlay.jsx | 1228 +++++++++++++++++++
 ui/src/audioplayer/Player.jsx               |  228 +++-
 ui/src/audioplayer/PlayerToolbar.jsx        |   31 +-
 ui/src/audioplayer/PlayerToolbar.test.jsx   |   19 +-
 ui/src/audioplayer/lyrics.js                |  617 ++++++++++
 ui/src/audioplayer/lyrics.test.js           |  416 +++++++
 ui/src/reducers/playerReducer.js            |   70 +-
 ui/src/reducers/playerReducer.test.js       |   98 +-
 ui/src/subsonic/index.js                    |    5 +
 ui/src/subsonic/index.test.js               |   60 +-
 30 files changed, 4651 insertions(+), 81 deletions(-)
 create mode 100644 core/lyrics/sources_ttml_test.go
 create mode 100644 core/lyrics/ttml.go
 create mode 100644 core/lyrics/ttml_test.go
 create mode 100644 tests/fixtures/bom-test.ttml
 create mode 100644 tests/fixtures/bom-utf16-test.ttml
 create mode 100644 tests/fixtures/test-metadata.ttml
 create mode 100644 tests/fixtures/test.ttml
 create mode 100644 ui/src/audioplayer/KaraokeLyricsOverlay.jsx
 create mode 100644 ui/src/audioplayer/lyrics.js
 create mode 100644 ui/src/audioplayer/lyrics.test.js
diff --git a/README.md b/README.md
index 0ae5bdfaf..6b9aff799 100644
--- a/README.md
+++ b/README.md
@@ -52,6 +52,7 @@ A share of the revenue helps fund the development of Navidrome at no additional
  - **Multi-platform**, runs on macOS, Linux and Windows. **Docker** images are also provided
  - Ready to use binaries for all major platforms, including **Raspberry Pi**
  - Automatically **monitors your library** for changes, importing new files and reloading new metadata 
+ - Supports synchronized lyrics from sidecar **.lrc** and **.ttml** files (via `lyricspriority`)
  - **Themeable**, modern and responsive **Web interface** based on [Material UI](https://material-ui.com)
  - **Compatible** with all Subsonic/Madsonic/Airsonic [clients](https://www.navidrome.org/docs/overview/#apps)
  - **Transcoding** on the fly. Can be set per user/player. **Opus encoding is supported**
diff --git a/conf/configuration.go b/conf/configuration.go
index a8b0e4c8a..af9f6c283 100644
--- a/conf/configuration.go
+++ b/conf/configuration.go
@@ -730,7 +730,7 @@ func setViperDefaults() {
 	viper.SetDefault("artistartpriority", "artist.*, album/artist.*, external")
 	viper.SetDefault("artistimagefolder", "")
 	viper.SetDefault("discartpriority", "disc*.*, cd*.*, cover.*, folder.*, front.*, discsubtitle, embedded")
-	viper.SetDefault("lyricspriority", ".lrc,.txt,embedded")
+	viper.SetDefault("lyricspriority", ".lrc,.ttml,.txt,embedded")
 	viper.SetDefault("enablegravatar", false)
 	viper.SetDefault("enablefavourites", true)
 	viper.SetDefault("enablestarrating", true)
diff --git a/core/lyrics/lyrics_test.go b/core/lyrics/lyrics_test.go
index 2e495a714..d5f79a4d0 100644
--- a/core/lyrics/lyrics_test.go
+++ b/core/lyrics/lyrics_test.go
@@ -44,6 +44,35 @@ var _ = Describe("sources", func() {
 		},
 	}
 
+	ttmlLyrics := model.LyricList{
+		model.Lyrics{
+			Kind: "main",
+			Lang: "eng",
+			Line: []model.Line{
+				{
+					Start: gg.P(int64(18800)),
+					Value: "We're no strangers to love",
+				},
+				{
+					Start: gg.P(int64(22800)),
+					Value: "You know the rules and so do I",
+				},
+			},
+			Synced: true,
+		},
+		model.Lyrics{
+			Kind: "main",
+			Lang: "por",
+			Line: []model.Line{
+				{
+					Start: gg.P(int64(18800)),
+					Value: "Nao somos estranhos ao amor",
+				},
+			},
+			Synced: true,
+		},
+	}
+
 	unsyncedLyrics := model.LyricList{
 		model.Lyrics{
 			Lang: "xxx",
@@ -80,7 +109,8 @@ var _ = Describe("sources", func() {
 	},
 		Entry("embedded > lrc > txt", "embedded,.lrc,.txt", embeddedLyrics),
 		Entry("lrc > embedded > txt", ".lrc,embedded,.txt", syncedLyrics),
-		Entry("txt > lrc > embedded", ".txt,.lrc,embedded", unsyncedLyrics))
+		Entry("txt > lrc > embedded", ".txt,.lrc,embedded", unsyncedLyrics),
+		Entry("ttml > lrc > embedded", ".ttml,.lrc,embedded", ttmlLyrics))
 
 	Context("Errors", func() {
 		var RegularUserContext = XContext
diff --git a/core/lyrics/sources.go b/core/lyrics/sources.go
index 82a10ca41..38a71cb8a 100644
--- a/core/lyrics/sources.go
+++ b/core/lyrics/sources.go
@@ -5,6 +5,7 @@ import (
 	"errors"
 	"os"
 	"path"
+	"strings"
 
 	"github.com/navidrome/navidrome/log"
 	"github.com/navidrome/navidrome/model"
@@ -36,18 +37,31 @@ func fromExternalFile(ctx context.Context, mf *model.MediaFile, suffix string) (
 		return nil, err
 	}
 
-	lyrics, err := model.ToLyrics("xxx", string(contents))
-	if err != nil {
-		log.Error(ctx, "error parsing lyric external file", "path", externalLyric, err)
-		return nil, err
-	} else if lyrics == nil {
+	var list model.LyricList
+	if strings.EqualFold(suffix, ".ttml") {
+		list, err = parseTTML(contents)
+		if err != nil {
+			log.Error(ctx, "error parsing ttml external file", "path", externalLyric, err)
+			return nil, err
+		}
+	} else {
+		lyrics, err := model.ToLyrics("xxx", string(contents))
+		if err != nil {
+			log.Error(ctx, "error parsing lyric external file", "path", externalLyric, err)
+			return nil, err
+		}
+		if lyrics != nil {
+			list = model.LyricList{*lyrics}
+		}
+	}
+
+	if len(list) == 0 {
 		log.Trace(ctx, "empty lyrics from external file", "path", externalLyric)
 		return nil, nil
 	}
 
 	log.Trace(ctx, "retrieved lyrics from external file", "path", externalLyric)
-
-	return model.LyricList{*lyrics}, nil
+	return list, nil
 }
 
 // fromPlugin attempts to load lyrics from a plugin with the given name.
diff --git a/core/lyrics/sources_test.go b/core/lyrics/sources_test.go
index b3d502101..8823a3175 100644
--- a/core/lyrics/sources_test.go
+++ b/core/lyrics/sources_test.go
@@ -109,6 +109,41 @@ var _ = Describe("sources", func() {
 			}))
 		})
 
+		It("should return synchronized multilingual lyrics from a TTML file", func() {
+			mf := model.MediaFile{Path: "tests/fixtures/test.mp3"}
+			lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
+
+			Expect(err).To(BeNil())
+			Expect(lyrics).To(Equal(model.LyricList{
+				{
+					Kind: "main",
+					Lang: "eng",
+					Line: []model.Line{
+						{
+							Start: gg.P(int64(18800)),
+							Value: "We're no strangers to love",
+						},
+						{
+							Start: gg.P(int64(22800)),
+							Value: "You know the rules and so do I",
+						},
+					},
+					Synced: true,
+				},
+				{
+					Kind: "main",
+					Lang: "por",
+					Line: []model.Line{
+						{
+							Start: gg.P(int64(18800)),
+							Value: "Nao somos estranhos ao amor",
+						},
+					},
+					Synced: true,
+				},
+			}))
+		})
+
 		It("should handle LRC files with UTF-8 BOM marker (issue #4631)", func() {
 			// The function looks for <basePath-without-ext><suffix>, so we need to pass
 			// a MediaFile with .mp3 path and look for .lrc suffix
@@ -142,5 +177,33 @@ var _ = Describe("sources", func() {
 			Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(22801))))
 			Expect(lyrics[0].Line[1].Value).To(Equal("You know the rules and so do I"))
 		})
+
+		It("should handle TTML files with UTF-8 BOM marker", func() {
+			mf := model.MediaFile{Path: "tests/fixtures/bom-test.mp3"}
+			lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
+
+			Expect(err).To(BeNil())
+			Expect(lyrics).To(HaveLen(1))
+			Expect(lyrics[0].Kind).To(Equal("main"))
+			Expect(lyrics[0].Synced).To(BeTrue())
+			Expect(lyrics[0].Line).To(HaveLen(1))
+			Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(0))))
+			Expect(lyrics[0].Line[0].Value).To(Equal("BOM test line"))
+		})
+
+		It("should handle UTF-16 LE encoded TTML files", func() {
+			mf := model.MediaFile{Path: "tests/fixtures/bom-utf16-test.mp3"}
+			lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
+
+			Expect(err).To(BeNil())
+			Expect(lyrics).To(HaveLen(1))
+			Expect(lyrics[0].Kind).To(Equal("main"))
+			Expect(lyrics[0].Synced).To(BeTrue())
+			Expect(lyrics[0].Line).To(HaveLen(2))
+			Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(18800))))
+			Expect(lyrics[0].Line[0].Value).To(Equal("UTF16 line one"))
+			Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(22801))))
+			Expect(lyrics[0].Line[1].Value).To(Equal("UTF16 line two"))
+		})
 	})
 })
diff --git a/core/lyrics/sources_ttml_test.go b/core/lyrics/sources_ttml_test.go
new file mode 100644
index 000000000..217bf7b36
--- /dev/null
+++ b/core/lyrics/sources_ttml_test.go
@@ -0,0 +1,92 @@
+package lyrics
+
+import (
+	"context"
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/navidrome/navidrome/model"
+)
+
+func TestFromExternalFileTTML(t *testing.T) {
+	ctx := context.Background()
+	mf := model.MediaFile{Path: fixturePath("test.mp3")}
+
+	lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
+	if err != nil {
+		t.Fatalf("fromExternalFile returned error: %v", err)
+	}
+	if len(lyrics) != 2 {
+		t.Fatalf("expected 2 lyric tracks, got %d", len(lyrics))
+	}
+	if lyrics[0].Lang != "eng" {
+		t.Fatalf("expected first language 'eng', got %q", lyrics[0].Lang)
+	}
+	if len(lyrics[0].Line) != 2 {
+		t.Fatalf("expected 2 english lines, got %d", len(lyrics[0].Line))
+	}
+	if lyrics[0].Line[0].Start == nil || *lyrics[0].Line[0].Start != 18800 {
+		t.Fatalf("expected first english line start to be 18800, got %v", lyrics[0].Line[0].Start)
+	}
+}
+
+func TestFromExternalFileTTMLWithUTF8BOM(t *testing.T) {
+	ctx := context.Background()
+	mf := model.MediaFile{Path: fixturePath("bom-test.ttml")}
+
+	lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
+	if err != nil {
+		t.Fatalf("fromExternalFile returned error: %v", err)
+	}
+	if len(lyrics) != 1 {
+		t.Fatalf("expected 1 lyric track, got %d", len(lyrics))
+	}
+	if !lyrics[0].Synced {
+		t.Fatal("expected BOM TTML lyrics to be synced")
+	}
+	if len(lyrics[0].Line) != 1 {
+		t.Fatalf("expected 1 lyric line, got %d", len(lyrics[0].Line))
+	}
+	if lyrics[0].Line[0].Start == nil || *lyrics[0].Line[0].Start != 0 {
+		t.Fatalf("expected first line start 0, got %v", lyrics[0].Line[0].Start)
+	}
+}
+
+func TestFromExternalFileTTMLUTF16(t *testing.T) {
+	ctx := context.Background()
+	mf := model.MediaFile{Path: fixturePath("bom-utf16-test.ttml")}
+
+	lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
+	if err != nil {
+		t.Fatalf("fromExternalFile returned error: %v", err)
+	}
+	if len(lyrics) != 1 {
+		t.Fatalf("expected 1 lyric track, got %d", len(lyrics))
+	}
+	if !lyrics[0].Synced {
+		t.Fatal("expected UTF16 TTML lyrics to be synced")
+	}
+	if len(lyrics[0].Line) != 2 {
+		t.Fatalf("expected 2 lyric lines, got %d", len(lyrics[0].Line))
+	}
+	if lyrics[0].Line[0].Start == nil || *lyrics[0].Line[0].Start != 18800 {
+		t.Fatalf("expected first line start 18800, got %v", lyrics[0].Line[0].Start)
+	}
+	if lyrics[0].Line[1].Start == nil || *lyrics[0].Line[1].Start != 22801 {
+		t.Fatalf("expected second line start 22801, got %v", lyrics[0].Line[1].Start)
+	}
+}
+
+// fixturePath locates the named fixture under tests/fixtures, probing the
+// working directory and then two levels up; it defaults to the former.
+func fixturePath(name string) string {
+	fromCwd := filepath.Join("tests", "fixtures", name)
+	fromPkg := filepath.Join("..", "..", "tests", "fixtures", name)
+	for _, path := range [...]string{fromCwd, fromPkg} {
+		if _, statErr := os.Stat(path); statErr == nil {
+			return path
+		}
+	}
+	return fromCwd
+}
diff --git a/core/lyrics/ttml.go b/core/lyrics/ttml.go
new file mode 100644
index 000000000..3aae53aa0
--- /dev/null
+++ b/core/lyrics/ttml.go
@@ -0,0 +1,886 @@
+package lyrics
+
+import (
+	"bytes"
+	"encoding/xml"
+	"errors"
+	"io"
+	"math"
+	"regexp"
+	"sort"
+	"strconv"
+	"strings"
+
+	"github.com/navidrome/navidrome/log"
+	"github.com/navidrome/navidrome/model"
+	"github.com/navidrome/navidrome/utils/str"
+)
+
+const (
+	defaultTTMLFrameRate    = 30.0
+	defaultTTMLSubFrameRate = 1.0
+	defaultTTMLTickRate     = 1.0
+
+	ttmlLyricKindMain          = "main"
+	ttmlLyricKindTranslation   = "translation"
+	ttmlLyricKindPronunciation = "pronunciation"
+)
+
+var offsetTimeRegex = regexp.MustCompile(`^([0-9]+(?:\.[0-9]+)?)(h|m|s|ms|f|t)$`)
+var xmlEncodingRegex = regexp.MustCompile(`(?i)<\?xml([^>]*?)encoding\s*=\s*["'][^"']+["']([^>]*)\?>`)
+
+type ttmlTimeKind int
+
+const (
+	ttmlTimeAbsolute ttmlTimeKind = iota
+	ttmlTimeOffset
+	ttmlTimeAmbiguous
+)
+
+type ttmlTimingParams struct {
+	frameRate    float64
+	subFrameRate float64
+	tickRate     float64
+}
+
+type ttmlTimingContext struct {
+	lang     string
+	role     string
+	begin    int64
+	hasBegin bool
+	end      int64
+	hasEnd   bool
+	invalid  bool
+}
+
+type ttmlLineRef struct {
+	order int
+	line  model.Line
+}
+
+type ttmlMetadataEntry struct {
+	key  string
+	line model.Line
+	seq  int
+}
+
+type ttmlResolvedMetadataLine struct {
+	order int
+	seq   int
+	line  model.Line
+}
+
+type ttmlParser struct {
+	decoder *xml.Decoder
+	params  ttmlTimingParams
+
+	mainLangOrder   []string
+	mainLinesByLang map[string][]model.Line
+
+	mainLineRefsByKey map[string]ttmlLineRef
+	mainLineOrder     int
+
+	translationLangOrder   []string
+	translationEntriesByLg map[string][]ttmlMetadataEntry
+
+	pronunciationLangOrder   []string
+	pronunciationEntriesByLg map[string][]ttmlMetadataEntry
+
+	metadataSeq int
+}
+
+// parseTTML decodes a TTML document into lyric tracks: main lines grouped by
+// language, then translation and pronunciation tracks (see toLyricList). Any
+// XML error from the decoder or from parseElement is returned unchanged.
+func parseTTML(contents []byte) (model.LyricList, error) {
+	// Force the declared encoding to UTF-8 so encoding/xml never asks for a
+	// CharsetReader. The groups must be written ${1}/${2}: a bare "$1encoding"
+	// is read by regexp as the (missing) group "1encoding" and expands to "".
+	contents = xmlEncodingRegex.ReplaceAll(contents, []byte(`<?xml${1}encoding="UTF-8"${2}?>`))
+
+	p := ttmlParser{
+		decoder: xml.NewDecoder(bytes.NewReader(contents)),
+		params: ttmlTimingParams{
+			frameRate:    defaultTTMLFrameRate,
+			subFrameRate: defaultTTMLSubFrameRate,
+			tickRate:     defaultTTMLTickRate,
+		},
+		mainLinesByLang:          make(map[string][]model.Line),
+		mainLineRefsByKey:        make(map[string]ttmlLineRef),
+		translationEntriesByLg:   make(map[string][]ttmlMetadataEntry),
+		pronunciationEntriesByLg: make(map[string][]ttmlMetadataEntry),
+	}
+
+	// "xxx" is the language used until an element supplies its own lang.
+	root := ttmlTimingContext{lang: "xxx"}
+	for {
+		token, err := p.decoder.Token()
+		if errors.Is(err, io.EOF) {
+			return p.toLyricList(), nil
+		}
+		if err != nil {
+			return nil, err
+		}
+		if start, ok := token.(xml.StartElement); ok {
+			if err := p.parseElement(start, root); err != nil {
+				return nil, err
+			}
+		}
+	}
+}
+
+func (p *ttmlParser) parseElement(start xml.StartElement, parent ttmlTimingContext) error {
+	local := strings.ToLower(start.Name.Local)
+	if local == "tt" {
+		p.updateTimingParams(start.Attr)
+	}
+
+	switch local {
+	case "translation":
+		return p.parseMetadataTrack(start, parent, ttmlLyricKindTranslation)
+	case "transliteration":
+		return p.parseMetadataTrack(start, parent, ttmlLyricKindPronunciation)
+	}
+
+	ctx := p.childContext(start.Attr, parent)
+	if local == "p" {
+		lineText, tokens, err := p.parseParagraph(ctx)
+		if err != nil {
+			return err
+		}
+		if ctx.invalid || lineText == "" {
+			return nil
+		}
+
+		parsedLine := model.Line{Value: lineText}
+		if ctx.hasBegin {
+			startMs := ctx.begin
+			parsedLine.Start = &startMs
+		}
+		if ctx.hasEnd {
+			endMs := ctx.end
+			parsedLine.End = &endMs
+		}
+		if len(tokens) > 0 {
+			parsedLine.Token = tokens
+		}
+		parsedLine = hydrateLineTimingFromTokens(parsedLine)
+
+		lineKey, _ := attrValue(start.Attr, "key")
+		p.addMainLine(ctx.lang, lineKey, parsedLine)
+		return nil
+	}
+
+	for {
+		token, err := p.decoder.Token()
+		if err != nil {
+			return err
+		}
+
+		switch t := token.(type) {
+		case xml.StartElement:
+			nextParent := ctx
+			if ctx.invalid {
+				// Best effort: ignore invalid timing in container elements, and
+				// continue traversing descendants with parent context.
+				nextParent = parent
+			}
+			if err := p.parseElement(t, nextParent); err != nil {
+				return err
+			}
+		case xml.EndElement:
+			if strings.EqualFold(t.Name.Local, start.Name.Local) {
+				return nil
+			}
+		}
+	}
+}
+
+func (p *ttmlParser) parseMetadataTrack(start xml.StartElement, parent ttmlTimingContext, kind string) error {
+	ctx := p.childContext(start.Attr, parent)
+	lang := normalizeTTMLLang(ctx.lang)
+
+	for {
+		token, err := p.decoder.Token()
+		if err != nil {
+			return err
+		}
+
+		switch t := token.(type) {
+		case xml.StartElement:
+			if strings.EqualFold(t.Name.Local, "text") {
+				entry, ok, err := p.parseMetadataText(t, ctx)
+				if err != nil {
+					return err
+				}
+				if ok {
+					p.addMetadataEntry(kind, lang, entry)
+				}
+				continue
+			}
+
+			nextParent := ctx
+			if ctx.invalid {
+				nextParent = parent
+			}
+			if err := p.parseElement(t, nextParent); err != nil {
+				return err
+			}
+		case xml.EndElement:
+			if strings.EqualFold(t.Name.Local, start.Name.Local) {
+				return nil
+			}
+		}
+	}
+}
+
+func (p *ttmlParser) parseMetadataText(start xml.StartElement, parent ttmlTimingContext) (ttmlMetadataEntry, bool, error) {
+	forKey, hasFor := attrValue(start.Attr, "for")
+	forKey = strings.TrimSpace(forKey)
+
+	value, tokens, err := p.parseInlineElement(start, parent)
+	if err != nil {
+		return ttmlMetadataEntry{}, false, err
+	}
+	if !hasFor || forKey == "" {
+		return ttmlMetadataEntry{}, false, nil
+	}
+
+	ctx := p.childContext(start.Attr, parent)
+	if ctx.invalid {
+		return ttmlMetadataEntry{}, false, nil
+	}
+
+	line := model.Line{Value: sanitizeTTMLText(value)}
+	if ctx.hasBegin {
+		startMs := ctx.begin
+		line.Start = &startMs
+	}
+	if ctx.hasEnd {
+		endMs := ctx.end
+		line.End = &endMs
+	}
+	if len(tokens) > 0 {
+		line.Token = tokens
+	}
+	line = hydrateLineTimingFromTokens(line)
+
+	if line.Value == "" && len(line.Token) == 0 {
+		return ttmlMetadataEntry{}, false, nil
+	}
+
+	return ttmlMetadataEntry{key: forKey, line: line}, true, nil
+}
+
+func (p *ttmlParser) parseParagraph(parent ttmlTimingContext) (string, []model.Token, error) {
+	var text strings.Builder
+	var tokens []model.Token
+
+	for {
+		token, err := p.decoder.Token()
+		if err != nil {
+			return "", nil, err
+		}
+
+		switch t := token.(type) {
+		case xml.StartElement:
+			value, inlineTokens, err := p.parseInlineElement(t, parent)
+			if err != nil {
+				return "", nil, err
+			}
+			text.WriteString(value)
+			tokens = append(tokens, inlineTokens...)
+		case xml.EndElement:
+			if strings.EqualFold(t.Name.Local, "p") {
+				return sanitizeTTMLText(text.String()), tokens, nil
+			}
+		case xml.CharData:
+			text.WriteString(string(t))
+		}
+	}
+}
+
+func (p *ttmlParser) parseInlineElement(start xml.StartElement, parent ttmlTimingContext) (string, []model.Token, error) {
+	local := strings.ToLower(start.Name.Local)
+	if local == "br" {
+		return "\n", nil, nil
+	}
+
+	ctx := p.childContext(start.Attr, parent)
+	_, hasBegin := attrValue(start.Attr, "begin")
+	_, hasEnd := attrValue(start.Attr, "end")
+	_, hasDur := attrValue(start.Attr, "dur")
+	hasOwnTiming := hasBegin || hasEnd || hasDur
+
+	var text strings.Builder
+	var tokens []model.Token
+
+	for {
+		token, err := p.decoder.Token()
+		if err != nil {
+			return "", nil, err
+		}
+
+		switch t := token.(type) {
+		case xml.StartElement:
+			value, inlineTokens, err := p.parseInlineElement(t, ctx)
+			if err != nil {
+				return "", nil, err
+			}
+			text.WriteString(value)
+			tokens = append(tokens, inlineTokens...)
+		case xml.EndElement:
+			if !strings.EqualFold(t.Name.Local, start.Name.Local) {
+				continue
+			}
+
+			value := text.String()
+			tokenText := sanitizeTTMLText(value)
+			if local == "span" && hasOwnTiming && !ctx.invalid && tokenText != "" && len(tokens) == 0 {
+				parsedToken := model.Token{
+					Value: tokenText,
+					Role:  ctx.role,
+				}
+				if ctx.hasBegin {
+					startMs := ctx.begin
+					parsedToken.Start = &startMs
+				}
+				if ctx.hasEnd {
+					endMs := ctx.end
+					parsedToken.End = &endMs
+				}
+				tokens = append(tokens, parsedToken)
+			}
+
+			return value, tokens, nil
+		case xml.CharData:
+			text.WriteString(string(t))
+		}
+	}
+}
+
+// toLyricList assembles the result: one main track per language in first-seen
+// order (languages with no lines are skipped), followed by the resolved
+// translation tracks and then the resolved pronunciation tracks.
+func (p *ttmlParser) toLyricList() model.LyricList {
+	capacity := len(p.mainLangOrder) + len(p.translationLangOrder) + len(p.pronunciationLangOrder)
+	tracks := make(model.LyricList, 0, capacity)
+
+	for _, lang := range p.mainLangOrder {
+		if mainLines := p.mainLinesByLang[lang]; len(mainLines) > 0 {
+			track := model.Lyrics{Kind: ttmlLyricKindMain, Lang: lang, Line: mainLines}
+			track.Synced = linesAreSynced(mainLines)
+			tracks = append(tracks, track)
+		}
+	}
+
+	translations := p.buildMetadataLyrics(ttmlLyricKindTranslation, p.translationLangOrder, p.translationEntriesByLg)
+	pronunciations := p.buildMetadataLyrics(ttmlLyricKindPronunciation, p.pronunciationLangOrder, p.pronunciationEntriesByLg)
+	return append(append(tracks, translations...), pronunciations...)
+}
+
+func (p *ttmlParser) buildMetadataLyrics(kind string, langOrder []string, entriesByLang map[string][]ttmlMetadataEntry) model.LyricList {
+	res := make(model.LyricList, 0, len(langOrder))
+
+	for _, lang := range langOrder {
+		entries := entriesByLang[lang]
+		if len(entries) == 0 {
+			continue
+		}
+
+		seenKeys := make(map[string]struct{}, len(entries))
+		resolved := make([]ttmlResolvedMetadataLine, 0, len(entries))
+		for _, entry := range entries {
+			if _, exists := seenKeys[entry.key]; exists {
+				continue
+			}
+			seenKeys[entry.key] = struct{}{}
+
+			ref, ok := p.mainLineRefsByKey[entry.key]
+			if !ok {
+				log.Warn("Skipping TTML metadata line without matching key", "kind", kind, "lang", lang, "key", entry.key)
+				continue
+			}
+
+			line := entry.line
+			if line.Start == nil && ref.line.Start != nil {
+				startMs := *ref.line.Start
+				line.Start = &startMs
+			}
+			if line.End == nil && ref.line.End != nil {
+				endMs := *ref.line.End
+				line.End = &endMs
+			}
+			line = hydrateLineTimingFromTokens(line)
+
+			if line.Value == "" && len(line.Token) == 0 {
+				continue
+			}
+
+			resolved = append(resolved, ttmlResolvedMetadataLine{
+				order: ref.order,
+				seq:   entry.seq,
+				line:  line,
+			})
+		}
+
+		if len(resolved) == 0 {
+			continue
+		}
+
+		sort.SliceStable(resolved, func(i, j int) bool {
+			if resolved[i].order != resolved[j].order {
+				return resolved[i].order < resolved[j].order
+			}
+			return resolved[i].seq < resolved[j].seq
+		})
+
+		lines := make([]model.Line, len(resolved))
+		for i := range resolved {
+			lines[i] = resolved[i].line
+		}
+
+		res = append(res, model.Lyrics{
+			Kind:   kind,
+			Lang:   lang,
+			Line:   lines,
+			Synced: linesAreSynced(lines),
+		})
+	}
+
+	return res
+}
+
+// addMainLine appends line to lang's main track, assigns it the next global
+// order slot, and registers the first line seen for each non-blank lineKey.
+func (p *ttmlParser) addMainLine(lang string, lineKey string, line model.Line) {
+	position := p.mainLineOrder
+	p.mainLineOrder++
+	lang = normalizeTTMLLang(lang)
+	existing, known := p.mainLinesByLang[lang]
+	if !known {
+		p.mainLangOrder = append(p.mainLangOrder, lang)
+	}
+	p.mainLinesByLang[lang] = append(existing, line)
+
+	if key := strings.TrimSpace(lineKey); key != "" {
+		if _, taken := p.mainLineRefsByKey[key]; !taken {
+			p.mainLineRefsByKey[key] = ttmlLineRef{order: position, line: line}
+		}
+	}
+}
+
+func (p *ttmlParser) addMetadataEntry(kind string, lang string, entry ttmlMetadataEntry) {
+	lang = normalizeTTMLLang(lang)
+	entry.seq = p.metadataSeq
+	p.metadataSeq++
+
+	switch kind {
+	case ttmlLyricKindTranslation:
+		if _, ok := p.translationEntriesByLg[lang]; !ok {
+			p.translationLangOrder = append(p.translationLangOrder, lang)
+		}
+		p.translationEntriesByLg[lang] = append(p.translationEntriesByLg[lang], entry)
+	case ttmlLyricKindPronunciation:
+		if _, ok := p.pronunciationEntriesByLg[lang]; !ok {
+			p.pronunciationLangOrder = append(p.pronunciationLangOrder, lang)
+		}
+		p.pronunciationEntriesByLg[lang] = append(p.pronunciationEntriesByLg[lang], entry)
+	}
+}
+
+func (p *ttmlParser) childContext(attrs []xml.Attr, parent ttmlTimingContext) ttmlTimingContext {
+	ctx := parent
+
+	if lang, ok := attrValue(attrs, "lang"); ok {
+		ctx.lang = normalizeTTMLLang(lang)
+	}
+	if role, ok := attrValue(attrs, "role"); ok {
+		role = strings.TrimSpace(role)
+		if role != "" {
+			if ctx.role == "" {
+				ctx.role = role
+			} else if !strings.Contains(ctx.role, role) {
+				ctx.role = ctx.role + " " + role
+			}
+		}
+	}
+
+	beginExpr, hasBegin := attrValue(attrs, "begin")
+	endExpr, hasEnd := attrValue(attrs, "end")
+	durExpr, hasDur := attrValue(attrs, "dur")
+
+	if hasBegin {
+		begin, kind, ok := parseTTMLTimeExpression(beginExpr, p.params)
+		if !ok {
+			ctx.invalid = true
+			return ctx
+		}
+
+		base := int64(0)
+		if parent.hasBegin {
+			base = parent.begin
+		}
+		ctx.begin = resolveTTMLTime(begin, kind, base, parent)
+		ctx.hasBegin = true
+	} else {
+		ctx.begin = parent.begin
+		ctx.hasBegin = parent.hasBegin
+	}
+
+	var calculatedEnd int64
+	calculatedHasEnd := false
+
+	if hasEnd {
+		end, kind, ok := parseTTMLTimeExpression(endExpr, p.params)
+		if !ok {
+			ctx.invalid = true
+			return ctx
+		}
+
+		base := ctx.begin
+		if !ctx.hasBegin {
+			base = parent.begin
+		}
+		calculatedEnd = resolveTTMLTime(end, kind, base, parent)
+		calculatedHasEnd = true
+	}
+
+	if hasDur {
+		dur, ok := parseTTMLDurationExpression(durExpr, p.params)
+		if !ok {
+			ctx.invalid = true
+			return ctx
+		}
+		if ctx.hasBegin {
+			durEnd := ctx.begin + dur
+			if !calculatedHasEnd || durEnd < calculatedEnd {
+				calculatedEnd = durEnd
+				calculatedHasEnd = true
+			}
+		}
+	}
+
+	if !calculatedHasEnd && parent.hasEnd {
+		calculatedEnd = parent.end
+		calculatedHasEnd = true
+	}
+
+	ctx.end = calculatedEnd
+	ctx.hasEnd = calculatedHasEnd
+	return ctx
+}
+
+// updateTimingParams reads frameRate, frameRateMultiplier, subFrameRate and
+// tickRate from attrs (the <tt> element's attributes) into p.params. A missing,
+// unparsable or non-positive attribute keeps the value already in p.params.
+func (p *ttmlParser) updateTimingParams(attrs []xml.Attr) {
+	// positive returns the named attribute as a float when it parses to a
+	// value > 0, and fallback otherwise.
+	positive := func(name string, fallback float64) float64 {
+		value, ok := attrValue(attrs, name)
+		if !ok {
+			return fallback
+		}
+		if parsed, err := strconv.ParseFloat(value, 64); err == nil && parsed > 0 {
+			return parsed
+		}
+		return fallback
+	}
+
+	frameRate := positive("frameRate", p.params.frameRate)
+	// frameRateMultiplier is "numerator denominator"; both must be positive so
+	// the effective rate stays usable as a divisor.
+	if value, ok := attrValue(attrs, "frameRateMultiplier"); ok {
+		parts := strings.Fields(value)
+		if len(parts) == 2 {
+			numerator, errA := strconv.ParseFloat(parts[0], 64)
+			denominator, errB := strconv.ParseFloat(parts[1], 64)
+			if errA == nil && errB == nil && numerator > 0 && denominator > 0 {
+				frameRate *= numerator / denominator
+			}
+		}
+	}
+
+	// Store the parsed rates as-is. Clamping them with max(rate, default)
+	// silently turned valid sub-default rates (24/25 fps, 29.97) into 30.
+	p.params.frameRate = frameRate
+	p.params.subFrameRate = positive("subFrameRate", p.params.subFrameRate)
+	p.params.tickRate = positive("tickRate", p.params.tickRate)
+}
+
+func parseTTMLDurationExpression(expr string, params ttmlTimingParams) (int64, bool) {
+	value, _, ok := parseTTMLTimeExpression(expr, params)
+	return value, ok
+}
+
+func resolveTTMLTime(value int64, kind ttmlTimeKind, base int64, parent ttmlTimingContext) int64 {
+	switch kind {
+	case ttmlTimeAbsolute:
+		return value
+	case ttmlTimeOffset:
+		return base + value
+	case ttmlTimeAmbiguous:
+		absolute := value
+		offset := base + value
+
+		// No parent timing context → no reference frame for offsets.
+		// Prefer absolute when offset differs (i.e., base > 0).
+		if !parent.hasBegin && !parent.hasEnd && base != 0 {
+			return absolute
+		}
+
+		if parent.hasBegin && parent.hasEnd {
+			absoluteInParent := absolute >= parent.begin && absolute <= parent.end
+			offsetInParent := offset >= parent.begin && offset <= parent.end
+			if absoluteInParent && !offsetInParent {
+				return absolute
+			}
+			if offsetInParent && !absoluteInParent {
+				return offset
+			}
+		}
+
+		if parent.hasBegin {
+			if absolute < parent.begin && offset >= parent.begin {
+				return offset
+			}
+			if absolute >= parent.begin && offset > absolute {
+				return absolute
+			}
+		}
+		return offset
+	default:
+		return base + value
+	}
+}
+
+// parseTTMLTimeExpression parses a TTML begin/end/dur value into milliseconds,
+// reporting how to resolve it (absolute, offset or ambiguous) and its validity.
+func parseTTMLTimeExpression(expr string, params ttmlTimingParams) (int64, ttmlTimeKind, bool) {
+	expr = strings.TrimSpace(expr)
+	if expr == "" {
+		return 0, ttmlTimeOffset, false
+	}
+
+	lower := strings.ToLower(expr)
+	if strings.Contains(lower, "wallclock(") ||
+		strings.Contains(lower, ".begin") ||
+		strings.Contains(lower, ".end") {
+		log.Warn("Unsupported TTML time expression", "value", expr)
+		return 0, ttmlTimeOffset, false
+	}
+
+	// Best-effort support for non-standard TTML seen in the wild where a
+	// bare decimal value is used (implicitly seconds), e.g. "0.170". The range
+	// check rejects "inf" and huge exponents, which ParseFloat also accepts.
+	if value, err := strconv.ParseFloat(lower, 64); err == nil && value >= 0 && value*1000 < math.MaxInt64 {
+		return int64(math.Round(value * 1000)), ttmlTimeAmbiguous, true
+	}
+
+	if matches := offsetTimeRegex.FindStringSubmatch(lower); len(matches) == 3 {
+		value, err := strconv.ParseFloat(matches[1], 64)
+		if err != nil {
+			return 0, ttmlTimeOffset, false
+		}
+		seconds := 0.0
+		switch matches[2] {
+		case "h":
+			seconds = value * 60 * 60
+		case "m":
+			seconds = value * 60
+		case "s":
+			seconds = value
+		case "ms":
+			seconds = value / 1000
+		case "f":
+			seconds = value / params.frameRate
+		case "t":
+			seconds = value / params.tickRate
+		default:
+			return 0, ttmlTimeOffset, false
+		}
+		if ms := math.Round(seconds * 1000); ms >= 0 && ms < math.MaxInt64 {
+			return int64(ms), ttmlTimeOffset, true
+		}
+		return 0, ttmlTimeOffset, false
+	}
+
+	switch strings.Count(expr, ":") {
+	case 1, 2:
+		if clockMs, ok := parseTTMLClockTime(expr); ok {
+			return clockMs, ttmlTimeAbsolute, true
+		}
+		return 0, ttmlTimeAbsolute, false
+	case 3:
+		if framesMs, ok := parseTTMLFrameTime(expr, params); ok {
+			return framesMs, ttmlTimeAbsolute, true
+		}
+		return 0, ttmlTimeAbsolute, false
+	default:
+		log.Warn("Unsupported TTML time expression", "value", expr)
+		return 0, ttmlTimeOffset, false
+	}
+}
+
+// parseTTMLClockTime parses a clock time of the form "mm:ss[.fraction]" or "hh:mm:ss[.fraction]" and
+// returns it in milliseconds. It reports false for malformed input, including negative or
+// non-finite components (strconv.ParseFloat accepts "nan" and "inf").
+func parseTTMLClockTime(value string) (int64, bool) {
+	parts := strings.Split(value, ":")
+	if len(parts) != 2 && len(parts) != 3 {
+		return 0, false
+	}
+
+	hours, minutesIdx := int64(0), 0
+	if len(parts) == 3 {
+		h, err := strconv.ParseInt(parts[0], 10, 64)
+		if err != nil || h < 0 {
+			return 0, false
+		}
+		hours, minutesIdx = h, 1
+	}
+
+	minutes, err := strconv.ParseInt(parts[minutesIdx], 10, 64)
+	if err != nil || minutes < 0 {
+		return 0, false
+	}
+
+	seconds, err := strconv.ParseFloat(parts[minutesIdx+1], 64)
+	if err != nil || seconds < 0 || math.IsNaN(seconds) || math.IsInf(seconds, 0) {
+		return 0, false
+	}
+	totalSeconds := float64(hours*60*60+minutes*60) + seconds
+	return int64(math.Round(totalSeconds * 1000)), true
+}
+
+// parseTTMLFrameTime parses a frame-based time of the form "hh:mm:ss:ff[.sub]" and returns it in
+// milliseconds, converting frames and sub-frames with params.frameRate and params.subFrameRate.
+// It reports false for malformed input, including negative or non-finite fields (ParseFloat accepts "nan"/"inf").
+func parseTTMLFrameTime(value string, params ttmlTimingParams) (int64, bool) {
+	parts := strings.Split(value, ":")
+	if len(parts) != 4 {
+		return 0, false
+	}
+
+	hours, err := strconv.ParseInt(parts[0], 10, 64)
+	if err != nil || hours < 0 {
+		return 0, false
+	}
+	minutes, err := strconv.ParseInt(parts[1], 10, 64)
+	if err != nil || minutes < 0 {
+		return 0, false
+	}
+	seconds, err := strconv.ParseInt(parts[2], 10, 64)
+	if err != nil || seconds < 0 {
+		return 0, false
+	}
+
+	frameParts := strings.SplitN(parts[3], ".", 2)
+	frames, err := strconv.ParseFloat(frameParts[0], 64)
+	if err != nil || frames < 0 || math.IsNaN(frames) || math.IsInf(frames, 0) {
+		return 0, false
+	}
+
+	subFrames := 0.0
+	if len(frameParts) == 2 {
+		subFrames, err = strconv.ParseFloat(frameParts[1], 64)
+		if err != nil || subFrames < 0 || math.IsNaN(subFrames) || math.IsInf(subFrames, 0) {
+			return 0, false
+		}
+	}
+	// NOTE(review): assumes params.frameRate and params.subFrameRate are positive — confirm at the caller.
+	totalSeconds := float64(hours*60*60 + minutes*60 + seconds)
+	totalSeconds += frames / params.frameRate
+	totalSeconds += subFrames / (params.subFrameRate * params.frameRate)
+	return int64(math.Round(totalSeconds * 1000)), true
+}
+
+func attrValue(attrs []xml.Attr, key string) (string, bool) { // trimmed value of the first attribute whose local name equals key (case-insensitive, namespace ignored)
+	for _, attr := range attrs {
+		if strings.EqualFold(attr.Name.Local, key) {
+			return strings.TrimSpace(attr.Value), true
+		}
+	}
+	return "", false
+}
+
+func normalizeTTMLLang(lang string) string { // lower-cases and trims lang; empty input becomes the placeholder "xxx"
+	lang = strings.ToLower(strings.TrimSpace(lang))
+	if lang == "" {
+		return "xxx"
+	}
+	return lang
+}
+
+// sanitizeTTMLText applies str.SanitizeText, converts CRLF/CR line endings to LF and trims each line as well as the whole result.
+func sanitizeTTMLText(raw string) string {
+	raw = str.SanitizeText(raw)
+	raw = strings.ReplaceAll(raw, "\r\n", "\n")
+	raw = strings.ReplaceAll(raw, "\r", "\n")
+	lines := strings.Split(raw, "\n")
+	for i := range lines {
+		lines[i] = strings.TrimSpace(lines[i])
+	}
+	return strings.TrimSpace(strings.Join(lines, "\n"))
+}
+
+func linesAreSynced(lines []model.Line) bool { // reports whether any line, or any token within a line, has a start time
+	for i := range lines {
+		if lines[i].Start != nil {
+			return true
+		}
+		for j := range lines[i].Token {
+			if lines[i].Token[j].Start != nil {
+				return true
+			}
+		}
+	}
+	return false
+}
+
+// hydrateLineTimingFromTokens fills a missing line.Start / line.End from the earliest token start and the
+// latest token end (a token without End contributes its Start). Timing already set on the line is kept.
+func hydrateLineTimingFromTokens(line model.Line) model.Line {
+	if len(line.Token) == 0 {
+		return line
+	}
+	var earliestStart *int64
+	var latestEnd *int64
+	for i := range line.Token {
+		token := line.Token[i]
+		if token.Start != nil {
+			if earliestStart == nil || *token.Start < *earliestStart {
+				v := *token.Start
+				earliestStart = &v
+			}
+		}
+
+		candidateEnd := token.End
+		if candidateEnd == nil { // no explicit end: the token's start still bounds the line
+			candidateEnd = token.Start
+		}
+		if candidateEnd != nil {
+			if latestEnd == nil || *candidateEnd > *latestEnd {
+				v := *candidateEnd
+				latestEnd = &v
+			}
+		}
+	}
+	if line.Start == nil && earliestStart != nil {
+		v := *earliestStart
+		line.Start = &v
+	}
+	if line.End == nil && latestEnd != nil {
+		v := *latestEnd
+		line.End = &v
+	}
+	return line
+}
+
+func max(v float64, fallback float64) float64 { // NOTE: not a maximum — returns fallback when v <= 0, otherwise v; shadows the Go 1.21 builtin max in this package
+	if v <= 0 {
+		return fallback
+	}
+	return v
+}
diff --git a/core/lyrics/ttml_test.go b/core/lyrics/ttml_test.go
new file mode 100644
index 000000000..12270c27d
--- /dev/null
+++ b/core/lyrics/ttml_test.go
@@ -0,0 +1,398 @@
+package lyrics
+
+import (
+	"testing"
+
+	"github.com/navidrome/navidrome/model"
+)
+
+// TestParseTTML_MultiLanguageAndTiming checks that each xml:lang div becomes its own track and that offset, frame and tick times resolve using inherited begins and ttp parameters.
+func TestParseTTML_MultiLanguageAndTiming(t *testing.T) {
+	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttp="http://www.w3.org/ns/ttml#parameter" ttp:frameRate="30" ttp:subFrameRate="2" ttp:tickRate="10">
+  <body>
+    <div xml:lang="eng" begin="1s">
+      <p begin="2s">Line one</p>
+      <p begin="00:00:04:15.1"><span>Line two</span><br/>with break</p>
+    </div>
+    <div xml:lang="por">
+      <p begin="45t">Linha</p>
+    </div>
+  </body>
+</tt>`)
+	list, err := parseTTML(content)
+	if err != nil {
+		t.Fatalf("parseTTML returned error: %v", err)
+	}
+	if len(list) != 2 {
+		t.Fatalf("expected 2 lyric tracks, got %d", len(list))
+	}
+
+	eng := list[0]
+	if eng.Lang != "eng" {
+		t.Fatalf("expected first track language 'eng', got %q", eng.Lang)
+	}
+	if !eng.Synced {
+		t.Fatal("expected first track to be synced")
+	}
+	assertTimedLine(t, eng.Line[0], 3000, "Line one")
+	assertTimedLine(t, eng.Line[1], 4517, "Line two\nwith break")
+
+	por := list[1]
+	if por.Lang != "por" {
+		t.Fatalf("expected second track language 'por', got %q", por.Lang)
+	}
+	assertTimedLine(t, por.Line[0], 4500, "Linha")
+}
+
+// TestParseTTML_UnsupportedCueSkipped checks that a cue with a wallclock(...) begin is dropped while the remaining cue is kept.
+func TestParseTTML_UnsupportedCueSkipped(t *testing.T) {
+	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml">
+  <body xml:lang="eng">
+    <div>
+      <p begin="wallclock(2026-01-01T00:00:00Z)">Skip me</p>
+      <p begin="1s">Keep me</p>
+    </div>
+  </body>
+</tt>`)
+	list, err := parseTTML(content)
+	if err != nil {
+		t.Fatalf("parseTTML returned error: %v", err)
+	}
+	if len(list) != 1 {
+		t.Fatalf("expected 1 lyric track, got %d", len(list))
+	}
+	if len(list[0].Line) != 1 {
+		t.Fatalf("expected 1 line in lyric track, got %d", len(list[0].Line))
+	}
+	assertTimedLine(t, list[0].Line[0], 1000, "Keep me")
+}
+
+// TestParseTTML_BeginEndDurWithInheritance checks that begin offsets accumulate across body, div and p (10s + 5s + 1s = 16s).
+func TestParseTTML_BeginEndDurWithInheritance(t *testing.T) {
+	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml">
+  <body xml:lang="eng" begin="10s">
+    <div begin="5s" dur="8s">
+      <p begin="1s" dur="2s">First line</p>
+      <p begin="3s" end="5s">Second line</p>
+    </div>
+  </body>
+</tt>`)
+	list, err := parseTTML(content)
+	if err != nil {
+		t.Fatalf("parseTTML returned error: %v", err)
+	}
+	if len(list) != 1 {
+		t.Fatalf("expected 1 lyric track, got %d", len(list))
+	}
+	if list[0].Lang != "eng" {
+		t.Fatalf("expected language 'eng', got %q", list[0].Lang)
+	}
+	if len(list[0].Line) != 2 {
+		t.Fatalf("expected 2 lines, got %d", len(list[0].Line))
+	}
+	assertTimedLine(t, list[0].Line[0], 16000, "First line")
+	assertTimedLine(t, list[0].Line[1], 18000, "Second line")
+}
+
+// TestParseTTML_NonStandardBareSecondOffsets checks that bare decimals are read as seconds and here resolve relative to the body begin (10 + 0.170 = 10.170s).
+func TestParseTTML_NonStandardBareSecondOffsets(t *testing.T) {
+	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml">
+  <body xml:lang="eng" begin="10">
+    <div>
+      <p begin="0.170">First line</p>
+      <p begin="3.710">Second line</p>
+    </div>
+  </body>
+</tt>`)
+	list, err := parseTTML(content)
+	if err != nil {
+		t.Fatalf("parseTTML returned error: %v", err)
+	}
+	if len(list) != 1 {
+		t.Fatalf("expected 1 lyric track, got %d", len(list))
+	}
+	if len(list[0].Line) != 2 {
+		t.Fatalf("expected 2 lines, got %d", len(list[0].Line))
+	}
+	assertTimedLine(t, list[0].Line[0], 10170, "First line")
+	assertTimedLine(t, list[0].Line[1], 13710, "Second line")
+}
+
+// TestParseTTML_WordTimingTokens checks that timed spans become tokens, that a span nested under ttm:role="x-bg" carries that role, and that the line keeps its own begin/end.
+func TestParseTTML_WordTimingTokens(t *testing.T) {
+	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttm="http://www.w3.org/ns/ttml#metadata">
+  <body xml:lang="eng">
+    <div>
+      <p begin="00:01.000" end="00:03.000">
+        <span begin="00:01.000" end="00:01.400">He</span><span begin="00:01.400" end="00:01.800">llo</span>
+        <span ttm:role="x-bg"><span begin="00:02.000" end="00:02.500">echo</span></span>
+      </p>
+    </div>
+  </body>
+</tt>`)
+	list, err := parseTTML(content)
+	if err != nil {
+		t.Fatalf("parseTTML returned error: %v", err)
+	}
+	if len(list) != 1 {
+		t.Fatalf("expected 1 lyric track, got %d", len(list))
+	}
+	if len(list[0].Line) != 1 {
+		t.Fatalf("expected 1 line, got %d", len(list[0].Line))
+	}
+
+	line := list[0].Line[0]
+	assertTimedLine(t, line, 1000, "Hello\necho")
+	if line.End == nil || *line.End != 3000 {
+		t.Fatalf("expected line end 3000, got %v", line.End)
+	}
+	if len(line.Token) != 3 {
+		t.Fatalf("expected 3 timed tokens, got %d", len(line.Token))
+	}
+
+	assertToken(t, line.Token[0], 1000, 1400, "He", "")
+	assertToken(t, line.Token[1], 1400, 1800, "llo", "")
+	assertToken(t, line.Token[2], 2000, 2500, "echo", "x-bg")
+}
+
+// TestParseTTML_AmbiguousDecimalTimingPrefersAbsoluteWhenInsideParentWindow checks that bare decimals lying inside the parent begin/end window stay absolute (43.444s → 43444ms).
+func TestParseTTML_AmbiguousDecimalTimingPrefersAbsoluteWhenInsideParentWindow(t *testing.T) {
+	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml">
+  <body xml:lang="eng">
+    <div begin="37.870" end="45.570">
+      <p begin="43.444" end="45.570">
+        <span begin="43.444" end="43.716">go</span>
+        <span begin="43.716" end="43.887">go</span>
+      </p>
+    </div>
+  </body>
+</tt>`)
+	list, err := parseTTML(content)
+	if err != nil {
+		t.Fatalf("parseTTML returned error: %v", err)
+	}
+	if len(list) != 1 || len(list[0].Line) != 1 {
+		t.Fatalf("expected one parsed lyric line, got %#v", list)
+	}
+
+	line := list[0].Line[0]
+	assertTimedLine(t, line, 43444, "go\ngo")
+	if line.End == nil || *line.End != 45570 {
+		t.Fatalf("expected line end 45570, got %v", line.End)
+	}
+	if len(line.Token) != 2 {
+		t.Fatalf("expected 2 timed tokens, got %d", len(line.Token))
+	}
+	assertToken(t, line.Token[0], 43444, 43716, "go", "")
+	assertToken(t, line.Token[1], 43716, 43887, "go", "")
+}
+
+// TestParseTTML_UnsyncedFallback checks that cues without timing yield an unsynced track with a nil line start and the default language "xxx".
+func TestParseTTML_UnsyncedFallback(t *testing.T) {
+	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml">
+  <body>
+    <div>
+      <p>No timing here</p>
+    </div>
+  </body>
+</tt>`)
+	list, err := parseTTML(content)
+	if err != nil {
+		t.Fatalf("parseTTML returned error: %v", err)
+	}
+	if len(list) != 1 {
+		t.Fatalf("expected 1 lyric track, got %d", len(list))
+	}
+	if list[0].Lang != "xxx" {
+		t.Fatalf("expected default language 'xxx', got %q", list[0].Lang)
+	}
+	if list[0].Synced {
+		t.Fatal("expected lyric track to be unsynced")
+	}
+	if len(list[0].Line) != 1 {
+		t.Fatalf("expected 1 line, got %d", len(list[0].Line))
+	}
+	if list[0].Line[0].Start != nil {
+		t.Fatalf("expected line start to be nil, got %v", *list[0].Line[0].Start)
+	}
+	if list[0].Line[0].Value != "No timing here" {
+		t.Fatalf("expected line value %q, got %q", "No timing here", list[0].Line[0].Value)
+	}
+}
+
+// TestParseTTML_MetadataTracksByKey checks that iTunes metadata translations/transliterations become separate tracks linked to the body <p> with the matching itunes:key, and that unknown keys are skipped.
+func TestParseTTML_MetadataTracksByKey(t *testing.T) {
+	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:itunes="http://music.apple.com/lyric-ttml-internal">
+  <head>
+    <metadata>
+      <iTunesMetadata xmlns="http://music.apple.com/lyric-ttml-internal">
+        <translations>
+          <translation xml:lang="es">
+            <text for="L1">Hola</text>
+            <text for="MISSING">Skip me</text>
+          </translation>
+        </translations>
+        <transliterations>
+          <transliteration xml:lang="ja-Latn">
+            <text for="L2"><span begin="00:02.000" end="00:02.300" xmlns="http://www.w3.org/ns/ttml">ko</span><span begin="00:02.300" end="00:02.600" xmlns="http://www.w3.org/ns/ttml">nni</span></text>
+          </transliteration>
+        </transliterations>
+      </iTunesMetadata>
+    </metadata>
+  </head>
+  <body xml:lang="ja">
+    <div>
+      <p begin="00:01.000" end="00:01.500" itunes:key="L1">こんにちは</p>
+      <p begin="00:02.000" end="00:02.700" itunes:key="L2">こんばんは</p>
+    </div>
+  </body>
+</tt>`)
+	list, err := parseTTML(content)
+	if err != nil {
+		t.Fatalf("parseTTML returned error: %v", err)
+	}
+	if len(list) != 3 {
+		t.Fatalf("expected 3 lyric tracks, got %d", len(list))
+	}
+
+	main := list[0]
+	if main.Kind != "main" {
+		t.Fatalf("expected main track kind %q, got %q", "main", main.Kind)
+	}
+	if main.Lang != "ja" {
+		t.Fatalf("expected main track language %q, got %q", "ja", main.Lang)
+	}
+	if len(main.Line) != 2 {
+		t.Fatalf("expected 2 lines in main track, got %d", len(main.Line))
+	}
+
+	translation := list[1]
+	if translation.Kind != "translation" {
+		t.Fatalf("expected translation kind %q, got %q", "translation", translation.Kind)
+	}
+	if translation.Lang != "es" {
+		t.Fatalf("expected translation language %q, got %q", "es", translation.Lang)
+	}
+	if len(translation.Line) != 1 {
+		t.Fatalf("expected 1 translation line, got %d", len(translation.Line))
+	}
+	assertTimedLine(t, translation.Line[0], 1000, "Hola")
+	if translation.Line[0].End == nil || *translation.Line[0].End != 1500 {
+		t.Fatalf("expected translation line end %d, got %v", 1500, translation.Line[0].End)
+	}
+
+	pronunciation := list[2]
+	if pronunciation.Kind != "pronunciation" {
+		t.Fatalf("expected pronunciation kind %q, got %q", "pronunciation", pronunciation.Kind)
+	}
+	if pronunciation.Lang != "ja-latn" {
+		t.Fatalf("expected pronunciation language %q, got %q", "ja-latn", pronunciation.Lang)
+	}
+	if len(pronunciation.Line) != 1 {
+		t.Fatalf("expected 1 pronunciation line, got %d", len(pronunciation.Line))
+	}
+	assertTimedLine(t, pronunciation.Line[0], 2000, "konni")
+	if pronunciation.Line[0].End == nil || *pronunciation.Line[0].End != 2600 {
+		t.Fatalf("expected pronunciation line end %d, got %v", 2600, pronunciation.Line[0].End)
+	}
+	if len(pronunciation.Line[0].Token) != 2 {
+		t.Fatalf("expected 2 pronunciation tokens, got %d", len(pronunciation.Line[0].Token))
+	}
+	assertToken(t, pronunciation.Line[0].Token[0], 2000, 2300, "ko", "")
+	assertToken(t, pronunciation.Line[0].Token[1], 2300, 2600, "nni", "")
+}
+
+// TestParseTTML_PronunciationBareDecimalEndTimes checks that bare decimal span times in a transliteration stay absolute (2.747s → 2747ms) rather than being offset by the linked line's begin.
+func TestParseTTML_PronunciationBareDecimalEndTimes(t *testing.T) {
+	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:itunes="http://music.apple.com/lyric-ttml-internal">
+  <head>
+    <metadata>
+      <iTunesMetadata xmlns="http://music.apple.com/lyric-ttml-internal">
+        <transliterations>
+          <transliteration xml:lang="ja-Latn">
+            <text for="L1"><span begin="2.747" end="3.018" xmlns="http://www.w3.org/ns/ttml">I</span> <span begin="3.018" end="3.179" xmlns="http://www.w3.org/ns/ttml">woke</span> <span begin="3.179" end="3.582" xmlns="http://www.w3.org/ns/ttml">up</span></text>
+          </transliteration>
+        </transliterations>
+      </iTunesMetadata>
+    </metadata>
+  </head>
+  <body xml:lang="ja">
+    <div>
+      <p begin="00:02.747" end="00:04.000" itunes:key="L1">起きた</p>
+    </div>
+  </body>
+</tt>`)
+	list, err := parseTTML(content)
+	if err != nil {
+		t.Fatalf("parseTTML returned error: %v", err)
+	}
+
+	var pronunciation *model.Lyrics
+	for i := range list {
+		if list[i].Kind == "pronunciation" {
+			pronunciation = &list[i]
+			break
+		}
+	}
+	if pronunciation == nil {
+		t.Fatal("expected a pronunciation track")
+	}
+	if len(pronunciation.Line) != 1 {
+		t.Fatalf("expected 1 pronunciation line, got %d", len(pronunciation.Line))
+	}
+
+	line := pronunciation.Line[0]
+	assertTimedLine(t, line, 2747, "I woke up")
+	if len(line.Token) != 3 {
+		t.Fatalf("expected 3 tokens, got %d", len(line.Token))
+	}
+	assertToken(t, line.Token[0], 2747, 3018, "I", "")
+	assertToken(t, line.Token[1], 3018, 3179, "woke", "")
+	assertToken(t, line.Token[2], 3179, 3582, "up", "")
+}
+
+// assertTimedLine fails the test unless line has a non-nil start equal to expectedStart and the expected text value.
+func assertTimedLine(t *testing.T, line model.Line, expectedStart int64, expectedValue string) {
+	t.Helper()
+	if line.Start == nil {
+		t.Fatal("expected line start to be set, got nil")
+	}
+	if *line.Start != expectedStart {
+		t.Fatalf("expected line start %d, got %d", expectedStart, *line.Start)
+	}
+	if line.Value != expectedValue {
+		t.Fatalf("expected line value %q, got %q", expectedValue, line.Value)
+	}
+}
+
+// assertToken fails the test unless token has non-nil start and end equal to the expected values and the expected text and role.
+func assertToken(t *testing.T, token model.Token, expectedStart int64, expectedEnd int64, expectedValue string, expectedRole string) {
+	t.Helper()
+	if token.Start == nil {
+		t.Fatal("expected token start to be set, got nil")
+	}
+	if *token.Start != expectedStart {
+		t.Fatalf("expected token start %d, got %d", expectedStart, *token.Start)
+	}
+	if token.End == nil {
+		t.Fatal("expected token end to be set, got nil")
+	}
+	if *token.End != expectedEnd {
+		t.Fatalf("expected token end %d, got %d", expectedEnd, *token.End)
+	}
+	if token.Value != expectedValue {
+		t.Fatalf("expected token value %q, got %q", expectedValue, token.Value)
+	}
+	if token.Role != expectedRole {
+		t.Fatalf("expected token role %q, got %q", expectedRole, token.Role)
+	}
+}
diff --git a/model/lyrics.go b/model/lyrics.go
index f75f3b11b..220eec7b5 100644
--- a/model/lyrics.go
+++ b/model/lyrics.go
@@ -11,14 +11,24 @@ import (
 	"github.com/navidrome/navidrome/utils/str"
 )
 
-type Line struct {
+type Token struct {
 	Start *int64 `structs:"start,omitempty" json:"start,omitempty"`
+	End   *int64 `structs:"end,omitempty"   json:"end,omitempty"`
 	Value string `structs:"value"           json:"value"`
+	Role  string `structs:"role,omitempty"  json:"role,omitempty"`
+}
+
+type Line struct {
+	Start *int64  `structs:"start,omitempty" json:"start,omitempty"`
+	End   *int64  `structs:"end,omitempty"   json:"end,omitempty"`
+	Value string  `structs:"value"           json:"value"`
+	Token []Token `structs:"token,omitempty" json:"token,omitempty"`
 }
 
 type Lyrics struct {
 	DisplayArtist string `structs:"displayArtist,omitempty" json:"displayArtist,omitempty"`
 	DisplayTitle  string `structs:"displayTitle,omitempty"  json:"displayTitle,omitempty"`
+	Kind          string `structs:"kind,omitempty"          json:"kind,omitempty"`
 	Lang          string `structs:"lang"                    json:"lang"`
 	Line          []Line `structs:"line"                    json:"line"`
 	Offset        *int64 `structs:"offset,omitempty"        json:"offset,omitempty"`
diff --git a/server/subsonic/helpers.go b/server/subsonic/helpers.go
index 74d57ade4..3b9412fb1 100644
--- a/server/subsonic/helpers.go
+++ b/server/subsonic/helpers.go
@@ -495,19 +495,47 @@ func mapExplicitStatus(explicitStatus string) string {
 
 func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics) responses.StructuredLyric {
 	lines := make([]responses.Line, len(lyrics.Line))
+	tokenLines := make([]responses.TokenLine, 0, len(lyrics.Line))
 
 	for i, line := range lyrics.Line {
 		lines[i] = responses.Line{
 			Start: line.Start,
 			Value: line.Value,
 		}
+		if len(line.Token) == 0 {
+			continue
+		}
+
+		tokens := make([]responses.LyricToken, len(line.Token))
+		for j, token := range line.Token {
+			tokens[j] = responses.LyricToken{
+				Start: token.Start,
+				End:   token.End,
+				Value: token.Value,
+				Role:  token.Role,
+			}
+		}
+		tokenLines = append(tokenLines, responses.TokenLine{
+			Index: int32(i),
+			Start: line.Start,
+			End:   line.End,
+			Value: line.Value,
+			Token: tokens,
+		})
+	}
+
+	kind := strings.TrimSpace(lyrics.Kind)
+	if kind == "" {
+		kind = "main"
 	}
 
 	structured := responses.StructuredLyric{
 		DisplayArtist: lyrics.DisplayArtist,
 		DisplayTitle:  lyrics.DisplayTitle,
+		Kind:          kind,
 		Lang:          lyrics.Lang,
 		Line:          lines,
+		TokenLine:     tokenLines,
 		Offset:        lyrics.Offset,
 		Synced:        lyrics.Synced,
 	}
diff --git a/server/subsonic/media_retrieval.go b/server/subsonic/media_retrieval.go
index 3faae1650..963db067c 100644
--- a/server/subsonic/media_retrieval.go
+++ b/server/subsonic/media_retrieval.go
@@ -98,7 +98,9 @@ func (api *Router) GetLyrics(r *http.Request) (*responses.Subsonic, error) {
 	response := newResponse()
 	lyricsResponse := responses.Lyrics{}
 	response.Lyrics = &lyricsResponse
-	mediaFiles, err := api.ds.MediaFile(r.Context()).GetAll(filter.SongsByArtistTitleWithLyricsFirst(artist, title))
+	opts := filter.SongsByArtistTitleWithLyricsFirst(artist, title)
+	opts.Max = 0
+	mediaFiles, err := api.ds.MediaFile(r.Context()).GetAll(opts)
 
 	if err != nil {
 		return nil, err
@@ -108,25 +110,26 @@ func (api *Router) GetLyrics(r *http.Request) (*responses.Subsonic, error) {
 		return response, nil
 	}
 
-	structuredLyrics, err := api.lyrics.GetLyrics(r.Context(), &mediaFiles[0])
-	if err != nil {
-		return nil, err
+	for i := range mediaFiles {
+		structuredLyrics, err := api.lyrics.GetLyrics(r.Context(), &mediaFiles[i])
+		if err != nil {
+			return nil, err
+		}
+		if len(structuredLyrics) == 0 {
+			continue
+		}
+
+		lyricsResponse.Artist = artist
+		lyricsResponse.Title = title
+
+		var lyricsText strings.Builder
+		for _, line := range structuredLyrics[0].Line {
+			lyricsText.WriteString(line.Value + "\n")
+		}
+		lyricsResponse.Value = lyricsText.String()
+		break
 	}
 
-	if len(structuredLyrics) == 0 {
-		return response, nil
-	}
-
-	lyricsResponse.Artist = artist
-	lyricsResponse.Title = title
-
-	var lyricsText strings.Builder
-	for _, line := range structuredLyrics[0].Line {
-		lyricsText.WriteString(line.Value + "\n")
-	}
-
-	lyricsResponse.Value = lyricsText.String()
-
 	return response, nil
 }
 
diff --git a/server/subsonic/media_retrieval_test.go b/server/subsonic/media_retrieval_test.go
index 7f64fb47f..6c52d38bc 100644
--- a/server/subsonic/media_retrieval_test.go
+++ b/server/subsonic/media_retrieval_test.go
@@ -186,6 +186,36 @@ var _ = Describe("MediaRetrievalController", func() {
 			Expect(response.Lyrics.Title).To(Equal("Never Gonna Give You Up"))
 			Expect(response.Lyrics.Value).To(Equal("We're no strangers to love\nYou know the rules and so do I\n"))
 		})
+
+		It("should continue searching candidates for sidecar lyrics", func() {
+			conf.Server.LyricsPriority = ".ttml,embedded"
+			r := newGetRequest("artist=Rick+Astley", "title=Never+Gonna+Give+You+Up")
+			baseTime := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC)
+			mockRepo.SetData(model.MediaFiles{
+				{
+					ID:        "1",
+					Path:      "tests/fixtures/01 Invisible (RED) Edit Version.mp3",
+					Artist:    "Rick Astley",
+					Title:     "Never Gonna Give You Up",
+					Lyrics:    "[]",
+					UpdatedAt: baseTime.Add(2 * time.Hour), // Newer, but no TTML sidecar
+				},
+				{
+					ID:        "2",
+					Path:      "tests/fixtures/test.mp3",
+					Artist:    "Rick Astley",
+					Title:     "Never Gonna Give You Up",
+					Lyrics:    "[]",
+					UpdatedAt: baseTime.Add(1 * time.Hour), // Older, but has TTML sidecar
+				},
+			})
+
+			response, err := router.GetLyrics(r)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(response.Lyrics.Artist).To(Equal("Rick Astley"))
+			Expect(response.Lyrics.Title).To(Equal("Never Gonna Give You Up"))
+			Expect(response.Lyrics.Value).To(Equal("We're no strangers to love\nYou know the rules and so do I\n"))
+		})
 	})
 
 	Describe("GetLyricsBySongId", func() {
@@ -202,6 +232,11 @@ var _ = Describe("MediaRetrievalController", func() {
 
 				Expect(realLyric.DisplayArtist).To(Equal(expectedLyric.DisplayArtist))
 				Expect(realLyric.DisplayTitle).To(Equal(expectedLyric.DisplayTitle))
+				expectedKind := expectedLyric.Kind
+				if expectedKind == "" {
+					expectedKind = "main"
+				}
+				Expect(realLyric.Kind).To(Equal(expectedKind))
 				Expect(realLyric.Lang).To(Equal(expectedLyric.Lang))
 				Expect(realLyric.Synced).To(Equal(expectedLyric.Synced))
 
@@ -222,6 +257,40 @@ var _ = Describe("MediaRetrievalController", func() {
 						Expect(*realLine.Start).To(Equal(*expectedLine.Start))
 					}
 				}
+
+				Expect(realLyric.TokenLine).To(HaveLen(len(expectedLyric.TokenLine)))
+				for j, realTokenLine := range realLyric.TokenLine {
+					expectedTokenLine := expectedLyric.TokenLine[j]
+					Expect(realTokenLine.Index).To(Equal(expectedTokenLine.Index))
+					Expect(realTokenLine.Value).To(Equal(expectedTokenLine.Value))
+					if expectedTokenLine.Start == nil {
+						Expect(realTokenLine.Start).To(BeNil())
+					} else {
+						Expect(*realTokenLine.Start).To(Equal(*expectedTokenLine.Start))
+					}
+					if expectedTokenLine.End == nil {
+						Expect(realTokenLine.End).To(BeNil())
+					} else {
+						Expect(*realTokenLine.End).To(Equal(*expectedTokenLine.End))
+					}
+
+					Expect(realTokenLine.Token).To(HaveLen(len(expectedTokenLine.Token)))
+					for k, realToken := range realTokenLine.Token {
+						expectedToken := expectedTokenLine.Token[k]
+						Expect(realToken.Value).To(Equal(expectedToken.Value))
+						Expect(realToken.Role).To(Equal(expectedToken.Role))
+						if expectedToken.Start == nil {
+							Expect(realToken.Start).To(BeNil())
+						} else {
+							Expect(*realToken.Start).To(Equal(*expectedToken.Start))
+						}
+						if expectedToken.End == nil {
+							Expect(realToken.End).To(BeNil())
+						} else {
+							Expect(*realToken.End).To(Equal(*expectedToken.End))
+						}
+					}
+				}
 			}
 		}
 
@@ -323,6 +392,238 @@ var _ = Describe("MediaRetrievalController", func() {
 				},
 			})
 		})
+
+		It("should return multilingual TTML sidecar lyrics", func() {
+			conf.Server.LyricsPriority = ".ttml,embedded"
+			r := newGetRequest("id=1")
+
+			mockRepo.SetData(model.MediaFiles{
+				{
+					ID:     "1",
+					Path:   "tests/fixtures/test.mp3",
+					Artist: "Rick Astley",
+					Title:  "Never Gonna Give You Up",
+					Lyrics: "[]",
+				},
+			})
+
+			response, err := router.GetLyricsBySongId(r)
+			Expect(err).ToNot(HaveOccurred())
+
+			porTime := int64(18800)
+			ttmlTime := int64(22800)
+			compareResponses(response.LyricsList, responses.LyricsList{
+				StructuredLyrics: responses.StructuredLyrics{
+					{
+						DisplayArtist: "Rick Astley",
+						DisplayTitle:  "Never Gonna Give You Up",
+						Lang:          "eng",
+						Synced:        true,
+						Line: []responses.Line{
+							{
+								Start: &times[0],
+								Value: "We're no strangers to love",
+							},
+							{
+								Start: &ttmlTime,
+								Value: "You know the rules and so do I",
+							},
+						},
+					},
+					{
+						DisplayArtist: "Rick Astley",
+						DisplayTitle:  "Never Gonna Give You Up",
+						Lang:          "por",
+						Synced:        true,
+						Line: []responses.Line{
+							{
+								Start: &porTime,
+								Value: "Nao somos estranhos ao amor",
+							},
+						},
+					},
+				},
+			})
+		})
+
+		It("should return metadata-linked translation and pronunciation tracks from TTML", func() {
+			conf.Server.LyricsPriority = ".ttml,embedded"
+			r := newGetRequest("id=1")
+
+			mockRepo.SetData(model.MediaFiles{
+				{
+					ID:     "1",
+					Path:   "tests/fixtures/test-metadata.mp3",
+					Artist: "Rick Astley",
+					Title:  "Never Gonna Give You Up",
+					Lyrics: "[]",
+				},
+			})
+
+			response, err := router.GetLyricsBySongId(r)
+			Expect(err).ToNot(HaveOccurred())
+
+			mainStartA := int64(1000)
+			mainStartB := int64(2000)
+			tokenStartA := int64(2000)
+			tokenEndA := int64(2300)
+			tokenStartB := int64(2300)
+			tokenEndB := int64(2600)
+			compareResponses(response.LyricsList, responses.LyricsList{
+				StructuredLyrics: responses.StructuredLyrics{
+					{
+						DisplayArtist: "Rick Astley",
+						DisplayTitle:  "Never Gonna Give You Up",
+						Kind:          "main",
+						Lang:          "ja",
+						Synced:        true,
+						Line: []responses.Line{
+							{
+								Start: &mainStartA,
+								Value: "こんにちは",
+							},
+							{
+								Start: &mainStartB,
+								Value: "こんばんは",
+							},
+						},
+					},
+					{
+						DisplayArtist: "Rick Astley",
+						DisplayTitle:  "Never Gonna Give You Up",
+						Kind:          "translation",
+						Lang:          "es",
+						Synced:        true,
+						Line: []responses.Line{
+							{
+								Start: &mainStartA,
+								Value: "Hola",
+							},
+						},
+					},
+					{
+						DisplayArtist: "Rick Astley",
+						DisplayTitle:  "Never Gonna Give You Up",
+						Kind:          "pronunciation",
+						Lang:          "ja-latn",
+						Synced:        true,
+						Line: []responses.Line{
+							{
+								Start: &mainStartB,
+								Value: "konni",
+							},
+						},
+						TokenLine: []responses.TokenLine{
+							{
+								Index: 0,
+								Start: &mainStartB,
+								End:   &tokenEndB,
+								Value: "konni",
+								Token: []responses.LyricToken{
+									{
+										Start: &tokenStartA,
+										End:   &tokenEndA,
+										Value: "ko",
+									},
+									{
+										Start: &tokenStartB,
+										End:   &tokenEndB,
+										Value: "nni",
+									},
+								},
+							},
+						},
+					},
+				},
+			})
+		})
+
+		It("should return tokenized lines for songLyrics v2 clients", func() {
+			r := newGetRequest("id=1")
+
+			lineStart := int64(1000)
+			lineEnd := int64(3000)
+			tokenStartA := int64(1000)
+			tokenEndA := int64(1400)
+			tokenStartB := int64(2000)
+			tokenEndB := int64(2500)
+			lyricsJson, err := json.Marshal(model.LyricList{
+				{
+					Lang:   "eng",
+					Synced: true,
+					Line: []model.Line{
+						{
+							Start: &lineStart,
+							End:   &lineEnd,
+							Value: "Hello echo",
+							Token: []model.Token{
+								{
+									Start: &tokenStartA,
+									End:   &tokenEndA,
+									Value: "Hello",
+								},
+								{
+									Start: &tokenStartB,
+									End:   &tokenEndB,
+									Value: "echo",
+									Role:  "x-bg",
+								},
+							},
+						},
+					},
+				},
+			})
+			Expect(err).ToNot(HaveOccurred())
+
+			mockRepo.SetData(model.MediaFiles{
+				{
+					ID:     "1",
+					Artist: "Rick Astley",
+					Title:  "Never Gonna Give You Up",
+					Lyrics: string(lyricsJson),
+				},
+			})
+
+			response, err := router.GetLyricsBySongId(r)
+			Expect(err).ToNot(HaveOccurred())
+			compareResponses(response.LyricsList, responses.LyricsList{
+				StructuredLyrics: responses.StructuredLyrics{
+					{
+						DisplayArtist: "Rick Astley",
+						DisplayTitle:  "Never Gonna Give You Up",
+						Lang:          "eng",
+						Synced:        true,
+						Line: []responses.Line{
+							{
+								Start: &lineStart,
+								Value: "Hello echo",
+							},
+						},
+						TokenLine: []responses.TokenLine{
+							{
+								Index: 0,
+								Start: &lineStart,
+								End:   &lineEnd,
+								Value: "Hello echo",
+								Token: []responses.LyricToken{
+									{
+										Start: &tokenStartA,
+										End:   &tokenEndA,
+										Value: "Hello",
+									},
+									{
+										Start: &tokenStartB,
+										End:   &tokenEndB,
+										Value: "echo",
+										Role:  "x-bg",
+									},
+								},
+							},
+						},
+					},
+				},
+			})
+		})
 	})
 })
 
diff --git a/server/subsonic/opensubsonic.go b/server/subsonic/opensubsonic.go
index 353cf1077..f0917baa2 100644
--- a/server/subsonic/opensubsonic.go
+++ b/server/subsonic/opensubsonic.go
@@ -11,7 +11,7 @@ func (api *Router) GetOpenSubsonicExtensions(_ *http.Request) (*responses.Subson
 	response.OpenSubsonicExtensions = &responses.OpenSubsonicExtensions{
 		{Name: "transcodeOffset", Versions: []int32{1}},
 		{Name: "formPost", Versions: []int32{1}},
-		{Name: "songLyrics", Versions: []int32{1}},
+		{Name: "songLyrics", Versions: []int32{1, 2}},
 		{Name: "indexBasedQueue", Versions: []int32{1}},
 		{Name: "transcoding", Versions: []int32{1}},
 	}
diff --git a/server/subsonic/opensubsonic_test.go b/server/subsonic/opensubsonic_test.go
index 92d1c3e84..068030ec8 100644
--- a/server/subsonic/opensubsonic_test.go
+++ b/server/subsonic/opensubsonic_test.go
@@ -38,7 +38,7 @@ var _ = Describe("GetOpenSubsonicExtensions", func() {
 			HaveLen(5),
 			ContainElement(responses.OpenSubsonicExtension{Name: "transcodeOffset", Versions: []int32{1}}),
 			ContainElement(responses.OpenSubsonicExtension{Name: "formPost", Versions: []int32{1}}),
-			ContainElement(responses.OpenSubsonicExtension{Name: "songLyrics", Versions: []int32{1}}),
+			ContainElement(responses.OpenSubsonicExtension{Name: "songLyrics", Versions: []int32{1, 2}}),
 			ContainElement(responses.OpenSubsonicExtension{Name: "indexBasedQueue", Versions: []int32{1}}),
 			ContainElement(responses.OpenSubsonicExtension{Name: "transcoding", Versions: []int32{1}}),
 		))
diff --git a/server/subsonic/responses/responses.go b/server/subsonic/responses/responses.go
index f0bb26f66..ff5ae0d3b 100644
--- a/server/subsonic/responses/responses.go
+++ b/server/subsonic/responses/responses.go
@@ -537,13 +537,30 @@ type Line struct {
 	Value string `xml:",chardata"            json:"value"`
 }
 
+type LyricToken struct { // one timed piece of a lyric line (per-word timing, ms)
+	Start *int64 `xml:"start,attr,omitempty" json:"start,omitempty"` // token start; omitted when nil
+	End   *int64 `xml:"end,attr,omitempty"   json:"end,omitempty"`   // token end; omitted when nil
+	Value string `xml:"value,attr"           json:"value"`           // token text; always emitted
+	Role  string `xml:"role,attr,omitempty"  json:"role,omitempty"`  // optional role tag, e.g. "x-bg"
+}
+
+type TokenLine struct { // one lyric line together with its timed tokens
+	Index int32        `xml:"index,attr"                    json:"index"` // always emitted, even when 0 (no omitempty)
+	Start *int64       `xml:"start,attr,omitempty"         json:"start,omitempty"`
+	End   *int64       `xml:"end,attr,omitempty"           json:"end,omitempty"`
+	Value string       `xml:"value,attr,omitempty"         json:"value,omitempty"`
+	Token []LyricToken `xml:"token,omitempty"        json:"token,omitempty"` // tokens of this line; omitted when empty
+}
+
 type StructuredLyric struct {
-	DisplayArtist string `xml:"displayArtist,attr,omitempty" json:"displayArtist,omitempty"`
-	DisplayTitle  string `xml:"displayTitle,attr,omitempty"  json:"displayTitle,omitempty"`
-	Lang          string `xml:"lang,attr"                    json:"lang"`
-	Line          []Line `xml:"line"                         json:"line"`
-	Offset        *int64 `xml:"offset,attr,omitempty"        json:"offset,omitempty"`
-	Synced        bool   `xml:"synced,attr"                  json:"synced"`
+	DisplayArtist string      `xml:"displayArtist,attr,omitempty" json:"displayArtist,omitempty"`
+	DisplayTitle  string      `xml:"displayTitle,attr,omitempty"  json:"displayTitle,omitempty"`
+	Kind          string      `xml:"kind,attr,omitempty"          json:"kind,omitempty"` // main, translation or pronunciation; omitted when empty
+	Lang          string      `xml:"lang,attr"                    json:"lang"`
+	Line          []Line      `xml:"line"                         json:"line"`
+	TokenLine     []TokenLine `xml:"tokenLine,omitempty"     json:"tokenLine,omitempty"` // word-level timing per line; omitted when empty
+	Offset        *int64      `xml:"offset,attr,omitempty"        json:"offset,omitempty"`
+	Synced        bool        `xml:"synced,attr"                  json:"synced"`
 }
 
 type StructuredLyrics []StructuredLyric
diff --git a/tests/fixtures/bom-test.ttml b/tests/fixtures/bom-test.ttml
new file mode 100644
index 000000000..319ab1f07
--- /dev/null
+++ b/tests/fixtures/bom-test.ttml
@@ -0,0 +1,2 @@
+﻿<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml"><body><div xml:lang="eng"><p begin="00:00:00.00">BOM test line</p></div></body></tt>
diff --git a/tests/fixtures/bom-utf16-test.ttml b/tests/fixtures/bom-utf16-test.ttml
new file mode 100644
index 0000000000000000000000000000000000000000..a5621ef5d54ddd1a6a748046809f0ac7cf81ead1
GIT binary patch
literal 414
zcmaKo;R=F45QOJ<PjUD^PYpt(^j}X<50E8SP^Xkwy?iyhiWEdFoX6h!&CEVuSfIci
zXPjWrp~3}M98tq#i2yM|MEn}Qc<k8U^VP%Y>jrDAFy+*oGX-)?$ZJ_<V0zMobI@*s
z43>4%3VF`Ruc_(Sm07EE;wB(%fl?J8dKcwxBxju2j!wj#8~$lHQ_`<fr=lLQvf+%8
zQZv<5Ir;?R-;gKCD&8c0MRkitmH!hHBm*&42fvvu)7BqMtDEeUZ@+T(JK!$gMtwl}

literal 0
HcmV?d00001

diff --git a/tests/fixtures/test-metadata.ttml b/tests/fixtures/test-metadata.ttml
new file mode 100644
index 000000000..c0243c18f
--- /dev/null
+++ b/tests/fixtures/test-metadata.ttml
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:itunes="http://music.apple.com/lyric-ttml-internal">
+  <head>
+    <metadata>
+      <iTunesMetadata xmlns="http://music.apple.com/lyric-ttml-internal">
+        <translations>
+          <translation xml:lang="es">
+            <text for="L1">Hola</text>
+          </translation>
+        </translations>
+        <transliterations>
+          <transliteration xml:lang="ja-Latn">
+            <text for="L2"><span begin="00:02.000" end="00:02.300" xmlns="http://www.w3.org/ns/ttml">ko</span><span begin="00:02.300" end="00:02.600" xmlns="http://www.w3.org/ns/ttml">nni</span></text>
+          </transliteration>
+        </transliterations>
+      </iTunesMetadata>
+    </metadata>
+  </head>
+  <body xml:lang="ja">
+    <div>
+      <p begin="00:01.000" end="00:01.500" itunes:key="L1">こんにちは</p>
+      <p begin="00:02.000" end="00:02.700" itunes:key="L2">こんばんは</p>
+    </div>
+  </body>
+</tt>
diff --git a/tests/fixtures/test.ttml b/tests/fixtures/test.ttml
new file mode 100644
index 000000000..a85673a1b
--- /dev/null
+++ b/tests/fixtures/test.ttml
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttp="http://www.w3.org/ns/ttml#parameter" ttp:frameRate="30" ttp:subFrameRate="2" ttp:tickRate="10">
+  <body>
+    <div xml:lang="eng">
+      <p begin="00:00:18.80">We're no strangers to love</p>
+      <p begin="00:00:22:24">You know the rules and so do I</p>
+    </div>
+    <div xml:lang="por">
+      <p begin="188t">Nao somos estranhos ao amor</p>
+    </div>
+  </body>
+</tt>
diff --git a/ui/src/actions/player.js b/ui/src/actions/player.js
index 9056abeb6..f55102207 100644
--- a/ui/src/actions/player.js
+++ b/ui/src/actions/player.js
@@ -9,6 +9,7 @@ export const PLAYER_SET_VOLUME = 'PLAYER_SET_VOLUME'
 export const PLAYER_SET_MODE = 'PLAYER_SET_MODE'
 export const TRANSCODING_SET_PROFILE = 'TRANSCODING_SET_PROFILE'
 export const PLAYER_REFRESH_QUEUE = 'PLAYER_REFRESH_QUEUE'
+export const PLAYER_UPDATE_LYRIC = 'PLAYER_UPDATE_LYRIC'
 
 export const setTrack = (data) => ({
   type: PLAYER_SET_TRACK,
@@ -114,3 +115,8 @@ export const refreshQueue = (resolvedUrls) => ({
   type: PLAYER_REFRESH_QUEUE,
   data: resolvedUrls,
 })
+
+export const updateQueueLyric = (trackId, lyric) => ({
+  type: PLAYER_UPDATE_LYRIC,
+  data: { trackId, lyric }, // payload pairs the track id with its lyric value
+})
diff --git a/ui/src/audioplayer/KaraokeLyricsOverlay.jsx b/ui/src/audioplayer/KaraokeLyricsOverlay.jsx
new file mode 100644
index 000000000..3814cbee6
--- /dev/null
+++ b/ui/src/audioplayer/KaraokeLyricsOverlay.jsx
@@ -0,0 +1,1228 @@
+import React, {
+  memo,
+  useCallback,
+  useEffect,
+  useMemo,
+  useRef,
+  useState,
+} from 'react'
+import clsx from 'clsx'
+import Button from '@material-ui/core/Button'
+import IconButton from '@material-ui/core/IconButton'
+import Popover from '@material-ui/core/Popover'
+import Slider from '@material-ui/core/Slider'
+import Typography from '@material-ui/core/Typography'
+import CloseIcon from '@material-ui/icons/Close'
+import TuneIcon from '@material-ui/icons/Tune'
+import { makeStyles } from '@material-ui/core/styles'
+import {
+  buildKaraokeLines,
+  getActiveKaraokeState,
+  hasStructuredLyricContent,
+  resolveLayerLineForMain,
+  resolveKaraokeTokenWindow,
+} from './lyrics'
+
+const KARAOKE_RENDER_LEAD_MS = 24 // added to the playback clock before rendering
+const KARAOKE_CLOCK_DRIFT_RESET_MS = 140 // max |observed - predicted| before re-anchoring
+const KARAOKE_CLOCK_RESET_THRESHOLD_MS = 320 // backward step beyond which the anchor is reset
+const KARAOKE_MONOTONIC_JITTER_MS = 60 // backward steps up to this are held while not interpolating
+const KARAOKE_RENDER_UPDATE_EPSILON_MS = 6 // smaller clock changes do not update state
+const KARAOKE_WORD_SETTLE_MS = 96 // settle window right after a token ends
+const KARAOKE_ANIMATION_MS = 150 // duration used by the CSS transitions in useStyles
+const KARAOKE_DEFAULT_HEIGHT_PX = 300 // initial overlay height
+const KARAOKE_MIN_HEIGHT_PX = 150 // lower bound for the overlay height
+const KARAOKE_MAX_HEIGHT_RATIO = 0.72 // share of window.innerHeight used as height cap
+const KARAOKE_MAX_HEIGHT_PX = 760 // absolute height cap
+const KARAOKE_CENTER_SPACER_RATIO = 0.5
+const KARAOKE_CENTER_SPACER_MIN_PX = 132
+
+const TOKEN_DONE_ALPHA = 1
+const TOKEN_FUTURE_ALPHA = 0.34 // alpha of tokens that have not started
+const TOKEN_ACTIVE_ALPHA = 1
+const TOKEN_WIPE_EDGE_PCT = 8 // gradient stop this far behind the fill position
+const TOKEN_WIPE_GLOW_PCT = 16 // gradient stop this far ahead of the fill position
+
+const COLOR_PRESETS = [
+  { key: 'white', label: 'White', value: 'rgba(255, 255, 255, 0.92)' }, // also getColorValue's fallback
+  { key: 'blue', label: 'Blue', value: 'rgba(120, 160, 220, 0.75)' },
+  { key: 'green', label: 'Green', value: 'rgba(100, 200, 130, 0.7)' },
+  { key: 'pink', label: 'Pink', value: 'rgba(240, 140, 170, 0.75)' },
+  { key: 'purple', label: 'Purple', value: 'rgba(180, 140, 240, 0.75)' },
+  { key: 'orange', label: 'Orange', value: 'rgba(240, 180, 100, 0.75)' },
+  { key: 'cyan', label: 'Cyan', value: 'rgba(100, 210, 220, 0.75)' },
+  { key: 'yellow', label: 'Yellow', value: 'rgba(240, 230, 110, 0.75)' },
+]
+
+const DEFAULT_LYRICS_SETTINGS = {
+  tr: { fontSize: 14, colorKey: 'blue' }, // "Translation" section
+  main: { fontSize: 24, colorKey: 'white' }, // "Default" section
+  pr: { fontSize: 14, colorKey: 'green' }, // "Pronunciation" section
+}
+
+const SETTINGS_STORAGE_KEY = 'karaoke-lyrics-settings' // localStorage key
+
+const loadLyricsSettings = () => {
+  // Stored per-layer values override the defaults; any failure yields defaults.
+  try {
+    const stored = localStorage.getItem(SETTINGS_STORAGE_KEY)
+    if (stored) {
+      const saved = JSON.parse(stored)
+      const tr = { ...DEFAULT_LYRICS_SETTINGS.tr, ...saved.tr }
+      const main = { ...DEFAULT_LYRICS_SETTINGS.main, ...saved.main }
+      const pr = { ...DEFAULT_LYRICS_SETTINGS.pr, ...saved.pr }
+      return { tr, main, pr }
+    }
+  } catch {
+    /* ignore */
+  }
+  return { ...DEFAULT_LYRICS_SETTINGS }
+}
+
+const saveLyricsSettings = (settings) => {
+  try {
+    localStorage.setItem(SETTINGS_STORAGE_KEY, JSON.stringify(settings))
+  } catch {
+    /* best-effort persistence: any failure here is deliberately swallowed */
+  }
+}
+
+const getColorValue = (colorKey) => // unknown keys fall back to the first preset
+  COLOR_PRESETS.find((c) => c.key === colorKey)?.value || COLOR_PRESETS[0].value
+
+const useStyles = makeStyles((theme) => ({
+  overlay: {
+    position: 'fixed',
+    left: '50%',
+    bottom: 100,
+    transform: 'translateX(-50%)',
+    zIndex: 1400,
+    width: 'min(900px, calc(100vw - 32px))',
+    minHeight: KARAOKE_MIN_HEIGHT_PX,
+    background: 'rgba(6, 8, 12, 0.9)',
+    borderRadius: 12,
+    border: '1px solid rgba(255, 255, 255, 0.12)',
+    boxShadow: '0 18px 48px rgba(0, 0, 0, 0.42)',
+    backdropFilter: 'blur(10px)',
+    color: theme.palette.common.white,
+    display: 'flex',
+    flexDirection: 'column',
+    overflow: 'hidden',
+    '@media (max-width:810px)': {
+      bottom: 78,
+      width: 'calc(100vw - 12px)',
+      borderRadius: 8,
+      minHeight: 180,
+      maxHeight: '65vh',
+    },
+  },
+  resizeHandle: {
+    height: 14,
+    cursor: 'ns-resize',
+    flexShrink: 0,
+    position: 'relative',
+    '&::after': {
+      content: '""',
+      position: 'absolute',
+      left: '50%',
+      top: 4,
+      transform: 'translateX(-50%)',
+      width: 56,
+      height: 3,
+      borderRadius: 999,
+      background: 'rgba(255, 255, 255, 0.22)',
+    },
+    '@media (max-width:810px)': {
+      display: 'none',
+    },
+  },
+  header: {
+    display: 'flex',
+    alignItems: 'center',
+    justifyContent: 'space-between',
+    gap: theme.spacing(1),
+    padding: theme.spacing(0.3, 1.3, 0.4, 1.3),
+  },
+  headerLeft: {
+    display: 'flex',
+    alignItems: 'center',
+    gap: theme.spacing(1),
+    minWidth: 0,
+  },
+  language: {
+    fontSize: 11,
+    letterSpacing: '0.08em',
+    opacity: 0.72,
+    textTransform: 'uppercase',
+    whiteSpace: 'nowrap',
+  },
+  layerControls: {
+    display: 'flex',
+    alignItems: 'center',
+    gap: theme.spacing(0.5),
+  },
+  layerToggle: {
+    minWidth: 34,
+    minHeight: 24,
+    padding: theme.spacing(0, 0.8),
+    fontSize: 10,
+    letterSpacing: '0.08em',
+    borderRadius: 999,
+    color: 'rgba(203, 213, 225, 0.95)',
+    background: 'rgba(100, 116, 139, 0.26)',
+    border: '1px solid rgba(148, 163, 184, 0.45)',
+    transition: `all ${KARAOKE_ANIMATION_MS}ms ease-in-out`,
+    '&.Mui-disabled': {
+      color: 'rgba(148, 163, 184, 0.45)',
+      borderColor: 'rgba(100, 116, 139, 0.3)',
+      background: 'rgba(71, 85, 105, 0.2)',
+    },
+  },
+  layerToggleActive: {
+    color: 'rgba(220, 252, 231, 0.98)',
+    borderColor: 'rgba(34, 197, 94, 0.96)',
+    background: 'rgba(34, 197, 94, 0.28)',
+  },
+  closeButton: {
+    color: 'rgba(255, 255, 255, 0.72)',
+  },
+  inlineTr: {
+    margin: '0 0 2px 0',
+    textAlign: 'center',
+    fontWeight: 400,
+    lineHeight: 1.2,
+    letterSpacing: '0.01em',
+    transition: `opacity ${KARAOKE_ANIMATION_MS}ms ease-in-out, font-size ${KARAOKE_ANIMATION_MS}ms ease-in-out`,
+  },
+  inlinePr: {
+    margin: '2px 0 0 0',
+    textAlign: 'center',
+    fontWeight: 400,
+    lineHeight: 1.2,
+    letterSpacing: '0.01em',
+    transition: `opacity ${KARAOKE_ANIMATION_MS}ms ease-in-out, font-size ${KARAOKE_ANIMATION_MS}ms ease-in-out`,
+  },
+  body: {
+    padding: theme.spacing(0.5, 2, 1.4, 2),
+    overflowY: 'auto',
+    overflowX: 'hidden',
+    scrollBehavior: 'smooth',
+    flex: 1,
+    overscrollBehavior: 'contain',
+    scrollbarWidth: 'none',
+    msOverflowStyle: 'none',
+    '&::-webkit-scrollbar': {
+      display: 'none',
+      width: 0,
+      height: 0,
+    },
+    '@media (max-width:810px)': {
+      padding: theme.spacing(0.35, 1.2, 1.2, 1.2),
+    },
+  },
+  lines: {
+    display: 'flex',
+    flexDirection: 'column',
+    gap: theme.spacing(1.24),
+    paddingBottom: theme.spacing(1),
+  },
+  line: {
+    margin: 0,
+    fontWeight: 600,
+    lineHeight: 1.24,
+    letterSpacing: '0.01em',
+    textAlign: 'center',
+    color: 'rgba(255, 255, 255, 0.62)',
+    transition: `opacity ${KARAOKE_ANIMATION_MS}ms ease-in-out, color ${KARAOKE_ANIMATION_MS}ms ease-in-out, font-size 280ms ease-in-out`,
+  },
+  token: {
+    display: 'inline-block',
+    whiteSpace: 'pre-wrap',
+    transition: `color ${KARAOKE_ANIMATION_MS}ms ease-in-out, text-shadow ${KARAOKE_ANIMATION_MS}ms ease-in-out`,
+  },
+  settingsButton: {
+    color: 'rgba(255, 255, 255, 0.55)',
+    padding: 4,
+    '&:hover': {
+      color: 'rgba(255, 255, 255, 0.85)',
+    },
+  },
+  settingsPanel: {
+    background: 'rgba(12, 14, 20, 0.96)',
+    border: '1px solid rgba(255, 255, 255, 0.12)',
+    borderRadius: 10,
+    padding: theme.spacing(1.5, 2),
+    width: 260,
+    backdropFilter: 'blur(12px)',
+  },
+  settingsSection: {
+    marginBottom: theme.spacing(1.2),
+    '&:last-child': {
+      marginBottom: 0,
+    },
+  },
+  settingsLabel: {
+    fontSize: 10,
+    fontWeight: 600,
+    letterSpacing: '0.1em',
+    textTransform: 'uppercase',
+    color: 'rgba(255, 255, 255, 0.55)',
+    marginBottom: 4,
+  },
+  settingsRow: {
+    display: 'flex',
+    alignItems: 'center',
+    gap: theme.spacing(1),
+  },
+  settingsSlider: {
+    flex: 1,
+    color: 'rgba(255, 255, 255, 0.6)',
+    '& .MuiSlider-thumb': {
+      width: 12,
+      height: 12,
+    },
+    '& .MuiSlider-rail': {
+      opacity: 0.3,
+    },
+  },
+  settingsSliderValue: {
+    fontSize: 11,
+    color: 'rgba(255, 255, 255, 0.5)',
+    minWidth: 22,
+    textAlign: 'right',
+  },
+  colorDots: {
+    display: 'flex',
+    gap: 5,
+    marginTop: 4,
+  },
+  colorDot: {
+    width: 16,
+    height: 16,
+    borderRadius: '50%',
+    border: '2px solid transparent',
+    cursor: 'pointer',
+    transition: 'border-color 120ms ease, transform 120ms ease',
+    '&:hover': {
+      transform: 'scale(1.2)',
+    },
+  },
+  colorDotActive: {
+    borderColor: 'rgba(255, 255, 255, 0.85)',
+  },
+}))
+
+const clamp = (v, min, max) => Math.max(min, Math.min(max, v)) // min wins if min > max
+const lerp = (from, to, t) => from + (to - from) * t // t is not clamped here
+
+const normalizeForComparison = (text) => // drops whitespace and punctuation, lowercases
+  (text || '').replace(/[\s\p{P}]/gu, '').toLowerCase()
+
+const shouldShowAuxLine = (mainLine, auxLine) => {
+  // Show the aux line only if its normalized text differs from the main
+  // line's; a missing mainLine counts as empty text instead of throwing.
+  if (!auxLine?.value) return false
+  const auxText = normalizeForComparison(auxLine.value)
+  return auxText !== normalizeForComparison(mainLine?.value)
+}
+
+const SettingsSection = ({ label, layer, settings, onChange, classes }) => {
+  // Font-size slider and colour swatches for one lyric layer of `settings`.
+  const current = settings[layer]
+  // Every change is reported as a full settings object with this layer patched.
+  const patchLayer = (partial) =>
+    onChange({ ...settings, [layer]: { ...current, ...partial } })
+  const swatches = COLOR_PRESETS.map(({ key, label: title, value }) => (
+    <div
+      key={key}
+      className={clsx(classes.colorDot, {
+        [classes.colorDotActive]: current.colorKey === key,
+      })}
+      style={{ background: value }}
+      title={title}
+      onClick={() => patchLayer({ colorKey: key })}
+    />
+  ))
+
+  return (
+    <div className={classes.settingsSection}>
+      <div className={classes.settingsLabel}>{label}</div>
+      <div className={classes.settingsRow}>
+        <Slider
+          className={classes.settingsSlider}
+          min={8}
+          max={40}
+          step={1}
+          value={current.fontSize}
+          onChange={(_, size) => patchLayer({ fontSize: size })}
+        />
+        <span className={classes.settingsSliderValue}>{current.fontSize}</span>
+      </div>
+      <div className={classes.colorDots}>{swatches}</div>
+    </div>
+  )
+}
+
+const LyricsSettingsPopover = ({ settings, onChange }) => {
+  // Tune button that toggles a popover with one SettingsSection per lyric
+  // layer (tr, main, pr). `settings` and `onChange` are handed to every
+  // section unchanged.
+  const classes = useStyles()
+  const [anchorEl, setAnchorEl] = useState(null)
+
+  const handleToggle = useCallback((e) => {
+    e.stopPropagation()
+    // Capture the element now: event.currentTarget is only valid while the
+    // handler runs, and React may invoke the updater below after that.
+    const target = e.currentTarget
+    setAnchorEl((prev) => (prev ? null : target))
+  }, [])
+
+  const handleClose = useCallback(() => setAnchorEl(null), [])
+
+  return (
+    <>
+      <IconButton
+        className={classes.settingsButton}
+        size="small"
+        onClick={handleToggle}
+        aria-label="Lyrics settings"
+      >
+        <TuneIcon style={{ fontSize: 18 }} />
+      </IconButton>
+      {/* zIndex 1500 places the popover above the overlay (zIndex 1400). */}
+      <Popover
+        open={Boolean(anchorEl)}
+        anchorEl={anchorEl}
+        onClose={handleClose}
+        anchorOrigin={{ vertical: 'top', horizontal: 'center' }}
+        transformOrigin={{ vertical: 'bottom', horizontal: 'center' }}
+        PaperProps={{ className: classes.settingsPanel }}
+        style={{ zIndex: 1500 }}
+      >
+        {[
+          ['Translation', 'tr'],
+          ['Default', 'main'],
+          ['Pronunciation', 'pr'],
+        ].map(([label, layer]) => (
+          <SettingsSection
+            key={layer}
+            label={label}
+            layer={layer}
+            settings={settings}
+            onChange={onChange}
+            classes={classes}
+          />
+        ))}
+      </Popover>
+    </>
+  )
+}
+
+const easeInOut = (v) => {
+  // Quadratic ease-in-out evaluated on v clamped to [0, 1].
+  const t = clamp(v, 0, 1)
+  if (t < 0.5) return 2 * t * t
+  return 1 - Math.pow(-2 * t + 2, 2) / 2
+}
+
+const getMaxHeightPx = () => {
+  // Height cap for the overlay: a share of the window height, never more
+  // than KARAOKE_MAX_HEIGHT_PX (returned as-is when there is no window).
+  const hasWindow = typeof window !== 'undefined'
+  if (!hasWindow) return KARAOKE_MAX_HEIGHT_PX
+  const viewportShare = window.innerHeight * KARAOKE_MAX_HEIGHT_RATIO
+  const scaled = Math.floor(viewportShare)
+  return Math.min(scaled, KARAOKE_MAX_HEIGHT_PX)
+}
+
+const buildSegmentsFromLine = (line) => {
+  if (!line || !Array.isArray(line.tokens) || line.tokens.length === 0) {
+    return [{ text: line?.value || '', token: null, tokenIndex: -1 }]
+  }
+  // Two layouts are built: one aligned to line.value, one from tokens alone.
+  const text = line.value || ''
+  const matchedSegments = []
+  const fallbackSegments = []
+  let cursor = 0
+  let allMatched = text.length > 0
+  let anyMatched = false
+  // Fallback only: add one space between two alphanumeric neighbours.
+  const pushFallbackSeparatorIfNeeded = (nextTokenText) => {
+    if (fallbackSegments.length === 0) {
+      return
+    }
+    const prevText = fallbackSegments[fallbackSegments.length - 1].text || ''
+    if (!prevText || !nextTokenText) {
+      return
+    }
+    if (/\s$/.test(prevText) || /^\s/.test(nextTokenText)) {
+      return
+    }
+    if (/[A-Za-z0-9]$/.test(prevText) && /^[A-Za-z0-9]/.test(nextTokenText)) {
+      fallbackSegments.push({ text: ' ', token: null, tokenIndex: -1 })
+    }
+  }
+
+  for (let tokenIndex = 0; tokenIndex < line.tokens.length; tokenIndex += 1) {
+    const token = line.tokens[tokenIndex]
+    const tokenText = token.value || ''
+    if (!tokenText) {
+      continue
+    }
+
+    pushFallbackSeparatorIfNeeded(tokenText)
+    fallbackSegments.push({ text: tokenText, token, tokenIndex })
+
+    if (!text) {
+      allMatched = false
+      continue
+    }
+    // Exact search from the cursor first, then a case-insensitive one.
+    const foundAt = text.indexOf(tokenText, cursor)
+    const normalizedFoundAt =
+      foundAt >= 0
+        ? foundAt
+        : text.toLowerCase().indexOf(tokenText.toLowerCase(), cursor)
+
+    if (normalizedFoundAt >= 0) {
+      anyMatched = true
+      if (normalizedFoundAt > cursor) {
+        matchedSegments.push({
+          text: text.slice(cursor, normalizedFoundAt),
+          token: null,
+          tokenIndex: -1,
+        })
+      }
+      const matchedTokenText = text.slice(
+        normalizedFoundAt,
+        normalizedFoundAt + tokenText.length,
+      )
+      matchedSegments.push({
+        text: matchedTokenText || tokenText,
+        token,
+        tokenIndex,
+      })
+      cursor = normalizedFoundAt + tokenText.length
+    } else {
+      allMatched = false
+    }
+  }
+  // Use the text-aligned layout only if every non-empty token was located.
+  if (allMatched && anyMatched) {
+    if (cursor < text.length) {
+      matchedSegments.push({
+        text: text.slice(cursor),
+        token: null,
+        tokenIndex: -1,
+      })
+    }
+    return matchedSegments
+  }
+
+  if (fallbackSegments.length > 0) {
+    return fallbackSegments
+  }
+
+  return [{ text, token: null, tokenIndex: -1 }]
+}
+
+const getLineRenderWindow = (line, nextLineStart) => {
+  // Time span in which this line can look different from frame to frame:
+  // the line's own start/end (end falling back to the next line's start),
+  // widened so it also covers the first and last token windows.
+  const finiteOrNull = (raw) => {
+    const num = Number(raw)
+    return Number.isFinite(num) ? num : null
+  }
+
+  let start = finiteOrNull(line?.start)
+  let end = finiteOrNull(line?.end)
+  if (end == null) {
+    end = finiteOrNull(nextLineStart)
+  }
+
+  const tokenCount = Array.isArray(line?.tokens) ? line.tokens.length : 0
+  if (tokenCount === 0) {
+    return { start, end }
+  }
+
+  // Only the first and last token windows are consulted.
+  const head = resolveKaraokeTokenWindow(line, 0, nextLineStart)
+  const tail = resolveKaraokeTokenWindow(line, tokenCount - 1, nextLineStart)
+
+  if (head.start != null && (start == null || head.start < start)) {
+    start = head.start
+  }
+  if (tail.end != null && (end == null || tail.end > end)) {
+    end = tail.end
+  }
+
+  return { start, end }
+}
+
+const shouldSkipLineFrame = (
+  prevPlaybackMs,
+  nextPlaybackMs,
+  line,
+  nextLineStart,
+) => {
+  // A frame is skippable when the clock did not move, or when both clock
+  // samples lie before the line's window (220 ms lead-in) or both lie after
+  // it (settle time plus 160 ms).
+  if (prevPlaybackMs === nextPlaybackMs) return true
+
+  const { start, end } = getLineRenderWindow(line, nextLineStart)
+  const bothBelow = (limit) =>
+    prevPlaybackMs < limit && nextPlaybackMs < limit
+  const bothAbove = (limit) =>
+    prevPlaybackMs > limit && nextPlaybackMs > limit
+
+  if (start != null && bothBelow(start - 220)) {
+    return true
+  }
+
+  const settleEnd = end != null ? end + KARAOKE_WORD_SETTLE_MS + 160 : null
+  if (settleEnd != null && bothAbove(settleEnd)) {
+    return true
+  }
+
+  return false
+}
+
+const areLineStylesEqual = (prevStyle, nextStyle) => {
+  // Strict equality on opacity, color, fontSize and fontWeight only; a
+  // missing style object is compared as if it were empty.
+  const comparedKeys = ['opacity', 'color', 'fontSize', 'fontWeight']
+  const before = prevStyle || {}
+  const after = nextStyle || {}
+  return comparedKeys.every(
+    (styleKey) => before[styleKey] === after[styleKey],
+  )
+}
+
+const parseColorRGB = (rgba) => {
+  const channels = (rgba || '').match(/rgba?\((\d+),\s*(\d+),\s*(\d+)/)
+  return channels ? channels.slice(1, 4).map(Number) : [255, 255, 255] // white
+}
+
+const buildTokenWipeStyle = ({
+  fillProgress,
+  highlightAlpha,
+  futureAlpha,
+  rgb,
+}) => {
+  // Flat dimmed colour until the wipe starts, then a clipped text gradient.
+  const [r, g, b] = rgb || [255, 255, 255]
+  const rgbaOf = (alpha) => `rgba(${r}, ${g}, ${b}, ${alpha})`
+  const litAlpha = (raw) => clamp(raw, TOKEN_DONE_ALPHA, TOKEN_ACTIVE_ALPHA)
+  const fillPct = clamp(fillProgress, 0, 1) * 100
+  const futureColor = rgbaOf(futureAlpha)
+  if (fillPct <= 0) {
+    return { color: futureColor, textShadow: 'none' }
+  }
+  const doneColor = rgbaOf(litAlpha(highlightAlpha))
+  const glowColor = rgbaOf(litAlpha(highlightAlpha + 0.18))
+  const edgeStart = clamp(fillPct - TOKEN_WIPE_EDGE_PCT, 0, 100)
+  const glowStop = clamp(fillPct + TOKEN_WIPE_GLOW_PCT, 0, 100)
+  return {
+    color: 'transparent',
+    WebkitTextFillColor: 'transparent',
+    backgroundImage: `linear-gradient(90deg, ${doneColor} 0%, ${doneColor} ${edgeStart}%, ${glowColor} ${fillPct}%, ${futureColor} ${glowStop}%, ${futureColor} 100%)`,
+    backgroundClip: 'text',
+    WebkitBackgroundClip: 'text',
+    textShadow: `0 0 8px ${rgbaOf(0.34)}`,
+  }
+}
+
+const KaraokeLineRow = memo(
+  ({
+    line,
+    nextLineStart,
+    renderPlaybackMs,
+    className,
+    style,
+    tokenClassName,
+    highlightTokens = true,
+  }) => {
+    // Renders one lyric line; timed tokens get a wipe style from the clock.
+    // Segments depend only on `line`, so they are cached across frames.
+    const segments = useMemo(() => buildSegmentsFromLine(line), [line])
+    const tokenRGB = useMemo(
+      () => (style?.color ? parseColorRGB(style.color) : [255, 255, 255]),
+      [style?.color],
+    )
+
+    return (
+      <Typography className={className} component="div" style={style}>
+        {segments.map((segment, idx) => {
+          if (!segment.token) {
+            return <span key={`text-${idx}`}>{segment.text}</span>
+          }
+          if (!highlightTokens) {
+            return <span key={`token-plain-${idx}`}>{segment.text}</span>
+          }
+          const { start: tokenStart, end: tokenEnd } =
+            resolveKaraokeTokenWindow(line, segment.tokenIndex, nextLineStart)
+
+          const isDone = tokenEnd != null ? renderPlaybackMs >= tokenEnd : false
+          const isActive =
+            !isDone && tokenStart != null && renderPlaybackMs >= tokenStart
+
+          const progress =
+            isDone ||
+            tokenStart == null ||
+            tokenEnd == null ||
+            tokenEnd <= tokenStart
+              ? isDone
+                ? 1
+                : 0
+              : clamp(
+                  (renderPlaybackMs - tokenStart) / (tokenEnd - tokenStart),
+                  0,
+                  1,
+                )
+
+          const justEnded =
+            tokenEnd != null &&
+            renderPlaybackMs > tokenEnd &&
+            renderPlaybackMs <= tokenEnd + KARAOKE_WORD_SETTLE_MS
+
+          const settleProgress =
+            justEnded && tokenEnd != null
+              ? clamp(
+                  (renderPlaybackMs - tokenEnd) / KARAOKE_WORD_SETTLE_MS,
+                  0,
+                  1,
+                )
+              : 0
+
+          let alpha = TOKEN_FUTURE_ALPHA
+          if (isDone) {
+            alpha = TOKEN_DONE_ALPHA
+          } else if (isActive) {
+            alpha = lerp(
+              TOKEN_FUTURE_ALPHA,
+              TOKEN_ACTIVE_ALPHA,
+              easeInOut(progress),
+            )
+          }
+          if (justEnded) {
+            alpha = lerp(
+              TOKEN_ACTIVE_ALPHA,
+              TOKEN_DONE_ALPHA,
+              easeInOut(settleProgress),
+            )
+          }
+          alpha = clamp(alpha, TOKEN_FUTURE_ALPHA, TOKEN_ACTIVE_ALPHA)
+          const fillProgress = isDone ? 1 : isActive ? progress : 0
+
+          return (
+            <span
+              key={`token-${idx}-${tokenStart ?? 'na'}`}
+              className={tokenClassName}
+              style={buildTokenWipeStyle({
+                fillProgress,
+                highlightAlpha: alpha,
+                futureAlpha: TOKEN_FUTURE_ALPHA,
+                rgb: tokenRGB,
+              })}
+            >
+              {segment.text}
+            </span>
+          )
+        })}
+      </Typography>
+    )
+  },
+  (prevProps, nextProps) => {
+    if (
+      prevProps.line !== nextProps.line ||
+      prevProps.nextLineStart !== nextProps.nextLineStart ||
+      prevProps.className !== nextProps.className ||
+      prevProps.tokenClassName !== nextProps.tokenClassName ||
+      prevProps.highlightTokens !== nextProps.highlightTokens ||
+      !areLineStylesEqual(prevProps.style, nextProps.style)
+    ) {
+      return false
+    }
+    // Other props are unchanged: let the clock movement decide the re-render.
+    return shouldSkipLineFrame(
+      prevProps.renderPlaybackMs,
+      nextProps.renderPlaybackMs,
+      nextProps.line,
+      nextProps.nextLineStart,
+    )
+  },
+)
+
+KaraokeLineRow.displayName = 'KaraokeLineRow'
+
+const KaraokeLyricsOverlay = ({
+  visible,
+  mainLyric,
+  translationLyric,
+  pronunciationLyric,
+  showTranslation,
+  showPronunciation,
+  translationEnabled,
+  pronunciationEnabled,
+  onToggleTranslation,
+  onTogglePronunciation,
+  audioInstance,
+  onClose,
+}) => {
+  const classes = useStyles()
+  const [playbackMs, setPlaybackMs] = useState(0)
+  const [overlayHeight, setOverlayHeight] = useState(KARAOKE_DEFAULT_HEIGHT_PX)
+  const [maxHeightPx, setMaxHeightPx] = useState(getMaxHeightPx())
+  const [bodyViewportHeight, setBodyViewportHeight] = useState(0)
+  const [isCompact, setIsCompact] = useState(
+    typeof window !== 'undefined' ? window.innerWidth <= 810 : false,
+  )
+  const [lyricsSettings, setLyricsSettings] = useState(loadLyricsSettings)
+
+  const handleSettingsChange = useCallback((next) => {
+    setLyricsSettings(next)
+    saveLyricsSettings(next)
+  }, [])
+
+  const bodyRef = useRef(null)
+  const activeLineRef = useRef(null)
+
+  const mainLines = useMemo(() => buildKaraokeLines(mainLyric), [mainLyric])
+  const translationLines = useMemo(
+    () => buildKaraokeLines(translationLyric),
+    [translationLyric],
+  )
+  const pronunciationLines = useMemo(
+    () => buildKaraokeLines(pronunciationLyric),
+    [pronunciationLyric],
+  )
+
+  useEffect(() => {
+    const onResize = () => {
+      const nextMaxHeight = getMaxHeightPx()
+      setIsCompact(window.innerWidth <= 810)
+      setMaxHeightPx(nextMaxHeight)
+      setOverlayHeight((previous) =>
+        clamp(previous, KARAOKE_MIN_HEIGHT_PX, nextMaxHeight),
+      )
+    }
+
+    onResize()
+    window.addEventListener('resize', onResize)
+    return () => window.removeEventListener('resize', onResize)
+  }, [])
+
+  useEffect(() => {
+    const body = bodyRef.current
+    if (!body) {
+      return undefined
+    }
+
+    const updateViewportHeight = () => {
+      setBodyViewportHeight(body.clientHeight || 0)
+    }
+
+    updateViewportHeight()
+
+    if (typeof ResizeObserver !== 'undefined') {
+      const observer = new ResizeObserver(updateViewportHeight)
+      observer.observe(body)
+      return () => observer.disconnect()
+    }
+
+    window.addEventListener('resize', updateViewportHeight)
+    return () => window.removeEventListener('resize', updateViewportHeight)
+  }, [overlayHeight, isCompact, showTranslation, showPronunciation, visible])
+
+  const onResizeStart = useCallback(
+    (event) => {
+      if (isCompact) {
+        return
+      }
+
+      event.preventDefault()
+      const startY = event.clientY
+      const startHeight = overlayHeight
+
+      const onMove = (moveEvent) => {
+        const delta = startY - moveEvent.clientY
+        setOverlayHeight(
+          clamp(startHeight + delta, KARAOKE_MIN_HEIGHT_PX, maxHeightPx),
+        )
+      }
+
+      const onUp = () => {
+        window.removeEventListener('mousemove', onMove)
+        window.removeEventListener('mouseup', onUp)
+      }
+
+      window.addEventListener('mousemove', onMove)
+      window.addEventListener('mouseup', onUp)
+    },
+    [isCompact, maxHeightPx, overlayHeight],
+  )
+
+  useEffect(() => {
+    if (!visible || !audioInstance) {
+      setPlaybackMs(0)
+      return
+    }
+
+    let rafId = 0
+    let cancelled = false
+    let anchorAudioMs = 0
+    let anchorPerfMs = 0
+    let lastRenderMs = 0
+
+    const readPlaybackMs = () => {
+      const seconds = Number(audioInstance.currentTime)
+      if (!Number.isFinite(seconds) || seconds < 0) {
+        return 0
+      }
+      return seconds * 1000
+    }
+
+    const resetAnchor = (perfNow, observedMs) => {
+      anchorAudioMs = observedMs
+      anchorPerfMs = perfNow
+    }
+
+    const tick = () => {
+      if (cancelled) {
+        return
+      }
+      // One animation frame: derive the render clock, then schedule the next.
+      const observedMs = readPlaybackMs()
+      const perfNow = performance.now()
+      const playbackRate = Number(audioInstance.playbackRate)
+      const canInterpolate =
+        !audioInstance.paused &&
+        !audioInstance.seeking &&
+        Number.isFinite(playbackRate) &&
+        playbackRate > 0
+
+      let nowMs = observedMs
+      // While playing, extrapolate from the anchor; re-anchor on large drift.
+      if (!canInterpolate) {
+        resetAnchor(perfNow, observedMs)
+      } else if (anchorPerfMs === 0) {
+        resetAnchor(perfNow, observedMs)
+      } else {
+        const predicted =
+          anchorAudioMs + (perfNow - anchorPerfMs) * playbackRate
+        const drift = observedMs - predicted
+        if (Math.abs(drift) > KARAOKE_CLOCK_DRIFT_RESET_MS) {
+          nowMs = observedMs
+          resetAnchor(perfNow, observedMs)
+        } else {
+          nowMs = predicted
+        }
+      }
+
+      const backwardsDrift = lastRenderMs - nowMs
+      if (backwardsDrift > KARAOKE_CLOCK_RESET_THRESHOLD_MS) {
+        // A jump this far back is a real seek, not jitter: follow the audio
+        // clock. Clamping here would freeze the lyrics at the old position
+        // until playback caught up with it again.
+        nowMs = observedMs
+        resetAnchor(perfNow, observedMs)
+      } else if (
+        backwardsDrift > 0 &&
+        (canInterpolate || backwardsDrift <= KARAOKE_MONOTONIC_JITTER_MS)
+      ) {
+        // Small regressions are held at the last rendered time.
+        nowMs = lastRenderMs
+      }
+
+      nowMs = Math.max(0, nowMs)
+      lastRenderMs = nowMs
+
+      setPlaybackMs((prev) =>
+        Math.abs(prev - nowMs) >= KARAOKE_RENDER_UPDATE_EPSILON_MS
+          ? nowMs
+          : prev,
+      )
+      rafId = window.requestAnimationFrame(tick)
+    }
+
+    const initialMs = readPlaybackMs()
+    resetAnchor(performance.now(), initialMs)
+    lastRenderMs = initialMs
+    setPlaybackMs(initialMs)
+    rafId = window.requestAnimationFrame(tick)
+
+    return () => {
+      cancelled = true
+      if (rafId) {
+        window.cancelAnimationFrame(rafId)
+      }
+    }
+  }, [audioInstance, visible])
+
+  const renderPlaybackMs = playbackMs + KARAOKE_RENDER_LEAD_MS
+
+  const { lineIndex } = useMemo(
+    () => getActiveKaraokeState(mainLines, renderPlaybackMs),
+    [mainLines, renderPlaybackMs],
+  )
+
+  const activeIndex = lineIndex >= 0 ? lineIndex : 0
+
+  const trByMainIndex = useMemo(() => {
+    if (!showTranslation || translationLines.length === 0) return {}
+    const map = {}
+    for (let i = 0; i < mainLines.length; i++) {
+      const { line } = resolveLayerLineForMain(mainLines, translationLines, i)
+      if (line) map[i] = line
+    }
+    return map
+  }, [mainLines, translationLines, showTranslation])
+
+  const prByMainIndex = useMemo(() => {
+    if (!showPronunciation || pronunciationLines.length === 0) return {}
+    const map = {}
+    for (let i = 0; i < mainLines.length; i++) {
+      const { line } = resolveLayerLineForMain(mainLines, pronunciationLines, i)
+      if (line) map[i] = line
+    }
+    return map
+  }, [mainLines, pronunciationLines, showPronunciation])
+
+  const hasTranslationLine = showTranslation && translationLines.length > 0
+  const hasPronunciationLine =
+    showPronunciation && pronunciationLines.length > 0
+  const measuredViewportHeight = bodyRef.current?.clientHeight || 0
+  const estimatedViewportHeight =
+    measuredViewportHeight > 0
+      ? measuredViewportHeight
+      : bodyViewportHeight > 0
+        ? bodyViewportHeight
+        : isCompact
+          ? 260
+          : Math.max(220, overlayHeight - 170)
+  const centerSpacerPx = Math.max(
+    KARAOKE_CENTER_SPACER_MIN_PX,
+    Math.floor(estimatedViewportHeight * KARAOKE_CENTER_SPACER_RATIO),
+  )
+
+  useEffect(() => {
+    if (!visible) {
+      return
+    }
+
+    const rafId = window.requestAnimationFrame(() => {
+      const body = bodyRef.current
+      const activeNode = activeLineRef.current
+      if (!body || !activeNode) {
+        return
+      }
+
+      const bodyRect = body.getBoundingClientRect()
+      const activeRect = activeNode.getBoundingClientRect()
+      const deltaWithinBody =
+        activeRect.top -
+        bodyRect.top -
+        (body.clientHeight - activeRect.height) / 2
+      const maxTop = Math.max(0, body.scrollHeight - body.clientHeight)
+      const centeredTop = clamp(body.scrollTop + deltaWithinBody, 0, maxTop)
+
+      if (Math.abs(body.scrollTop - centeredTop) < 2) {
+        return
+      }
+
+      if (typeof body.scrollTo === 'function') {
+        body.scrollTo({
+          top: centeredTop,
+          behavior: 'smooth',
+        })
+      } else {
+        body.scrollTop = centeredTop
+      }
+    })
+
+    return () => window.cancelAnimationFrame(rafId)
+  }, [
+    centerSpacerPx,
+    hasPronunciationLine,
+    hasTranslationLine,
+    lineIndex,
+    overlayHeight,
+    visible,
+  ])
+
+  if (
+    !visible ||
+    !hasStructuredLyricContent(mainLyric) ||
+    mainLines.length === 0
+  ) {
+    return null
+  }
+
+  const getMainLineStyle = (idx) => {
+    const delta = idx - activeIndex
+    const isActive = delta === 0
+    let opacity = isActive ? 1 : delta < 0 ? 0.6 : 0.72
+    const [r, g, b] = parseColorRGB(getColorValue(lyricsSettings.main.colorKey))
+    let color = isActive
+      ? `rgba(${r}, ${g}, ${b}, 0.98)`
+      : delta < 0
+        ? `rgba(${r}, ${g}, ${b}, 0.4)`
+        : `rgba(${r}, ${g}, ${b}, 0.54)`
+
+    if (delta > 1) {
+      const level = clamp(delta, 1, 6)
+      opacity = Math.max(0.36, 0.74 - level * 0.08)
+    }
+
+    if (delta < -1) {
+      const level = clamp(Math.abs(delta), 1, 6)
+      opacity = Math.max(0.28, 0.62 - level * 0.08)
+    }
+
+    const baseFontSize = lyricsSettings.main.fontSize
+    const fontSize = isActive ? baseFontSize : Math.round(baseFontSize * 0.8)
+
+    return {
+      opacity,
+      color,
+      fontSize,
+    }
+  }
+
+  const overlayStyle = isCompact
+    ? undefined
+    : {
+        height: overlayHeight,
+        maxHeight: maxHeightPx,
+      }
+
+  return (
+    <div
+      className={classes.overlay}
+      data-testid="karaoke-lyrics-overlay"
+      style={overlayStyle}
+    >
+      <div className={classes.resizeHandle} onMouseDown={onResizeStart} />
+
+      <div className={classes.header}>
+        <div className={classes.headerLeft}>
+          <Typography className={classes.language}>
+            {mainLyric?.lang || 'xxx'}
+          </Typography>
+          <div className={classes.layerControls}>
+            <Button
+              size="small"
+              onClick={onToggleTranslation}
+              disabled={!translationEnabled}
+              className={clsx(classes.layerToggle, {
+                [classes.layerToggleActive]: showTranslation,
+              })}
+              data-testid="lyrics-toggle-translation"
+            >
+              TR
+            </Button>
+            <Button
+              size="small"
+              onClick={onTogglePronunciation}
+              disabled={!pronunciationEnabled}
+              className={clsx(classes.layerToggle, {
+                [classes.layerToggleActive]: showPronunciation,
+              })}
+              data-testid="lyrics-toggle-pronunciation"
+            >
+              PR
+            </Button>
+          </div>
+        </div>
+
+        <div style={{ display: 'flex', alignItems: 'center', gap: 2 }}>
+          <LyricsSettingsPopover
+            settings={lyricsSettings}
+            onChange={handleSettingsChange}
+          />
+          <IconButton
+            className={classes.closeButton}
+            size="small"
+            onClick={onClose}
+            aria-label="Close lyrics"
+          >
+            <CloseIcon fontSize="small" />
+          </IconButton>
+        </div>
+      </div>
+
+      <div className={classes.body} ref={bodyRef}>
+        <div className={classes.lines}>
+          <div aria-hidden style={{ height: centerSpacerPx }} />
+          {mainLines.map((line, idx) => {
+            const trLine = trByMainIndex[idx]
+            const prLine = prByMainIndex[idx]
+            const showTr = shouldShowAuxLine(line, trLine)
+            const showPr = shouldShowAuxLine(line, prLine)
+            const lineStyle = getMainLineStyle(idx)
+            const auxOpacity =
+              lineStyle.opacity != null ? lineStyle.opacity * 0.85 : 1
+            const trStyle = {
+              opacity: auxOpacity,
+              fontSize: lyricsSettings.tr.fontSize,
+              color: getColorValue(lyricsSettings.tr.colorKey),
+            }
+            const prStyle = {
+              opacity: auxOpacity,
+              fontSize: lyricsSettings.pr.fontSize,
+              color: getColorValue(lyricsSettings.pr.colorKey),
+            }
+            return (
+              <div
+                key={`line-${line.index}-${line.start ?? idx}`}
+                ref={idx === activeIndex ? activeLineRef : null}
+                style={{ cursor: line.start != null ? 'pointer' : undefined }}
+                onClick={() => {
+                  if (audioInstance && line.start != null) {
+                    audioInstance.currentTime = line.start / 1000
+                  }
+                }}
+              >
+                {showTr && (
+                  <KaraokeLineRow
+                    line={trLine}
+                    nextLineStart={null}
+                    renderPlaybackMs={renderPlaybackMs}
+                    className={classes.inlineTr}
+                    style={trStyle}
+                    tokenClassName={classes.token}
+                    highlightTokens={false}
+                  />
+                )}
+                <KaraokeLineRow
+                  line={line}
+                  nextLineStart={mainLines[idx + 1]?.start ?? null}
+                  renderPlaybackMs={renderPlaybackMs}
+                  className={classes.line}
+                  style={lineStyle}
+                  tokenClassName={classes.token}
+                />
+                {showPr && (
+                  <KaraokeLineRow
+                    line={prLine}
+                    nextLineStart={null}
+                    renderPlaybackMs={renderPlaybackMs}
+                    className={classes.inlinePr}
+                    style={prStyle}
+                    tokenClassName={classes.token}
+                  />
+                )}
+              </div>
+            )
+          })}
+          <div aria-hidden style={{ height: centerSpacerPx }} />
+        </div>
+      </div>
+    </div>
+  )
+}
+
+export default KaraokeLyricsOverlay
diff --git a/ui/src/audioplayer/Player.jsx b/ui/src/audioplayer/Player.jsx
index eba3b82d7..b8b33b6d5 100644
--- a/ui/src/audioplayer/Player.jsx
+++ b/ui/src/audioplayer/Player.jsx
@@ -22,6 +22,7 @@ import {
   refreshQueue,
   setPlayMode,
   setTranscodingProfile,
+  updateQueueLyric,
   setVolume,
   syncQueue,
 } from '../actions'
@@ -33,6 +34,25 @@ import { keyMap } from '../hotkeys'
 import keyHandlers from './keyHandlers'
 import { calculateGain } from '../utils/calculateReplayGain'
 import { detectBrowserProfile, decisionService } from '../transcode'
+import {
+  getPreferredLyricLanguage,
+  hasStructuredLyricContent,
+  selectLyricLayers,
+  structuredLyricToLrc,
+} from './lyrics'
+import KaraokeLyricsOverlay from './KaraokeLyricsOverlay'
+
+const emptyLyricLayers = {
+  main: null,
+  translation: null,
+  pronunciation: null,
+}
+
+const normalizeLyricLayers = (layers) => ({
+  main: layers?.main || null,
+  translation: layers?.translation || null,
+  pronunciation: layers?.pronunciation || null,
+})
 
 const Player = () => {
   const theme = useCurrentTheme()
@@ -120,6 +140,72 @@ const Player = () => {
   const gainInfo = useSelector((state) => state.replayGain)
   const [context, setContext] = useState(null)
   const [gainNode, setGainNode] = useState(null)
+  const lyricCacheRef = useRef(new Map())
+  const lyricRequestIdRef = useRef(0)
+  const playerRef = useRef(null)
+  const [karaokeVisible, setKaraokeVisible] = useState(false)
+  const [selectedLyricLayers, setSelectedLyricLayers] =
+    useState(emptyLyricLayers)
+  const [showTranslation, setShowTranslation] = useState(false)
+  const [showPronunciation, setShowPronunciation] = useState(false)
+  const currentTrackId = playerState.current?.trackId
+  const currentTrackIsRadio = playerState.current?.isRadio
+  const selectedStructuredLyric = selectedLyricLayers.main
+  const hasKaraokeLyric = hasStructuredLyricContent(selectedStructuredLyric)
+  const hasTranslationLyric = hasStructuredLyricContent(
+    selectedLyricLayers.translation,
+  )
+  const hasPronunciationLyric = hasStructuredLyricContent(
+    selectedLyricLayers.pronunciation,
+  )
+
+  const applyLyricToRuntimePlayer = useCallback((trackId, lyric) => {
+    if (!trackId) {
+      return
+    }
+
+    const player = playerRef.current
+    if (!player || typeof player.setState !== 'function') {
+      return
+    }
+
+    player.setState((prevState) => {
+      const prevLists = Array.isArray(prevState.audioLists)
+        ? prevState.audioLists
+        : []
+      let changed = false
+      const audioLists = prevLists.map((item) => {
+        if (item.trackId !== trackId) {
+          return item
+        }
+        if (item.lyric === lyric) {
+          return item
+        }
+        changed = true
+        return {
+          ...item,
+          lyric,
+        }
+      })
+
+      const currentItem = audioLists.find(
+        (item) => item.musicSrc === prevState.musicSrc,
+      )
+      const currentLyric =
+        typeof currentItem?.lyric === 'string'
+          ? currentItem.lyric
+          : prevState.lyric
+
+      if (!changed && currentLyric === prevState.lyric) {
+        return null
+      }
+
+      return {
+        audioLists,
+        lyric: currentLyric,
+      }
+    })
+  }, [])
 
   useEffect(() => {
     if (
@@ -166,6 +252,107 @@ const Player = () => {
     return () => window.removeEventListener('beforeunload', handleBeforeUnload)
   }, [playerState, audioInstance])
 
+  useEffect(() => {
+    if (!currentTrackId || currentTrackIsRadio) {
+      setSelectedLyricLayers(emptyLyricLayers)
+      setShowTranslation(false)
+      setShowPronunciation(false)
+      setKaraokeVisible(false)
+      return
+    }
+
+    const cached = lyricCacheRef.current.get(currentTrackId)
+    let layers = emptyLyricLayers
+    if (cached && typeof cached !== 'string') {
+      if (cached.layers) {
+        layers = normalizeLyricLayers(cached.layers)
+      } else if (cached.structuredLyric) {
+        layers = normalizeLyricLayers({
+          main: cached.structuredLyric,
+        })
+      }
+    }
+    setSelectedLyricLayers(layers)
+    setShowTranslation(false)
+    setShowPronunciation(hasStructuredLyricContent(layers.pronunciation))
+  }, [currentTrackId, currentTrackIsRadio])
+
+  useEffect(() => {
+    lyricRequestIdRef.current += 1
+    const requestId = lyricRequestIdRef.current
+
+    if (!currentTrackId || currentTrackIsRadio) {
+      return
+    }
+
+    const cached = lyricCacheRef.current.get(currentTrackId)
+    if (cached !== undefined) {
+      const cachedLyric =
+        typeof cached === 'string' ? cached : cached?.lrc || ''
+      const cachedLayers =
+        typeof cached === 'string'
+          ? emptyLyricLayers
+          : cached?.layers
+            ? normalizeLyricLayers(cached.layers)
+            : normalizeLyricLayers({ main: cached?.structuredLyric })
+
+      setSelectedLyricLayers(cachedLayers)
+      setShowTranslation(false)
+      setShowPronunciation(
+        hasStructuredLyricContent(cachedLayers.pronunciation),
+      )
+      if (cachedLyric) {
+        dispatch(updateQueueLyric(currentTrackId, cachedLyric))
+        applyLyricToRuntimePlayer(currentTrackId, cachedLyric)
+      }
+      return
+    }
+
+    subsonic
+      .getLyricsBySongId(currentTrackId)
+      .then((resp) => {
+        if (lyricRequestIdRef.current !== requestId) {
+          return
+        }
+
+        const structuredLyrics =
+          resp?.json?.['subsonic-response']?.lyricsList?.structuredLyrics || []
+        const layers = selectLyricLayers(
+          structuredLyrics,
+          getPreferredLyricLanguage(),
+        )
+        const lyric = layers.main ? structuredLyricToLrc(layers.main) : ''
+        lyricCacheRef.current.set(currentTrackId, {
+          lrc: lyric,
+          layers,
+        })
+        setSelectedLyricLayers(layers)
+        setShowTranslation(false)
+        setShowPronunciation(hasStructuredLyricContent(layers.pronunciation))
+
+        if (lyric !== '') {
+          dispatch(updateQueueLyric(currentTrackId, lyric))
+          applyLyricToRuntimePlayer(currentTrackId, lyric)
+        }
+      })
+      .catch(() => {
+        if (lyricRequestIdRef.current !== requestId) {
+          return
+        }
+        setSelectedLyricLayers(emptyLyricLayers)
+        setShowTranslation(false)
+        setShowPronunciation(false)
+        // Do not cache network/request failures as empty lyrics, so we can retry.
+        lyricCacheRef.current.delete(currentTrackId)
+      })
+  }, [dispatch, currentTrackId, currentTrackIsRadio, applyLyricToRuntimePlayer])
+
+  useEffect(() => {
+    if (!hasKaraokeLyric && karaokeVisible) {
+      setKaraokeVisible(false)
+    }
+  }, [hasKaraokeLyric, karaokeVisible])
+
   const defaultOptions = useMemo(
     () => ({
       theme: playerTheme,
@@ -177,7 +364,7 @@ const Player = () => {
       clearPriorAudioLists: false,
       showDestroy: true,
       showDownload: false,
-      showLyric: true,
+      showLyric: false,
       showReload: false,
       toggleMode: !isDesktop,
       glassBg: false,
@@ -214,12 +401,24 @@ const Player = () => {
         (playerState.clear || playerState.playIndex === 0),
       clearPriorAudioLists: playerState.clear,
       extendsContent: (
-        <PlayerToolbar id={current.trackId} isRadio={current.isRadio} />
+        <PlayerToolbar
+          id={current.trackId}
+          isRadio={current.isRadio}
+          onToggleLyrics={() => setKaraokeVisible((visible) => !visible)}
+          lyricsActive={karaokeVisible}
+          lyricsDisabled={!hasKaraokeLyric}
+        />
       ),
       defaultVolume: isMobilePlayer ? 1 : playerState.volume,
       showMediaSession: !current.isRadio,
     }
-  }, [playerState, defaultOptions, isMobilePlayer])
+  }, [
+    playerState,
+    defaultOptions,
+    isMobilePlayer,
+    karaokeVisible,
+    hasKaraokeLyric,
+  ])
 
   const onAudioListsChange = useCallback(
     (_, audioLists, audioInfo) => dispatch(syncQueue(audioInfo, audioLists)),
@@ -391,6 +590,7 @@ const Player = () => {
   return (
     <ThemeProvider theme={createMuiTheme(theme)}>
       <ReactJkMusicPlayer
+        ref={playerRef}
         {...options}
         className={classes.player}
         onAudioListsChange={onAudioListsChange}
@@ -406,6 +606,28 @@ const Player = () => {
         onBeforeDestroy={onBeforeDestroy}
         getAudioInstance={setAudioInstance}
       />
+      <KaraokeLyricsOverlay
+        visible={karaokeVisible}
+        mainLyric={selectedLyricLayers.main}
+        translationLyric={selectedLyricLayers.translation}
+        pronunciationLyric={selectedLyricLayers.pronunciation}
+        showTranslation={showTranslation}
+        showPronunciation={showPronunciation}
+        translationEnabled={hasTranslationLyric}
+        pronunciationEnabled={hasPronunciationLyric}
+        onToggleTranslation={() =>
+          setShowTranslation((previous) =>
+            hasTranslationLyric ? !previous : false,
+          )
+        }
+        onTogglePronunciation={() =>
+          setShowPronunciation((previous) =>
+            hasPronunciationLyric ? !previous : false,
+          )
+        }
+        audioInstance={audioInstance}
+        onClose={() => setKaraokeVisible(false)}
+      />
       <GlobalHotKeys handlers={handlers} keyMap={keyMap} allowChanges />
     </ThemeProvider>
   )
diff --git a/ui/src/audioplayer/PlayerToolbar.jsx b/ui/src/audioplayer/PlayerToolbar.jsx
index 4812141ab..869df475d 100644
--- a/ui/src/audioplayer/PlayerToolbar.jsx
+++ b/ui/src/audioplayer/PlayerToolbar.jsx
@@ -4,7 +4,9 @@ import { useGetOne } from 'react-admin'
 import { GlobalHotKeys } from 'react-hotkeys'
 import IconButton from '@material-ui/core/IconButton'
 import { useMediaQuery } from '@material-ui/core'
+import Tooltip from '@material-ui/core/Tooltip'
 import { RiSaveLine } from 'react-icons/ri'
+import { RiFileMusicLine } from 'react-icons/ri'
 import { LoveButton, useToggleLove } from '../common'
 import { openSaveQueueDialog } from '../actions'
 import { keyMap } from '../hotkeys'
@@ -55,7 +57,13 @@ const useStyles = makeStyles((theme) => ({
   },
 }))
 
-const PlayerToolbar = ({ id, isRadio }) => {
+const PlayerToolbar = ({
+  id,
+  isRadio,
+  onToggleLyrics,
+  lyricsActive = false,
+  lyricsDisabled = false,
+}) => {
   const dispatch = useDispatch()
   const { data, loading } = useGetOne('song', id, { enabled: !!id && !isRadio })
   const [toggleLove, toggling] = useToggleLove('song', data)
@@ -99,6 +107,25 @@ const PlayerToolbar = ({ id, isRadio }) => {
     />
   )
 
+  const toggleLyricsButton = (
+    <Tooltip title="Toggle synchronized lyrics">
+      <span>
+        <IconButton
+          size={isDesktop ? 'small' : undefined}
+          onClick={onToggleLyrics}
+          disabled={!onToggleLyrics || lyricsDisabled}
+          data-testid="toggle-lyrics-button"
+          className={buttonClass}
+          color={lyricsActive ? 'primary' : 'default'}
+        >
+          <RiFileMusicLine
+            className={!isDesktop ? classes.mobileIcon : undefined}
+          />
+        </IconButton>
+      </span>
+    </Tooltip>
+  )
+
   return (
     <>
       <GlobalHotKeys keyMap={keyMap} handlers={handlers} allowChanges />
@@ -106,11 +133,13 @@ const PlayerToolbar = ({ id, isRadio }) => {
         <li className={`${listItemClass} item`}>
           {saveQueueButton}
           {loveButton}
+          {toggleLyricsButton}
         </li>
       ) : (
         <>
           <li className={`${listItemClass} item`}>{saveQueueButton}</li>
           <li className={`${listItemClass} item`}>{loveButton}</li>
+          <li className={`${listItemClass} item`}>{toggleLyricsButton}</li>
         </>
       )}
     </>
diff --git a/ui/src/audioplayer/PlayerToolbar.test.jsx b/ui/src/audioplayer/PlayerToolbar.test.jsx
index d0368b0f0..3041001eb 100644
--- a/ui/src/audioplayer/PlayerToolbar.test.jsx
+++ b/ui/src/audioplayer/PlayerToolbar.test.jsx
@@ -71,6 +71,7 @@ describe('<PlayerToolbar />', () => {
       // Verify both buttons are rendered
       expect(screen.getByTestId('save-queue-button')).toBeInTheDocument()
       expect(screen.getByTestId('love-button')).toBeInTheDocument()
+      expect(screen.getByTestId('toggle-lyrics-button')).toBeInTheDocument()
 
       // Verify desktop classes are applied
       expect(listItems[0].className).toContain('toolbar')
@@ -102,6 +103,14 @@ describe('<PlayerToolbar />', () => {
         type: 'OPEN_SAVE_QUEUE_DIALOG',
       })
     })
+
+    it('triggers lyric toggle callback when lyrics button is clicked', () => {
+      const onToggleLyrics = vi.fn()
+      render(<PlayerToolbar id="song-1" onToggleLyrics={onToggleLyrics} />)
+
+      fireEvent.click(screen.getByTestId('toggle-lyrics-button'))
+      expect(onToggleLyrics).toHaveBeenCalledTimes(1)
+    })
   })
 
   describe('Mobile layout', () => {
@@ -114,11 +123,12 @@ describe('<PlayerToolbar />', () => {
 
       // Each button should be in its own list item
       const listItems = screen.getAllByRole('listitem')
-      expect(listItems).toHaveLength(2)
+      expect(listItems).toHaveLength(3)
 
       // Verify both buttons are rendered
       expect(screen.getByTestId('save-queue-button')).toBeInTheDocument()
       expect(screen.getByTestId('love-button')).toBeInTheDocument()
+      expect(screen.getByTestId('toggle-lyrics-button')).toBeInTheDocument()
 
       // Verify mobile classes are applied
       expect(listItems[0].className).toContain('mobileListItem')
@@ -140,6 +150,13 @@ describe('<PlayerToolbar />', () => {
       const loveButton = screen.getByTestId('love-button')
       expect(loveButton).toBeDisabled()
     })
+
+    it('disables lyrics button when lyrics are unavailable', () => {
+      render(<PlayerToolbar id="song-1" lyricsDisabled={true} />)
+
+      const lyricsButton = screen.getByTestId('toggle-lyrics-button')
+      expect(lyricsButton).toBeDisabled()
+    })
   })
 
   describe('Common behavior', () => {
diff --git a/ui/src/audioplayer/lyrics.js b/ui/src/audioplayer/lyrics.js
new file mode 100644
index 000000000..3dcf9b0f9
--- /dev/null
+++ b/ui/src/audioplayer/lyrics.js
@@ -0,0 +1,617 @@
+const normalizeLanguageTag = (language) =>
+  (language || '').toLowerCase().replace(/_/g, '-') // all '_': zh_Hant_TW
+
+const KARAOKE_SWITCH_EPSILON_MS = 18
+const LYRIC_KIND_MAIN = 'main'
+const LYRIC_KIND_TRANSLATION = 'translation'
+const LYRIC_KIND_PRONUNCIATION = 'pronunciation'
+
+const padTime = (value) => {
+  const str = value.toString()
+  return str.length === 1 ? `0${str}` : str
+}
+
+const toTime = (value) => {
+  const numeric = value == null ? NaN : Number(value) // Number(null) is 0
+  return Number.isFinite(numeric) ? numeric : null // null = no usable time
+}
+
+// Comparator for times that may be missing: present values sort ascending and
+// a null/undefined time sorts after any present one (two missing ones tie).
+const compareNullableTime = (a, b) => {
+  const aMissing = a == null
+  const bMissing = b == null
+  if (aMissing && bMissing) return 0
+  if (aMissing || bMissing) {
+    return aMissing ? 1 : -1
+  }
+
+  return a - b
+}
+
+const sortTokensByStart = (tokens) =>
+  tokens
+    .map((token, order) => ({ ...token, order }))
+    .sort((a, b) => {
+      const byStart = compareNullableTime(a.start, b.start)
+      if (byStart !== 0) {
+        return byStart
+      }
+      const byEnd = compareNullableTime(a.end, b.end)
+      if (byEnd !== 0) {
+        return byEnd
+      }
+      return a.order - b.order
+    })
+    .map(({ order, ...token }) => token)
+
+// True when two language tags are equal or one extends the other with a
+// "-" subtag (e.g. "pt" and "pt-br"); an empty/missing tag never matches.
+const languageMatch = (candidate, preferred) => {
+  if (!candidate || !preferred) return false
+  if (candidate === preferred) return true
+
+  const candidateExtends = candidate.startsWith(`${preferred}-`)
+  if (candidateExtends) return true
+  return preferred.startsWith(`${candidate}-`)
+}
+
+const hasTimedLines = (lyric) =>
+  lyric &&
+  lyric.synced &&
+  Array.isArray(lyric.line) &&
+  lyric.line.some((line) => Number.isFinite(Number(line.start)))
+
+const normalizeToken = (token) => {
+  if (!token) {
+    return null
+  }
+  const value = typeof token.value === 'string' ? token.value : ''
+  if (!value.trim()) {
+    return null
+  }
+  return {
+    start: toTime(token.start),
+    end: toTime(token.end),
+    value,
+    role: typeof token.role === 'string' ? token.role : '',
+  }
+}
+
+const normalizeTokenLine = (tokenLine, fallbackIndex) => {
+  const index = Number.isFinite(Number(tokenLine?.index))
+    ? Number(tokenLine.index)
+    : fallbackIndex
+  const tokens = sortTokensByStart(
+    Array.isArray(tokenLine?.token)
+      ? tokenLine.token.map(normalizeToken).filter(Boolean)
+      : [],
+  )
+
+  return {
+    index,
+    start: toTime(tokenLine?.start),
+    end: toTime(tokenLine?.end),
+    value: typeof tokenLine?.value === 'string' ? tokenLine.value : '',
+    tokens,
+  }
+}
+
+// Maps a raw kind onto a known layer; unknown or missing kinds count as main.
+const normalizeLyricKind = (kind) => {
+  const normalized = (kind || '').toLowerCase().trim()
+  if (
+    normalized === LYRIC_KIND_TRANSLATION ||
+    normalized === LYRIC_KIND_PRONUNCIATION
+  ) {
+    return normalized
+  }
+  return LYRIC_KIND_MAIN
+}
+
+const pickLyricByLanguage = (lyrics, preferredLanguage) => {
+  if (!Array.isArray(lyrics) || lyrics.length === 0) {
+    return null
+  }
+
+  const preferred = normalizeLanguageTag(preferredLanguage)
+  const preferredBase = preferred.split('-')[0]
+
+  return (
+    lyrics.find((lyric) =>
+      languageMatch(normalizeLanguageTag(lyric.lang), preferred),
+    ) ||
+    lyrics.find((lyric) =>
+      languageMatch(normalizeLanguageTag(lyric.lang), preferredBase),
+    ) ||
+    lyrics.find((lyric) =>
+      languageMatch(normalizeLanguageTag(lyric.lang), 'en'),
+    ) ||
+    lyrics[0]
+  )
+}
+
+// Returns { start, end } for lines[index]. When toTime yields null for the
+// line's own end, the next line's start is used; a missing line gives nulls.
+const lineTimeWindow = (lines, index) => {
+  const current = lines[index]
+  if (!current) return { start: null, end: null }
+
+  const start = toTime(current.start)
+  const ownEnd = toTime(current.end)
+  return { start, end: ownEnd ?? toTime(lines[index + 1]?.start) }
+}
+
+const buildSyntheticWordTokens = (line, token) => {
+  const text = typeof line?.value === 'string' ? line.value : ''
+  if (!text.trim()) {
+    return null
+  }
+
+  const chunks = text.match(/\S+\s*/g) || []
+  if (chunks.length < 2) {
+    return null
+  }
+
+  const normalizedLine = text.replace(/\s+/g, ' ').trim().toLowerCase()
+  const normalizedTokenValue = (token?.value || '')
+    .replace(/\s+/g, ' ')
+    .trim()
+    .toLowerCase()
+  if (!normalizedTokenValue || !normalizedLine) {
+    return null
+  }
+
+  const compressedLine = normalizedLine.replace(/\s+/g, '')
+  const compressedToken = normalizedTokenValue.replace(/\s+/g, '')
+  const tokenLooksLikeWholeLine =
+    compressedToken === compressedLine ||
+    compressedToken.length >= Math.floor(compressedLine.length * 0.8)
+  if (!tokenLooksLikeWholeLine) {
+    return null
+  }
+
+  const tokenStart = toTime(token?.start)
+  const tokenEnd = toTime(token?.end)
+  const lineStart = toTime(line?.start)
+  const lineEnd = toTime(line?.end)
+
+  const baseStart = tokenStart ?? lineStart
+  const baseEnd = tokenEnd ?? lineEnd
+  if (
+    baseStart == null ||
+    baseEnd == null ||
+    !Number.isFinite(baseStart) ||
+    !Number.isFinite(baseEnd) ||
+    baseEnd <= baseStart
+  ) {
+    return null
+  }
+
+  const duration = baseEnd - baseStart
+  return chunks.map((chunk, idx) => ({
+    start: baseStart + (duration * idx) / chunks.length,
+    end: baseStart + (duration * (idx + 1)) / chunks.length,
+    value: chunk,
+    role: typeof token?.role === 'string' ? token.role : '',
+  }))
+}
+
+export const hasTokenTiming = (structuredLyric) =>
+  Boolean(
+    structuredLyric &&
+    Array.isArray(structuredLyric.tokenLine) &&
+    structuredLyric.tokenLine.some(
+      (tokenLine) =>
+        Array.isArray(tokenLine?.token) &&
+        tokenLine.token.some((token) => Number.isFinite(Number(token?.start))),
+    ),
+  )
+
+export const hasStructuredLyricContent = (structuredLyric) =>
+  Boolean(
+    structuredLyric &&
+    ((Array.isArray(structuredLyric.line) &&
+      structuredLyric.line.some(
+        (line) => typeof line?.value === 'string' && line.value.trim() !== '',
+      )) ||
+      hasTokenTiming(structuredLyric)),
+  )
+
+export const getPreferredLyricLanguage = () => {
+  if (typeof window !== 'undefined' && window.localStorage) {
+    const stored = window.localStorage.getItem('locale')
+    if (stored) {
+      return stored
+    }
+  }
+  if (typeof navigator !== 'undefined' && navigator.language) {
+    return navigator.language
+  }
+  return 'en'
+}
+
+export const selectLyricLayers = (structuredLyrics, preferredLanguage) => {
+  if (!Array.isArray(structuredLyrics)) {
+    return {
+      main: null,
+      translation: null,
+      pronunciation: null,
+    }
+  }
+
+  const synced = structuredLyrics.filter(hasTimedLines)
+  if (synced.length === 0) {
+    return {
+      main: null,
+      translation: null,
+      pronunciation: null,
+    }
+  }
+
+  const grouped = {
+    [LYRIC_KIND_MAIN]: [],
+    [LYRIC_KIND_TRANSLATION]: [],
+    [LYRIC_KIND_PRONUNCIATION]: [],
+  }
+
+  for (const lyric of synced) {
+    grouped[normalizeLyricKind(lyric?.kind)].push(lyric)
+  }
+
+  const mainCandidates = grouped[LYRIC_KIND_MAIN].length
+    ? grouped[LYRIC_KIND_MAIN]
+    : synced
+
+  return {
+    main: pickLyricByLanguage(mainCandidates, preferredLanguage),
+    translation: pickLyricByLanguage(
+      grouped[LYRIC_KIND_TRANSLATION],
+      preferredLanguage,
+    ),
+    pronunciation: pickLyricByLanguage(
+      grouped[LYRIC_KIND_PRONUNCIATION],
+      preferredLanguage,
+    ),
+  }
+}
+
+export const pickStructuredLyric = (structuredLyrics, preferredLanguage) =>
+  selectLyricLayers(structuredLyrics, preferredLanguage).main
+
+// Renders a structured lyric track as LRC text, one "[mm:ss.xx] value" row
+// per line whose `start` (read as milliseconds) is finite and non-negative.
+// Minutes are not wrapped at 60; assumes padTime only left-pads short values.
+export const structuredLyricToLrc = (structuredLyric) => {
+  if (!structuredLyric || !Array.isArray(structuredLyric.line)) {
+    return ''
+  }
+
+  let lyricText = ''
+  for (const line of structuredLyric.line) {
+    const start = Number(line?.start)
+    if (!Number.isFinite(start) || start < 0) {
+      continue
+    }
+    const hundredths = Math.floor(start / 10)
+    const seconds = Math.floor(hundredths / 100)
+    const min = padTime(Math.floor(seconds / 60))
+    const sec = padTime(seconds % 60)
+    const frac = padTime(hundredths % 100)
+    lyricText += `[${min}:${sec}.${frac}] ${line.value || ''}\n`
+  }
+  return lyricText
+}
+
+// Converts the track that pickStructuredLyric selects for `preferredLanguage`
+// into LRC text; the result is '' when nothing is selected.
+export const structuredLyricsToLrc = (structuredLyrics, preferredLanguage) => {
+  const picked = pickStructuredLyric(structuredLyrics, preferredLanguage)
+  // Only a truthy pick is handed to structuredLyricToLrc.
+  return picked ? structuredLyricToLrc(picked) : ''
+}
+
+export const buildKaraokeLines = (structuredLyric) => {
+  if (!structuredLyric) {
+    return []
+  }
+  // Output lines have the shape { index, start, end, value, tokens }.
+  const baseLines = Array.isArray(structuredLyric.line)
+    ? structuredLyric.line
+    : []
+  const rawTokenLines = Array.isArray(structuredLyric.tokenLine)
+    ? structuredLyric.tokenLine
+    : []
+  // Token lines win when present; plain lines only fill in missing fields.
+  const lines =
+    rawTokenLines.length > 0
+      ? rawTokenLines.map((tokenLine, fallbackIndex) => {
+          const normalized = normalizeTokenLine(tokenLine, fallbackIndex)
+          const baseLine = baseLines[normalized.index] || {}
+          const tokens = normalized.tokens
+          const fallbackStart =
+            tokens.find((token) => token.start != null)?.start ?? null
+          const fallbackEnd =
+            [...tokens].reverse().find((token) => token.end != null)?.end ??
+            null
+          const value =
+            normalized.value ||
+            (typeof baseLine.value === 'string' ? baseLine.value : '') ||
+            tokens.map((token) => token.value).join('')
+
+          return {
+            index: normalized.index,
+            start: normalized.start ?? toTime(baseLine.start) ?? fallbackStart,
+            end: normalized.end ?? toTime(baseLine.end) ?? fallbackEnd,
+            value,
+            tokens,
+          }
+        })
+      : baseLines.map((line, index) => ({
+          index,
+          start: toTime(line.start),
+          end: toTime(line.end),
+          value: typeof line.value === 'string' ? line.value : '',
+          tokens: [],
+        }))
+  // Drop empty lines, order by start (untimed lines last), then by index.
+  const normalized = lines
+    .filter((line) => line.value || line.tokens.length > 0)
+    .sort((a, b) => {
+      if (a.start == null && b.start == null) {
+        return a.index - b.index
+      }
+      if (a.start == null) {
+        return 1
+      }
+      if (b.start == null) {
+        return -1
+      }
+      if (a.start !== b.start) {
+        return a.start - b.start
+      }
+      return a.index - b.index
+    })
+    .map((line) => {
+      const nextLine = { ...line }
+      if (nextLine.tokens.length === 1) {
+        const syntheticTokens = buildSyntheticWordTokens(
+          nextLine,
+          nextLine.tokens[0],
+        )
+        if (syntheticTokens) {
+          nextLine.tokens = syntheticTokens
+        }
+      }
+      return nextLine
+    })
+  // A line without an end borrows the start of the line that follows it.
+  for (let i = 0; i < normalized.length; i += 1) {
+    if (normalized[i].end == null) {
+      const nextStart = normalized[i + 1]?.start
+      if (nextStart != null) {
+        normalized[i].end = nextStart
+      }
+    }
+  }
+
+  return normalized
+}
+
+export const resolveKaraokeTokenWindow = (
+  line,
+  tokenIndex,
+  lineEndFallback = null,
+) => {
+  const tokens = Array.isArray(line?.tokens) ? line.tokens : []
+  const token = tokens[tokenIndex]
+  if (!token) {
+    return { start: null, end: null }
+  }
+
+  const prevToken = tokenIndex > 0 ? tokens[tokenIndex - 1] : null
+  const nextToken =
+    tokenIndex + 1 < tokens.length ? tokens[tokenIndex + 1] : null
+  // Even split of the line window across its tokens, used as a fallback.
+  const lineStart = toTime(line?.start)
+  const lineEnd = toTime(line?.end) ?? toTime(lineEndFallback)
+  const tokenCount = tokens.length
+  const hasLineWindow =
+    lineStart != null &&
+    lineEnd != null &&
+    Number.isFinite(lineStart) &&
+    Number.isFinite(lineEnd) &&
+    lineEnd > lineStart
+  const estimatedStart =
+    hasLineWindow && tokenCount > 0
+      ? lineStart + ((lineEnd - lineStart) * tokenIndex) / tokenCount
+      : null
+  const estimatedEnd =
+    hasLineWindow && tokenCount > 0
+      ? lineStart + ((lineEnd - lineStart) * (tokenIndex + 1)) / tokenCount
+      : null
+  // Count explicit token times and how many distinct values they take.
+  let explicitStartCount = 0
+  let explicitEndCount = 0
+  const uniqueStarts = new Set()
+  const uniqueEnds = new Set()
+
+  for (let i = 0; i < tokenCount; i += 1) {
+    const explicitStart = toTime(tokens[i]?.start)
+    if (explicitStart != null) {
+      explicitStartCount += 1
+      uniqueStarts.add(explicitStart)
+    }
+
+    const explicitEnd = toTime(tokens[i]?.end)
+    if (explicitEnd != null) {
+      explicitEndCount += 1
+      uniqueEnds.add(explicitEnd)
+    }
+  }
+  // Collapsed: 2+ explicit times but at most max(1, count / 4) distinct.
+  const collapsedStarts =
+    explicitStartCount > 1 && uniqueStarts.size <= Math.max(1, tokenCount / 4)
+  const collapsedEnds =
+    explicitEndCount > 1 && uniqueEnds.size <= Math.max(1, tokenCount / 4)
+  const shouldForceEstimated =
+    hasLineWindow && tokenCount > 1 && (collapsedStarts || collapsedEnds)
+  // Collapsed timing is ignored in favour of the even split.
+  if (shouldForceEstimated) {
+    return {
+      start: estimatedStart,
+      end: estimatedEnd,
+    }
+  }
+  const prevEnd = toTime(prevToken?.end) ?? toTime(prevToken?.start)
+  // Missing start: previous token time, else estimate, else line start.
+  let start = toTime(token.start)
+  if (start == null) {
+    start = prevEnd ?? estimatedStart ?? lineStart
+  }
+  // Missing end: next token start, else estimate, else line end.
+  let end = toTime(token.end)
+  if (end == null) {
+    const nextDirectStart = toTime(nextToken?.start)
+    const nextEstimatedStart =
+      hasLineWindow && tokenIndex + 1 < tokenCount
+        ? lineStart + ((lineEnd - lineStart) * (tokenIndex + 1)) / tokenCount
+        : null
+    end = nextDirectStart ?? nextEstimatedStart ?? estimatedEnd ?? lineEnd
+  }
+  // A lone token without a usable window takes the whole line window.
+  if (
+    tokenCount === 1 &&
+    hasLineWindow &&
+    (start == null || end == null || end <= start + 1)
+  ) {
+    start = lineStart
+    end = lineEnd
+  }
+  // Never report an end that precedes the start.
+  if (start != null && end != null && end < start) {
+    end = start
+  }
+
+  return { start, end }
+}
+
+export const getActiveKaraokeState = (lines, currentTimeMs) => {
+  if (!Array.isArray(lines) || lines.length === 0) {
+    return { lineIndex: -1, tokenIndex: -1 }
+  }
+  // A non-numeric playback time is treated as 0.
+  const current = Number.isFinite(Number(currentTimeMs))
+    ? Number(currentTimeMs)
+    : 0
+  let lineIndex = 0
+  for (let i = 0; i < lines.length; i += 1) {
+    const lineStart = toTime(lines[i]?.start)
+    if (lineStart == null || lineStart <= current + KARAOKE_SWITCH_EPSILON_MS) {
+      lineIndex = i
+      continue
+    }
+    break
+  }
+  // Step back to the nearest line that has not ended yet (end may be unknown).
+  for (let i = lineIndex; i >= 0; i -= 1) {
+    const lineStart = toTime(lines[i]?.start)
+    const lineEnd = toTime(lines[i]?.end) ?? toTime(lines[i + 1]?.start)
+    if (lineStart != null && current + KARAOKE_SWITCH_EPSILON_MS < lineStart) {
+      continue
+    }
+    if (lineEnd == null || current <= lineEnd + KARAOKE_SWITCH_EPSILON_MS) {
+      lineIndex = i
+      break
+    }
+  }
+  // Pick the last started token; stop early once one still covers the time.
+  const activeLine = lines[lineIndex] || null
+  const tokens = Array.isArray(activeLine?.tokens) ? activeLine.tokens : []
+  let tokenIndex = -1
+  for (let i = 0; i < tokens.length; i += 1) {
+    const { start: tokenStart, end: tokenEnd } = resolveKaraokeTokenWindow(
+      activeLine,
+      i,
+      lines[lineIndex + 1]?.start,
+    )
+    if (
+      tokenStart == null ||
+      tokenStart <= current + KARAOKE_SWITCH_EPSILON_MS
+    ) {
+      tokenIndex = i
+      if (tokenEnd != null && current <= tokenEnd + KARAOKE_SWITCH_EPSILON_MS) {
+        break
+      }
+      continue
+    }
+    break
+  }
+
+  return { lineIndex, tokenIndex }
+}
+
+export const findLayerLineIndexForMain = (mainLines, layerLines, mainIndex) => {
+  if (
+    !Array.isArray(mainLines) ||
+    !Array.isArray(layerLines) ||
+    mainLines.length === 0 ||
+    layerLines.length === 0 ||
+    mainIndex < 0 ||
+    mainIndex >= mainLines.length
+  ) {
+    return -1
+  }
+
+  const { start: mainStart, end: mainEnd } = lineTimeWindow(
+    mainLines,
+    mainIndex,
+  )
+  // A main line without a start time cannot be matched.
+  if (mainStart == null) {
+    return -1
+  }
+  const mainWindowEnd = mainEnd ?? mainStart
+  const mainWindowDuration = Math.max(0, mainWindowEnd - mainStart)
+  const maxDelta = Math.max(550, Math.min(1400, mainWindowDuration + 420))
+  // Lowest score wins; -1 is returned when no layer line qualifies.
+  let bestIdx = -1
+  let bestScore = Number.POSITIVE_INFINITY
+
+  for (let i = 0; i < layerLines.length; i += 1) {
+    const { start, end } = lineTimeWindow(layerLines, i)
+    // Overlapping windows always qualify: start gap plus 30 per index step.
+    if (start != null && end != null) {
+      const overlap = Math.min(end, mainEnd ?? end) - Math.max(start, mainStart)
+      if (overlap >= 0) {
+        const score = Math.abs(start - mainStart) + Math.abs(i - mainIndex) * 30
+        if (score < bestScore) {
+          bestScore = score
+          bestIdx = i
+        }
+        continue
+      }
+    }
+    // Otherwise the start gap must be within maxDelta: gap plus 45 per step.
+    if (start != null) {
+      if (Math.abs(start - mainStart) > maxDelta) {
+        continue
+      }
+      const score = Math.abs(start - mainStart) + Math.abs(i - mainIndex) * 45
+      if (score < bestScore) {
+        bestScore = score
+        bestIdx = i
+      }
+    }
+  }
+
+  return bestIdx
+}
+
+// Pairs the index found by findLayerLineIndexForMain with its layer line;
+// `line` is null unless that index is zero or greater.
+export const resolveLayerLineForMain = (mainLines, layerLines, mainIndex) => {
+  const index = findLayerLineIndexForMain(mainLines, layerLines, mainIndex)
+  const line = index >= 0 ? layerLines[index] : null
+  return { index, line }
+}
diff --git a/ui/src/audioplayer/lyrics.test.js b/ui/src/audioplayer/lyrics.test.js
new file mode 100644
index 000000000..c60605a6f
--- /dev/null
+++ b/ui/src/audioplayer/lyrics.test.js
@@ -0,0 +1,416 @@
+import {
+  buildKaraokeLines,
+  findLayerLineIndexForMain,
+  getPreferredLyricLanguage,
+  getActiveKaraokeState,
+  hasStructuredLyricContent,
+  pickStructuredLyric,
+  resolveKaraokeTokenWindow,
+  resolveLayerLineForMain,
+  selectLyricLayers,
+  structuredLyricToLrc,
+  structuredLyricsToLrc,
+} from './lyrics'
+
+describe('lyrics helpers', () => {
+  beforeEach(() => {
+    localStorage.clear()
+  })
+
+  it('prefers a lyric track that matches the locale', () => {
+    const selected = pickStructuredLyric(
+      [
+        {
+          lang: 'eng',
+          synced: true,
+          line: [{ start: 1000, value: 'English line' }],
+        },
+        {
+          lang: 'pt-BR',
+          synced: true,
+          line: [{ start: 1000, value: 'Linha em portugues' }],
+        },
+      ],
+      'pt-BR',
+    )
+
+    expect(selected.lang).toBe('pt-BR')
+  })
+
+  it('falls back to english when preferred locale is not available', () => {
+    const selected = pickStructuredLyric(
+      [
+        {
+          lang: 'eng',
+          synced: true,
+          line: [{ start: 1000, value: 'English line' }],
+        },
+        {
+          lang: 'deu',
+          synced: true,
+          line: [{ start: 1000, value: 'Deutsche Zeile' }],
+        },
+      ],
+      'pt-BR',
+    )
+
+    expect(selected.lang).toBe('eng')
+  })
+
+  it('falls back to first synced track when english is missing', () => {
+    const selected = pickStructuredLyric(
+      [
+        {
+          lang: 'jpn',
+          synced: true,
+          line: [{ start: 1000, value: 'Nihongo' }],
+        },
+        {
+          lang: 'deu',
+          synced: true,
+          line: [{ start: 1000, value: 'Deutsch' }],
+        },
+      ],
+      'pt-BR',
+    )
+
+    expect(selected.lang).toBe('jpn')
+  })
+
+  it('selects translation and pronunciation layers by kind', () => {
+    const layers = selectLyricLayers(
+      [
+        {
+          kind: 'main',
+          lang: 'ja',
+          synced: true,
+          line: [{ start: 1000, value: 'こんにちは' }],
+        },
+        {
+          kind: 'translation',
+          lang: 'es',
+          synced: true,
+          line: [{ start: 1000, value: 'Hola' }],
+        },
+        {
+          kind: 'pronunciation',
+          lang: 'ja-Latn',
+          synced: true,
+          line: [{ start: 1000, value: 'konnichiwa' }],
+        },
+      ],
+      'es-MX',
+    )
+
+    expect(layers.main.lang).toBe('ja')
+    expect(layers.translation.lang).toBe('es')
+    expect(layers.pronunciation.lang).toBe('ja-Latn')
+  })
+
+  it('treats missing kind as main for backward compatibility', () => {
+    const layers = selectLyricLayers(
+      [
+        {
+          lang: 'eng',
+          synced: true,
+          line: [{ start: 1000, value: 'Main' }],
+        },
+      ],
+      'eng',
+    )
+
+    expect(layers.main.lang).toBe('eng')
+    expect(layers.translation).toBeNull()
+    expect(layers.pronunciation).toBeNull()
+  })
+
+  it('matches layer line by timing for the active main line', () => {
+    const mainLines = [
+      { index: 0, start: 1000, end: 1800, value: 'Line A', tokens: [] },
+      { index: 1, start: 2000, end: 2800, value: 'Line B', tokens: [] },
+    ]
+    const layerLines = [
+      { index: 0, start: 900, end: 1750, value: 'A2', tokens: [] },
+      { index: 1, start: 2050, end: 2900, value: 'B2', tokens: [] },
+    ]
+
+    expect(findLayerLineIndexForMain(mainLines, layerLines, 1)).toBe(1)
+    expect(resolveLayerLineForMain(mainLines, layerLines, 0).line.value).toBe(
+      'A2',
+    )
+  })
+
+  it('matches metadata layers by nearest timing even when indexes differ', () => {
+    const mainLines = [
+      { index: 0, start: 1000, end: 1800, value: 'Line A', tokens: [] },
+      { index: 1, start: 2000, end: 2800, value: 'Line B', tokens: [] },
+      { index: 2, start: 3000, end: 3800, value: 'Line C', tokens: [] },
+    ]
+    const layerLines = [
+      { index: 2, start: 3020, end: 3820, value: 'C2', tokens: [] },
+      { index: 0, start: 980, end: 1760, value: 'A2', tokens: [] },
+      { index: 1, start: 2010, end: 2810, value: 'B2', tokens: [] },
+    ]
+
+    expect(findLayerLineIndexForMain(mainLines, layerLines, 1)).toBe(2)
+    expect(resolveLayerLineForMain(mainLines, layerLines, 2).line.value).toBe(
+      'C2',
+    )
+  })
+
+  it('returns no layer match when the nearest line is too far in time', () => {
+    const mainLines = [
+      { index: 0, start: 1000, end: 1800, value: 'Line A', tokens: [] },
+      { index: 1, start: 2000, end: 2800, value: 'Line B', tokens: [] },
+    ]
+    const layerLines = [
+      { index: 0, start: 60000, end: 60800, value: 'Far line', tokens: [] },
+    ]
+
+    expect(findLayerLineIndexForMain(mainLines, layerLines, 1)).toBe(-1)
+    expect(resolveLayerLineForMain(mainLines, layerLines, 1).line).toBeNull()
+  })
+
+  it('converts a structured lyric track to LRC', () => {
+    const lrc = structuredLyricToLrc({
+      lang: 'eng',
+      synced: true,
+      line: [
+        { start: 18800, value: "We're no strangers to love" },
+        { start: 22801, value: 'You know the rules and so do I' },
+      ],
+    })
+
+    expect(lrc).toBe(
+      "[00:18.80] We're no strangers to love\n[00:22.80] You know the rules and so do I\n",
+    )
+  })
+
+  it('returns empty text when no synced lyrics are available', () => {
+    const lrc = structuredLyricsToLrc(
+      [{ lang: 'eng', synced: false, line: [{ value: 'Unsynced line' }] }],
+      'eng',
+    )
+
+    expect(lrc).toBe('')
+  })
+
+  it('reads preferred language from localStorage first', () => {
+    localStorage.setItem('locale', 'pt-BR')
+    expect(getPreferredLyricLanguage()).toBe('pt-BR')
+  })
+
+  it('builds karaoke lines from tokenLine payload', () => {
+    const lines = buildKaraokeLines({
+      lang: 'eng',
+      synced: true,
+      line: [{ start: 1000, end: 3000, value: 'Hello world' }],
+      tokenLine: [
+        {
+          index: 0,
+          start: 1000,
+          end: 3000,
+          value: 'Hello world',
+          token: [
+            { start: 1000, end: 1500, value: 'Hello' },
+            { start: 2000, end: 2500, value: 'world', role: 'x-bg' },
+          ],
+        },
+      ],
+    })
+
+    expect(lines).toEqual([
+      {
+        index: 0,
+        start: 1000,
+        end: 3000,
+        value: 'Hello world',
+        tokens: [
+          { start: 1000, end: 1500, value: 'Hello', role: '' },
+          { start: 2000, end: 2500, value: 'world', role: 'x-bg' },
+        ],
+      },
+    ])
+  })
+
+  it('sorts token timing by start to keep playback stable', () => {
+    const lines = buildKaraokeLines({
+      lang: 'eng',
+      synced: true,
+      line: [{ start: 1000, end: 3000, value: 'Hello world' }],
+      tokenLine: [
+        {
+          index: 0,
+          start: 1000,
+          end: 3000,
+          value: 'Hello world',
+          token: [
+            { start: 2000, end: 2500, value: 'world', role: '' },
+            { start: 1000, end: 1500, value: 'Hello', role: '' },
+          ],
+        },
+      ],
+    })
+
+    expect(lines[0].tokens.map((token) => token.value)).toEqual([
+      'Hello',
+      'world',
+    ])
+  })
+
+  it('splits a single full-line token into synthetic word tokens', () => {
+    const lines = buildKaraokeLines({
+      lang: 'ko-Latn',
+      synced: true,
+      line: [{ start: 1000, end: 2000, value: 'Da-la-lun, dun' }],
+      tokenLine: [
+        {
+          index: 0,
+          start: 1000,
+          end: 2000,
+          value: 'Da-la-lun, dun',
+          token: [{ start: 1000, end: 2000, value: 'Da-la-lun, dun' }],
+        },
+      ],
+    })
+
+    expect(lines).toHaveLength(1)
+    expect(lines[0].tokens).toHaveLength(2)
+    expect(lines[0].tokens[0].value).toBe('Da-la-lun, ')
+    expect(lines[0].tokens[1].value).toBe('dun')
+
+    const firstWindow = resolveKaraokeTokenWindow(lines[0], 0)
+    const secondWindow = resolveKaraokeTokenWindow(lines[0], 1)
+
+    expect(firstWindow.start).toBeCloseTo(1000)
+    expect(firstWindow.end).toBeCloseTo(1500)
+    expect(secondWindow.start).toBeCloseTo(1500)
+    expect(secondWindow.end).toBeCloseTo(2000)
+  })
+
+  it('detects active line and token for karaoke timing', () => {
+    const state = getActiveKaraokeState(
+      [
+        {
+          index: 0,
+          start: 1000,
+          end: 3000,
+          value: 'Hello world',
+          tokens: [
+            { start: 1000, end: 1500, value: 'Hello', role: '' },
+            { start: 2000, end: 2500, value: 'world', role: '' },
+          ],
+        },
+        {
+          index: 1,
+          start: 3500,
+          end: 5000,
+          value: 'Second line',
+          tokens: [],
+        },
+      ],
+      2200,
+    )
+
+    expect(state).toEqual({ lineIndex: 0, tokenIndex: 1 })
+  })
+
+  it('resolves token window fallback boundaries from neighboring tokens', () => {
+    const line = {
+      start: 1000,
+      end: 3000,
+      value: 'Hello world',
+      tokens: [
+        { start: 1200, value: 'Hello', role: '' },
+        { start: 1800, value: 'world', role: '' },
+      ],
+    }
+
+    expect(resolveKaraokeTokenWindow(line, 0)).toEqual({
+      start: 1200,
+      end: 1800,
+    })
+    expect(resolveKaraokeTokenWindow(line, 1)).toEqual({
+      start: 1800,
+      end: 3000,
+    })
+  })
+
+  it('infers sequential token windows when token timings are missing', () => {
+    const line = {
+      start: 1000,
+      end: 2000,
+      value: 'A B C',
+      tokens: [
+        { value: 'A', role: '' },
+        { value: 'B', role: '' },
+        { value: 'C', role: '' },
+      ],
+    }
+
+    const first = resolveKaraokeTokenWindow(line, 0)
+    const second = resolveKaraokeTokenWindow(line, 1)
+    const third = resolveKaraokeTokenWindow(line, 2)
+
+    expect(first.start).toBeCloseTo(1000)
+    expect(first.end).toBeCloseTo(1333.3333333333333)
+
+    expect(second.start).toBeCloseTo(1333.3333333333333)
+    expect(second.end).toBeCloseTo(1666.6666666666667)
+
+    expect(third.start).toBeCloseTo(1666.6666666666667)
+    expect(third.end).toBeCloseTo(2000)
+  })
+
+  it('falls back to sequential windows when token timings are collapsed', () => {
+    const line = {
+      start: 1000,
+      end: 2000,
+      value: 'A B C',
+      tokens: [
+        { start: 1000, end: 2000, value: 'A', role: '' },
+        { start: 1000, end: 2000, value: 'B', role: '' },
+        { start: 1000, end: 2000, value: 'C', role: '' },
+      ],
+    }
+
+    const first = resolveKaraokeTokenWindow(line, 0)
+    const second = resolveKaraokeTokenWindow(line, 1)
+    const third = resolveKaraokeTokenWindow(line, 2)
+
+    expect(first.start).toBeCloseTo(1000)
+    expect(first.end).toBeCloseTo(1333.3333333333333)
+    expect(second.start).toBeCloseTo(1333.3333333333333)
+    expect(second.end).toBeCloseTo(1666.6666666666667)
+    expect(third.start).toBeCloseTo(1666.6666666666667)
+    expect(third.end).toBeCloseTo(2000)
+  })
+
+  it('keeps token selection stable near tight token boundaries', () => {
+    const state = getActiveKaraokeState(
+      [
+        {
+          index: 0,
+          start: 1000,
+          end: 2000,
+          value: 'A B',
+          tokens: [
+            { start: 1000, end: 1100, value: 'A', role: '' },
+            { start: 1110, end: 1300, value: 'B', role: '' },
+          ],
+        },
+      ],
+      1108,
+    )
+
+    expect(state).toEqual({ lineIndex: 0, tokenIndex: 0 })
+  })
+
+  it('reports structured lyric content when token timing exists', () => {
+    expect(
+      hasStructuredLyricContent({
+        tokenLine: [{ token: [{ start: 100, value: 'a' }] }],
+      }),
+    ).toBe(true)
+  })
+})
diff --git a/ui/src/reducers/playerReducer.js b/ui/src/reducers/playerReducer.js
index 466a3ec87..2c3b2ba7a 100644
--- a/ui/src/reducers/playerReducer.js
+++ b/ui/src/reducers/playerReducer.js
@@ -7,6 +7,7 @@ import {
   PLAYER_CURRENT,
   PLAYER_PLAY_NEXT,
   PLAYER_PLAY_TRACKS,
+  PLAYER_UPDATE_LYRIC,
   PLAYER_SET_TRACK,
   PLAYER_SET_VOLUME,
   PLAYER_SYNC_QUEUE,
@@ -60,21 +61,25 @@ const mapToAudioLists = (item) => {
   let lyricText = ''
 
   if (lyrics) {
-    const structured = JSON.parse(lyrics)
-    for (const structuredLyric of structured) {
-      if (structuredLyric.synced) {
-        for (const line of structuredLyric.line) {
-          let time = Math.floor(line.start / 10)
-          const ms = time % 100
-          time = Math.floor(time / 100)
-          const sec = time % 60
-          time = Math.floor(time / 60)
-          const min = time % 60
+    try {
+      const structured = JSON.parse(lyrics)
+      for (const structuredLyric of structured) {
+        if (structuredLyric.synced) {
+          for (const line of structuredLyric.line) {
+            let time = Math.floor(line.start / 10)
+            const ms = time % 100
+            time = Math.floor(time / 100)
+            const sec = time % 60
+            time = Math.floor(time / 60)
+            const min = time % 60
 
-          ms.toString()
-          lyricText += `[${pad(min)}:${pad(sec)}.${pad(ms)}] ${line.value}\n`
+            ms.toString()
+            lyricText += `[${pad(min)}:${pad(sec)}.${pad(ms)}] ${line.value}\n`
+          }
         }
       }
+    } catch {
+      lyricText = ''
     }
   }
 
@@ -206,6 +211,45 @@ const reduceMode = (state, { data: { mode } }) => {
   }
 }
 
+// Stores `lyric` on every queue entry that carries `trackId`, and on `current`
+// when it carries that trackId too. The incoming state object itself is
+// returned when `trackId` is falsy or when no queue entry needs the new lyric.
+const reduceUpdateLyric = (state, { data: { trackId, lyric } }) => {
+  if (!trackId) {
+    return state
+  }
+
+  // An entry is copied only if it is this track and its lyric differs.
+  const needsLyric = (entry) =>
+    entry.trackId === trackId && entry.lyric !== lyric
+
+  let replaced = 0
+  const queue = state.queue.map((entry) => {
+    if (!needsLyric(entry)) {
+      return entry
+    }
+    replaced += 1
+    return { ...entry, lyric }
+  })
+
+  // Hand back the very same object when no queue entry was replaced.
+  if (replaced === 0) {
+    return state
+  }
+
+  // `current` gets the lyric whenever it is the same track.
+  let current = state.current
+  if (current?.trackId === trackId) {
+    current = { ...current, lyric }
+  }
+
+  return {
+    ...state,
+    queue,
+    current,
+  }
+}
+
 export const playerReducer = (previousState = initialState, payload) => {
   const { type } = payload
   switch (type) {
@@ -243,6 +287,8 @@ export const playerReducer = (previousState = initialState, payload) => {
           previousState.savedPlayIndex >= 0 ? previousState.savedPlayIndex : 0,
       }
     }
+    case PLAYER_UPDATE_LYRIC:
+      return reduceUpdateLyric(previousState, payload)
     default:
       return previousState
   }
diff --git a/ui/src/reducers/playerReducer.test.js b/ui/src/reducers/playerReducer.test.js
index 10e9512d7..1c399859d 100644
--- a/ui/src/reducers/playerReducer.test.js
+++ b/ui/src/reducers/playerReducer.test.js
@@ -1,11 +1,24 @@
-import { describe, it, expect } from 'vitest'
+import { describe, expect, it, vi } from 'vitest'
 import { playerReducer } from './playerReducer'
 import {
-  PLAYER_SYNC_QUEUE,
   PLAYER_CURRENT,
   PLAYER_REFRESH_QUEUE,
+  PLAYER_SET_TRACK,
+  PLAYER_SYNC_QUEUE,
+  PLAYER_UPDATE_LYRIC,
 } from '../actions'
 
+vi.mock('uuid', () => ({
+  v4: () => 'test-uuid',
+}))
+
+vi.mock('../subsonic', () => ({
+  default: {
+    streamUrl: vi.fn((id) => `/rest/stream?id=${id}`),
+    getCoverArtUrl: vi.fn(() => '/rest/getCoverArt?id=test'),
+  },
+}))
+
 describe('playerReducer', () => {
   describe('pending track selection survives SYNC_QUEUE and premature CURRENT', () => {
     // Simulates the real sequence when clicking a new song while one is playing:
@@ -54,8 +67,6 @@ describe('playerReducer', () => {
     })
 
     it('CURRENT for old track preserves pending playIndex', () => {
-      // After SYNC_QUEUE, queue has new UUIDs. The old track's UUID (zzz)
-      // is at index 2, but playIndex is 0. This is a premature callback.
       const stateAfterSync = {
         ...stateAfterPlayTracks,
         queue: [
@@ -71,7 +82,7 @@ describe('playerReducer', () => {
       const result = playerReducer(stateAfterSync, action)
       expect(result.playIndex).toBe(0)
       expect(result.clear).toBe(true)
-      expect(result.savedPlayIndex).toBe(2) // preserved from before
+      expect(result.savedPlayIndex).toBe(2)
     })
 
     it('CURRENT for correct track consumes pending playIndex', () => {
@@ -83,7 +94,6 @@ describe('playerReducer', () => {
           { trackId: 's3', uuid: 'zzz', name: 'Song 3' },
         ],
       }
-      // Player switched to Song 1 (uuid 'xxx', index 0 == playIndex)
       const action = {
         type: PLAYER_CURRENT,
         data: { uuid: 'xxx', name: 'Song 1', volume: 1 },
@@ -142,4 +152,80 @@ describe('playerReducer', () => {
       expect(result.playIndex).toBe(0)
     })
   })
+
+  it('maps embedded synced lyrics to LRC text', () => {
+    const lyrics = JSON.stringify([
+      {
+        lang: 'eng',
+        synced: true,
+        line: [{ start: 1000, value: 'Line one' }],
+      },
+      {
+        lang: 'eng',
+        synced: false,
+        line: [{ value: 'Unsynced line' }],
+      },
+    ])
+
+    const state = playerReducer(undefined, {
+      type: PLAYER_SET_TRACK,
+      data: {
+        id: 'song-1',
+        title: 'Test Song',
+        artist: 'Test Artist',
+        album: 'Test Album',
+        duration: 60,
+        lyrics,
+      },
+    })
+
+    expect(state.queue).toHaveLength(1)
+    expect(state.queue[0].lyric).toBe('[00:01.00] Line one\n')
+  })
+
+  it('updates queue lyric by track id', () => {
+    const initial = playerReducer(undefined, {
+      type: PLAYER_SET_TRACK,
+      data: {
+        id: 'song-1',
+        title: 'Test Song',
+        artist: 'Test Artist',
+        album: 'Test Album',
+        duration: 60,
+      },
+    })
+
+    const updated = playerReducer(initial, {
+      type: PLAYER_UPDATE_LYRIC,
+      data: {
+        trackId: 'song-1',
+        lyric: '[00:01.00] Updated lyric\n',
+      },
+    })
+
+    expect(updated.queue[0].lyric).toBe('[00:01.00] Updated lyric\n')
+  })
+
+  it('returns same state when lyric update does not match any track', () => {
+    const initial = playerReducer(undefined, {
+      type: PLAYER_SET_TRACK,
+      data: {
+        id: 'song-1',
+        title: 'Test Song',
+        artist: 'Test Artist',
+        album: 'Test Album',
+        duration: 60,
+      },
+    })
+
+    const updated = playerReducer(initial, {
+      type: PLAYER_UPDATE_LYRIC,
+      data: {
+        trackId: 'missing-track',
+        lyric: '[00:01.00] Updated lyric\n',
+      },
+    })
+
+    expect(updated).toBe(initial)
+  })
 })
diff --git a/ui/src/subsonic/index.js b/ui/src/subsonic/index.js
index 3579619aa..b311d5e14 100644
--- a/ui/src/subsonic/index.js
+++ b/ui/src/subsonic/index.js
@@ -120,6 +120,10 @@ const getTopSongs = (artist, count = 50) => {
   return httpClient(url('getTopSongs', null, { artist, count }))
 }
 
+// Requests structured lyrics for a song id, always passing enhanced=true.
+const getLyricsBySongId = (id) =>
+  httpClient(url('getLyricsBySongId', id, { enhanced: true }))
+
 const streamUrl = (id, options) => {
   return baseUrl(
     url('stream', id, {
@@ -149,4 +153,5 @@ export default {
   getArtistInfo,
   getTopSongs,
   getSimilarSongs2,
+  getLyricsBySongId,
 }
diff --git a/ui/src/subsonic/index.test.js b/ui/src/subsonic/index.test.js
index a750694f4..6910fdc8d 100644
--- a/ui/src/subsonic/index.test.js
+++ b/ui/src/subsonic/index.test.js
@@ -1,7 +1,12 @@
 import { vi } from 'vitest'
-import config from '../config'
+import { COVER_ART_SIZE } from '../consts'
+import { httpClient } from '../dataProvider'
 import subsonic from './index'
 
+vi.mock('../dataProvider', () => ({
+  httpClient: vi.fn(() => Promise.resolve({})),
+}))
+
 describe('getCoverArtUrl', () => {
   beforeEach(() => {
     // Mock window.location
@@ -31,11 +36,7 @@ describe('getCoverArtUrl', () => {
       updatedAt: '2023-01-01T00:00:00Z',
     }
 
-    const url = subsonic.getCoverArtUrl(
-      playlistRecord,
-      config.uiCoverArtSize,
-      true,
-    )
+    const url = subsonic.getCoverArtUrl(playlistRecord, COVER_ART_SIZE, true)
 
     expect(url).toContain('pl-playlist-123')
     expect(url).toContain('size=600')
@@ -49,11 +50,7 @@ describe('getCoverArtUrl', () => {
       sync: true,
     }
 
-    const url = subsonic.getCoverArtUrl(
-      playlistRecord,
-      config.uiCoverArtSize,
-      true,
-    )
+    const url = subsonic.getCoverArtUrl(playlistRecord, COVER_ART_SIZE, true)
 
     expect(url).toContain('pl-playlist-123')
     expect(url).toContain('size=600')
@@ -68,11 +65,7 @@ describe('getCoverArtUrl', () => {
       updatedAt: '2023-01-01T00:00:00Z',
     }
 
-    const url = subsonic.getCoverArtUrl(
-      albumRecord,
-      config.uiCoverArtSize,
-      true,
-    )
+    const url = subsonic.getCoverArtUrl(albumRecord, COVER_ART_SIZE, true)
 
     expect(url).toContain('al-album-123')
     expect(url).toContain('size=600')
@@ -86,7 +79,7 @@ describe('getCoverArtUrl', () => {
       updatedAt: '2023-01-01T00:00:00Z',
     }
 
-    const url = subsonic.getCoverArtUrl(songRecord, config.uiCoverArtSize, true)
+    const url = subsonic.getCoverArtUrl(songRecord, COVER_ART_SIZE, true)
 
     expect(url).toContain('mf-song-123')
     expect(url).toContain('size=600')
@@ -99,11 +92,7 @@ describe('getCoverArtUrl', () => {
       updatedAt: '2023-01-01T00:00:00Z',
     }
 
-    const url = subsonic.getCoverArtUrl(
-      artistRecord,
-      config.uiCoverArtSize,
-      true,
-    )
+    const url = subsonic.getCoverArtUrl(artistRecord, COVER_ART_SIZE, true)
 
     expect(url).toContain('ar-artist-123')
     expect(url).toContain('size=600')
@@ -194,3 +183,30 @@ describe('getAvatarUrl', () => {
     expect(url).toContain('username=john')
   })
 })
+
+describe('getLyricsBySongId', () => {
+  beforeEach(() => {
+    vi.clearAllMocks()
+    const localStorageMock = {
+      getItem: vi.fn((key) => {
+        const values = {
+          username: 'testuser',
+          'subsonic-token': 'testtoken',
+          'subsonic-salt': 'testsalt',
+        }
+        return values[key] || null
+      }),
+    }
+    Object.defineProperty(window, 'localStorage', { value: localStorageMock })
+  })
+
+  it('calls the getLyricsBySongId endpoint with enhanced=true', async () => {
+    await subsonic.getLyricsBySongId('song-1')
+
+    expect(httpClient).toHaveBeenCalledTimes(1)
+    const calledUrl = httpClient.mock.calls[0][0]
+    expect(calledUrl).toContain('/rest/getLyricsBySongId?')
+    expect(calledUrl).toContain('id=song-1')
+    expect(calledUrl).toContain('enhanced=true')
+  })
+})

From 2f6f0bca797843a4fad08a43c7e1a0d3810b1555 Mon Sep 17 00:00:00 2001
From: ranokay <github@ranokay.com>
Date: Sun, 22 Feb 2026 14:53:27 +0200
Subject: [PATCH 02/14] test: rewrite TTML tests to Ginkgo/Gomega framework

- Convert ttml_test.go from testing.T/Fatalf to Ginkgo Describe/It with Gomega matchers
- Remove sources_ttml_test.go (duplicate tests already exist in sources_test.go using Ginkgo)
- All 26 lyrics specs pass
---
 core/lyrics/sources_ttml_test.go |  92 -------
 core/lyrics/ttml_test.go         | 425 ++++++++++++-------------------
 2 files changed, 165 insertions(+), 352 deletions(-)
 delete mode 100644 core/lyrics/sources_ttml_test.go

diff --git a/core/lyrics/sources_ttml_test.go b/core/lyrics/sources_ttml_test.go
deleted file mode 100644
index 217bf7b36..000000000
--- a/core/lyrics/sources_ttml_test.go
+++ /dev/null
@@ -1,92 +0,0 @@
-package lyrics
-
-import (
-	"context"
-	"os"
-	"path/filepath"
-	"testing"
-
-	"github.com/navidrome/navidrome/model"
-)
-
-func TestFromExternalFileTTML(t *testing.T) {
-	ctx := context.Background()
-	mf := model.MediaFile{Path: fixturePath("test.mp3")}
-
-	lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
-	if err != nil {
-		t.Fatalf("fromExternalFile returned error: %v", err)
-	}
-	if len(lyrics) != 2 {
-		t.Fatalf("expected 2 lyric tracks, got %d", len(lyrics))
-	}
-	if lyrics[0].Lang != "eng" {
-		t.Fatalf("expected first language 'eng', got %q", lyrics[0].Lang)
-	}
-	if len(lyrics[0].Line) != 2 {
-		t.Fatalf("expected 2 english lines, got %d", len(lyrics[0].Line))
-	}
-	if lyrics[0].Line[0].Start == nil || *lyrics[0].Line[0].Start != 18800 {
-		t.Fatalf("expected first english line start to be 18800, got %v", lyrics[0].Line[0].Start)
-	}
-}
-
-func TestFromExternalFileTTMLWithUTF8BOM(t *testing.T) {
-	ctx := context.Background()
-	mf := model.MediaFile{Path: fixturePath("bom-test.ttml")}
-
-	lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
-	if err != nil {
-		t.Fatalf("fromExternalFile returned error: %v", err)
-	}
-	if len(lyrics) != 1 {
-		t.Fatalf("expected 1 lyric track, got %d", len(lyrics))
-	}
-	if !lyrics[0].Synced {
-		t.Fatal("expected BOM TTML lyrics to be synced")
-	}
-	if len(lyrics[0].Line) != 1 {
-		t.Fatalf("expected 1 lyric line, got %d", len(lyrics[0].Line))
-	}
-	if lyrics[0].Line[0].Start == nil || *lyrics[0].Line[0].Start != 0 {
-		t.Fatalf("expected first line start 0, got %v", lyrics[0].Line[0].Start)
-	}
-}
-
-func TestFromExternalFileTTMLUTF16(t *testing.T) {
-	ctx := context.Background()
-	mf := model.MediaFile{Path: fixturePath("bom-utf16-test.ttml")}
-
-	lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
-	if err != nil {
-		t.Fatalf("fromExternalFile returned error: %v", err)
-	}
-	if len(lyrics) != 1 {
-		t.Fatalf("expected 1 lyric track, got %d", len(lyrics))
-	}
-	if !lyrics[0].Synced {
-		t.Fatal("expected UTF16 TTML lyrics to be synced")
-	}
-	if len(lyrics[0].Line) != 2 {
-		t.Fatalf("expected 2 lyric lines, got %d", len(lyrics[0].Line))
-	}
-	if lyrics[0].Line[0].Start == nil || *lyrics[0].Line[0].Start != 18800 {
-		t.Fatalf("expected first line start 18800, got %v", lyrics[0].Line[0].Start)
-	}
-	if lyrics[0].Line[1].Start == nil || *lyrics[0].Line[1].Start != 22801 {
-		t.Fatalf("expected second line start 22801, got %v", lyrics[0].Line[1].Start)
-	}
-}
-
-func fixturePath(name string) string {
-	candidates := []string{
-		filepath.Join("tests", "fixtures", name),
-		filepath.Join("..", "..", "tests", "fixtures", name),
-	}
-	for _, candidate := range candidates {
-		if _, err := os.Stat(candidate); err == nil {
-			return candidate
-		}
-	}
-	return filepath.Join("tests", "fixtures", name)
-}
diff --git a/core/lyrics/ttml_test.go b/core/lyrics/ttml_test.go
index 12270c27d..c8596243b 100644
--- a/core/lyrics/ttml_test.go
+++ b/core/lyrics/ttml_test.go
@@ -1,13 +1,16 @@
 package lyrics
 
 import (
-	"testing"
-
 	"github.com/navidrome/navidrome/model"
+	"github.com/navidrome/navidrome/utils/gg"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
 )
 
-func TestParseTTML_MultiLanguageAndTiming(t *testing.T) {
-	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+var _ = Describe("parseTTML", func() {
+	Describe("Multi-language and timing", func() {
+		It("should parse multiple language divs with inherited offsets and frame/tick timing", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttp="http://www.w3.org/ns/ttml#parameter" ttp:frameRate="30" ttp:subFrameRate="2" ttp:tickRate="10">
   <body>
     <div xml:lang="eng" begin="1s">
@@ -20,33 +23,30 @@ func TestParseTTML_MultiLanguageAndTiming(t *testing.T) {
   </body>
 </tt>`)
 
-	list, err := parseTTML(content)
-	if err != nil {
-		t.Fatalf("parseTTML returned error: %v", err)
-	}
-	if len(list) != 2 {
-		t.Fatalf("expected 2 lyric tracks, got %d", len(list))
-	}
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(2))
 
-	eng := list[0]
-	if eng.Lang != "eng" {
-		t.Fatalf("expected first track language 'eng', got %q", eng.Lang)
-	}
-	if !eng.Synced {
-		t.Fatal("expected first track to be synced")
-	}
-	assertTimedLine(t, eng.Line[0], 3000, "Line one")
-	assertTimedLine(t, eng.Line[1], 4517, "Line two\nwith break")
+			By("parsing the English track")
+			eng := list[0]
+			Expect(eng.Lang).To(Equal("eng"))
+			Expect(eng.Synced).To(BeTrue())
+			Expect(eng.Line[0].Start).To(Equal(gg.P(int64(3000))))
+			Expect(eng.Line[0].Value).To(Equal("Line one"))
+			Expect(eng.Line[1].Start).To(Equal(gg.P(int64(4517))))
+			Expect(eng.Line[1].Value).To(Equal("Line two\nwith break"))
 
-	por := list[1]
-	if por.Lang != "por" {
-		t.Fatalf("expected second track language 'por', got %q", por.Lang)
-	}
-	assertTimedLine(t, por.Line[0], 4500, "Linha")
-}
+			By("parsing the Portuguese track")
+			por := list[1]
+			Expect(por.Lang).To(Equal("por"))
+			Expect(por.Line[0].Start).To(Equal(gg.P(int64(4500))))
+			Expect(por.Line[0].Value).To(Equal("Linha"))
+		})
+	})
 
-func TestParseTTML_UnsupportedCueSkipped(t *testing.T) {
-	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+	Describe("Unsupported cue handling", func() {
+		It("should skip wallclock cues and keep valid ones", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml">
   <body xml:lang="eng">
     <div>
@@ -56,21 +56,18 @@ func TestParseTTML_UnsupportedCueSkipped(t *testing.T) {
   </body>
 </tt>`)
 
-	list, err := parseTTML(content)
-	if err != nil {
-		t.Fatalf("parseTTML returned error: %v", err)
-	}
-	if len(list) != 1 {
-		t.Fatalf("expected 1 lyric track, got %d", len(list))
-	}
-	if len(list[0].Line) != 1 {
-		t.Fatalf("expected 1 line in lyric track, got %d", len(list[0].Line))
-	}
-	assertTimedLine(t, list[0].Line[0], 1000, "Keep me")
-}
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(1))
+			Expect(list[0].Line).To(HaveLen(1))
+			Expect(list[0].Line[0].Start).To(Equal(gg.P(int64(1000))))
+			Expect(list[0].Line[0].Value).To(Equal("Keep me"))
+		})
+	})
 
-func TestParseTTML_BeginEndDurWithInheritance(t *testing.T) {
-	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+	Describe("Begin/End/Dur with inheritance", func() {
+		It("should correctly accumulate nested timing from body, div, and p elements", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml">
   <body xml:lang="eng" begin="10s">
     <div begin="5s" dur="8s">
@@ -80,25 +77,21 @@ func TestParseTTML_BeginEndDurWithInheritance(t *testing.T) {
   </body>
 </tt>`)
 
-	list, err := parseTTML(content)
-	if err != nil {
-		t.Fatalf("parseTTML returned error: %v", err)
-	}
-	if len(list) != 1 {
-		t.Fatalf("expected 1 lyric track, got %d", len(list))
-	}
-	if list[0].Lang != "eng" {
-		t.Fatalf("expected language 'eng', got %q", list[0].Lang)
-	}
-	if len(list[0].Line) != 2 {
-		t.Fatalf("expected 2 lines, got %d", len(list[0].Line))
-	}
-	assertTimedLine(t, list[0].Line[0], 16000, "First line")
-	assertTimedLine(t, list[0].Line[1], 18000, "Second line")
-}
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(1))
+			Expect(list[0].Lang).To(Equal("eng"))
+			Expect(list[0].Line).To(HaveLen(2))
+			Expect(list[0].Line[0].Start).To(Equal(gg.P(int64(16000))))
+			Expect(list[0].Line[0].Value).To(Equal("First line"))
+			Expect(list[0].Line[1].Start).To(Equal(gg.P(int64(18000))))
+			Expect(list[0].Line[1].Value).To(Equal("Second line"))
+		})
+	})
 
-func TestParseTTML_NonStandardBareSecondOffsets(t *testing.T) {
-	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+	Describe("Non-standard bare second offsets", func() {
+		It("should parse bare decimal numbers as seconds", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml">
   <body xml:lang="eng" begin="10">
     <div>
@@ -108,22 +101,20 @@ func TestParseTTML_NonStandardBareSecondOffsets(t *testing.T) {
   </body>
 </tt>`)
 
-	list, err := parseTTML(content)
-	if err != nil {
-		t.Fatalf("parseTTML returned error: %v", err)
-	}
-	if len(list) != 1 {
-		t.Fatalf("expected 1 lyric track, got %d", len(list))
-	}
-	if len(list[0].Line) != 2 {
-		t.Fatalf("expected 2 lines, got %d", len(list[0].Line))
-	}
-	assertTimedLine(t, list[0].Line[0], 10170, "First line")
-	assertTimedLine(t, list[0].Line[1], 13710, "Second line")
-}
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(1))
+			Expect(list[0].Line).To(HaveLen(2))
+			Expect(list[0].Line[0].Start).To(Equal(gg.P(int64(10170))))
+			Expect(list[0].Line[0].Value).To(Equal("First line"))
+			Expect(list[0].Line[1].Start).To(Equal(gg.P(int64(13710))))
+			Expect(list[0].Line[1].Value).To(Equal("Second line"))
+		})
+	})
 
-func TestParseTTML_WordTimingTokens(t *testing.T) {
-	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+	Describe("Word timing tokens", func() {
+		It("should extract timed tokens from spans including background role", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttm="http://www.w3.org/ns/ttml#metadata">
   <body xml:lang="eng">
     <div>
@@ -135,33 +126,26 @@ func TestParseTTML_WordTimingTokens(t *testing.T) {
   </body>
 </tt>`)
 
-	list, err := parseTTML(content)
-	if err != nil {
-		t.Fatalf("parseTTML returned error: %v", err)
-	}
-	if len(list) != 1 {
-		t.Fatalf("expected 1 lyric track, got %d", len(list))
-	}
-	if len(list[0].Line) != 1 {
-		t.Fatalf("expected 1 line, got %d", len(list[0].Line))
-	}
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(1))
+			Expect(list[0].Line).To(HaveLen(1))
 
-	line := list[0].Line[0]
-	assertTimedLine(t, line, 1000, "Hello\necho")
-	if line.End == nil || *line.End != 3000 {
-		t.Fatalf("expected line end 3000, got %v", line.End)
-	}
-	if len(line.Token) != 3 {
-		t.Fatalf("expected 3 timed tokens, got %d", len(line.Token))
-	}
+			line := list[0].Line[0]
+			Expect(line.Start).To(Equal(gg.P(int64(1000))))
+			Expect(line.Value).To(Equal("Hello\necho"))
+			Expect(line.End).To(Equal(gg.P(int64(3000))))
+			Expect(line.Token).To(HaveLen(3))
 
-	assertToken(t, line.Token[0], 1000, 1400, "He", "")
-	assertToken(t, line.Token[1], 1400, 1800, "llo", "")
-	assertToken(t, line.Token[2], 2000, 2500, "echo", "x-bg")
-}
+			Expect(line.Token[0]).To(Equal(model.Token{Start: gg.P(int64(1000)), End: gg.P(int64(1400)), Value: "He"}))
+			Expect(line.Token[1]).To(Equal(model.Token{Start: gg.P(int64(1400)), End: gg.P(int64(1800)), Value: "llo"}))
+			Expect(line.Token[2]).To(Equal(model.Token{Start: gg.P(int64(2000)), End: gg.P(int64(2500)), Value: "echo", Role: "x-bg"}))
+		})
+	})
 
-func TestParseTTML_AmbiguousDecimalTimingPrefersAbsoluteWhenInsideParentWindow(t *testing.T) {
-	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+	Describe("Ambiguous decimal timing", func() {
+		It("should prefer absolute timing when values fall inside parent window", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml">
   <body xml:lang="eng">
     <div begin="37.870" end="45.570">
@@ -173,28 +157,24 @@ func TestParseTTML_AmbiguousDecimalTimingPrefersAbsoluteWhenInsideParentWindow(t
   </body>
 </tt>`)
 
-	list, err := parseTTML(content)
-	if err != nil {
-		t.Fatalf("parseTTML returned error: %v", err)
-	}
-	if len(list) != 1 || len(list[0].Line) != 1 {
-		t.Fatalf("expected one parsed lyric line, got %#v", list)
-	}
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(1))
+			Expect(list[0].Line).To(HaveLen(1))
 
-	line := list[0].Line[0]
-	assertTimedLine(t, line, 43444, "go\ngo")
-	if line.End == nil || *line.End != 45570 {
-		t.Fatalf("expected line end 45570, got %v", line.End)
-	}
-	if len(line.Token) != 2 {
-		t.Fatalf("expected 2 timed tokens, got %d", len(line.Token))
-	}
-	assertToken(t, line.Token[0], 43444, 43716, "go", "")
-	assertToken(t, line.Token[1], 43716, 43887, "go", "")
-}
+			line := list[0].Line[0]
+			Expect(line.Start).To(Equal(gg.P(int64(43444))))
+			Expect(line.Value).To(Equal("go\ngo"))
+			Expect(line.End).To(Equal(gg.P(int64(45570))))
+			Expect(line.Token).To(HaveLen(2))
+			Expect(line.Token[0]).To(Equal(model.Token{Start: gg.P(int64(43444)), End: gg.P(int64(43716)), Value: "go"}))
+			Expect(line.Token[1]).To(Equal(model.Token{Start: gg.P(int64(43716)), End: gg.P(int64(43887)), Value: "go"}))
+		})
+	})
 
-func TestParseTTML_UnsyncedFallback(t *testing.T) {
-	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+	Describe("Unsynced fallback", func() {
+		It("should return unsynced lyrics when no timing is present", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml">
   <body>
     <div>
@@ -203,32 +183,20 @@ func TestParseTTML_UnsyncedFallback(t *testing.T) {
   </body>
 </tt>`)
 
-	list, err := parseTTML(content)
-	if err != nil {
-		t.Fatalf("parseTTML returned error: %v", err)
-	}
-	if len(list) != 1 {
-		t.Fatalf("expected 1 lyric track, got %d", len(list))
-	}
-	if list[0].Lang != "xxx" {
-		t.Fatalf("expected default language 'xxx', got %q", list[0].Lang)
-	}
-	if list[0].Synced {
-		t.Fatal("expected lyric track to be unsynced")
-	}
-	if len(list[0].Line) != 1 {
-		t.Fatalf("expected 1 line, got %d", len(list[0].Line))
-	}
-	if list[0].Line[0].Start != nil {
-		t.Fatalf("expected line start to be nil, got %v", *list[0].Line[0].Start)
-	}
-	if list[0].Line[0].Value != "No timing here" {
-		t.Fatalf("expected line value %q, got %q", "No timing here", list[0].Line[0].Value)
-	}
-}
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(1))
+			Expect(list[0].Lang).To(Equal("xxx"))
+			Expect(list[0].Synced).To(BeFalse())
+			Expect(list[0].Line).To(HaveLen(1))
+			Expect(list[0].Line[0].Start).To(BeNil())
+			Expect(list[0].Line[0].Value).To(Equal("No timing here"))
+		})
+	})
 
-func TestParseTTML_MetadataTracksByKey(t *testing.T) {
-	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+	Describe("Metadata tracks", func() {
+		It("should produce main, translation, and pronunciation tracks from iTunesMetadata", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml" xmlns:itunes="http://music.apple.com/lyric-ttml-internal">
   <head>
     <metadata>
@@ -255,63 +223,42 @@ func TestParseTTML_MetadataTracksByKey(t *testing.T) {
   </body>
 </tt>`)
 
-	list, err := parseTTML(content)
-	if err != nil {
-		t.Fatalf("parseTTML returned error: %v", err)
-	}
-	if len(list) != 3 {
-		t.Fatalf("expected 3 lyric tracks, got %d", len(list))
-	}
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(3))
 
-	main := list[0]
-	if main.Kind != "main" {
-		t.Fatalf("expected main track kind %q, got %q", "main", main.Kind)
-	}
-	if main.Lang != "ja" {
-		t.Fatalf("expected main track language %q, got %q", "ja", main.Lang)
-	}
-	if len(main.Line) != 2 {
-		t.Fatalf("expected 2 lines in main track, got %d", len(main.Line))
-	}
+			By("checking the main track")
+			main := list[0]
+			Expect(main.Kind).To(Equal("main"))
+			Expect(main.Lang).To(Equal("ja"))
+			Expect(main.Line).To(HaveLen(2))
 
-	translation := list[1]
-	if translation.Kind != "translation" {
-		t.Fatalf("expected translation kind %q, got %q", "translation", translation.Kind)
-	}
-	if translation.Lang != "es" {
-		t.Fatalf("expected translation language %q, got %q", "es", translation.Lang)
-	}
-	if len(translation.Line) != 1 {
-		t.Fatalf("expected 1 translation line, got %d", len(translation.Line))
-	}
-	assertTimedLine(t, translation.Line[0], 1000, "Hola")
-	if translation.Line[0].End == nil || *translation.Line[0].End != 1500 {
-		t.Fatalf("expected translation line end %d, got %v", 1500, translation.Line[0].End)
-	}
+			By("checking the translation track")
+			translation := list[1]
+			Expect(translation.Kind).To(Equal("translation"))
+			Expect(translation.Lang).To(Equal("es"))
+			Expect(translation.Line).To(HaveLen(1))
+			Expect(translation.Line[0].Start).To(Equal(gg.P(int64(1000))))
+			Expect(translation.Line[0].Value).To(Equal("Hola"))
+			Expect(translation.Line[0].End).To(Equal(gg.P(int64(1500))))
 
-	pronunciation := list[2]
-	if pronunciation.Kind != "pronunciation" {
-		t.Fatalf("expected pronunciation kind %q, got %q", "pronunciation", pronunciation.Kind)
-	}
-	if pronunciation.Lang != "ja-latn" {
-		t.Fatalf("expected pronunciation language %q, got %q", "ja-latn", pronunciation.Lang)
-	}
-	if len(pronunciation.Line) != 1 {
-		t.Fatalf("expected 1 pronunciation line, got %d", len(pronunciation.Line))
-	}
-	assertTimedLine(t, pronunciation.Line[0], 2000, "konni")
-	if pronunciation.Line[0].End == nil || *pronunciation.Line[0].End != 2600 {
-		t.Fatalf("expected pronunciation line end %d, got %v", 2600, pronunciation.Line[0].End)
-	}
-	if len(pronunciation.Line[0].Token) != 2 {
-		t.Fatalf("expected 2 pronunciation tokens, got %d", len(pronunciation.Line[0].Token))
-	}
-	assertToken(t, pronunciation.Line[0].Token[0], 2000, 2300, "ko", "")
-	assertToken(t, pronunciation.Line[0].Token[1], 2300, 2600, "nni", "")
-}
+			By("checking the pronunciation track")
+			pronunciation := list[2]
+			Expect(pronunciation.Kind).To(Equal("pronunciation"))
+			Expect(pronunciation.Lang).To(Equal("ja-latn"))
+			Expect(pronunciation.Line).To(HaveLen(1))
+			Expect(pronunciation.Line[0].Start).To(Equal(gg.P(int64(2000))))
+			Expect(pronunciation.Line[0].Value).To(Equal("konni"))
+			Expect(pronunciation.Line[0].End).To(Equal(gg.P(int64(2600))))
+			Expect(pronunciation.Line[0].Token).To(HaveLen(2))
+			Expect(pronunciation.Line[0].Token[0]).To(Equal(model.Token{Start: gg.P(int64(2000)), End: gg.P(int64(2300)), Value: "ko"}))
+			Expect(pronunciation.Line[0].Token[1]).To(Equal(model.Token{Start: gg.P(int64(2300)), End: gg.P(int64(2600)), Value: "nni"}))
+		})
+	})
 
-func TestParseTTML_PronunciationBareDecimalEndTimes(t *testing.T) {
-	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+	Describe("Pronunciation with bare decimal end times", func() {
+		It("should correctly parse bare decimal times in transliteration spans", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml" xmlns:itunes="http://music.apple.com/lyric-ttml-internal">
   <head>
     <metadata>
@@ -331,68 +278,26 @@ func TestParseTTML_PronunciationBareDecimalEndTimes(t *testing.T) {
   </body>
 </tt>`)
 
-	list, err := parseTTML(content)
-	if err != nil {
-		t.Fatalf("parseTTML returned error: %v", err)
-	}
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
 
-	var pronunciation *model.Lyrics
-	for i := range list {
-		if list[i].Kind == "pronunciation" {
-			pronunciation = &list[i]
-			break
-		}
-	}
-	if pronunciation == nil {
-		t.Fatal("expected a pronunciation track")
-	}
-	if len(pronunciation.Line) != 1 {
-		t.Fatalf("expected 1 pronunciation line, got %d", len(pronunciation.Line))
-	}
+			var pronunciation *model.Lyrics
+			for i := range list {
+				if list[i].Kind == "pronunciation" {
+					pronunciation = &list[i]
+					break
+				}
+			}
+			Expect(pronunciation).ToNot(BeNil())
+			Expect(pronunciation.Line).To(HaveLen(1))
 
-	line := pronunciation.Line[0]
-	assertTimedLine(t, line, 2747, "I woke up")
-	if len(line.Token) != 3 {
-		t.Fatalf("expected 3 tokens, got %d", len(line.Token))
-	}
-	assertToken(t, line.Token[0], 2747, 3018, "I", "")
-	assertToken(t, line.Token[1], 3018, 3179, "woke", "")
-	assertToken(t, line.Token[2], 3179, 3582, "up", "")
-}
-
-func assertTimedLine(t *testing.T, line model.Line, expectedStart int64, expectedValue string) {
-	t.Helper()
-
-	if line.Start == nil {
-		t.Fatal("expected line start to be set, got nil")
-	}
-	if *line.Start != expectedStart {
-		t.Fatalf("expected line start %d, got %d", expectedStart, *line.Start)
-	}
-	if line.Value != expectedValue {
-		t.Fatalf("expected line value %q, got %q", expectedValue, line.Value)
-	}
-}
-
-func assertToken(t *testing.T, token model.Token, expectedStart int64, expectedEnd int64, expectedValue string, expectedRole string) {
-	t.Helper()
-
-	if token.Start == nil {
-		t.Fatal("expected token start to be set, got nil")
-	}
-	if *token.Start != expectedStart {
-		t.Fatalf("expected token start %d, got %d", expectedStart, *token.Start)
-	}
-	if token.End == nil {
-		t.Fatal("expected token end to be set, got nil")
-	}
-	if *token.End != expectedEnd {
-		t.Fatalf("expected token end %d, got %d", expectedEnd, *token.End)
-	}
-	if token.Value != expectedValue {
-		t.Fatalf("expected token value %q, got %q", expectedValue, token.Value)
-	}
-	if token.Role != expectedRole {
-		t.Fatalf("expected token role %q, got %q", expectedRole, token.Role)
-	}
-}
+			line := pronunciation.Line[0]
+			Expect(line.Start).To(Equal(gg.P(int64(2747))))
+			Expect(line.Value).To(Equal("I woke up"))
+			Expect(line.Token).To(HaveLen(3))
+			Expect(line.Token[0]).To(Equal(model.Token{Start: gg.P(int64(2747)), End: gg.P(int64(3018)), Value: "I"}))
+			Expect(line.Token[1]).To(Equal(model.Token{Start: gg.P(int64(3018)), End: gg.P(int64(3179)), Value: "woke"}))
+			Expect(line.Token[2]).To(Equal(model.Token{Start: gg.P(int64(3179)), End: gg.P(int64(3582)), Value: "up"}))
+		})
+	})
+})

From 1d78373b778008f8e994dbe85cbd53d124696d27 Mon Sep 17 00:00:00 2001
From: ranokay <github@ranokay.com>
Date: Sun, 22 Feb 2026 21:05:14 +0200
Subject: [PATCH 03/14] refactor: align with OpenSubsonic spec feedback

- Rename token/tokenLine to cue/cueLine across Go backend and JS frontend
- Move role from individual cue to cueLine level (server pre-splits by role)
- Add enhanced query parameter to getLyricsBySongId for backward compatibility
- Add enhanced=true to UI API client so translations/pronunciations load
- Update all Go and JS tests to match new naming and structure
---
 core/lyrics/ttml.go                     |  28 +++----
 core/lyrics/ttml_test.go                |  28 +++----
 model/lyrics.go                         |  10 +--
 server/subsonic/helpers.go              |  70 ++++++++++------
 server/subsonic/media_retrieval.go      |   4 +-
 server/subsonic/media_retrieval_test.go |  70 ++++++++--------
 server/subsonic/responses/responses.go  |  32 ++++----
 ui/src/audioplayer/lyrics.js            | 101 +++++++++++++++---------
 ui/src/audioplayer/lyrics.test.js       |  38 +++++----
 ui/src/subsonic/index.js                |   4 +-
 10 files changed, 226 insertions(+), 159 deletions(-)

diff --git a/core/lyrics/ttml.go b/core/lyrics/ttml.go
index 3aae53aa0..a0bdcac5a 100644
--- a/core/lyrics/ttml.go
+++ b/core/lyrics/ttml.go
@@ -162,7 +162,7 @@ func (p *ttmlParser) parseElement(start xml.StartElement, parent ttmlTimingConte
 			parsedLine.End = &endMs
 		}
 		if len(tokens) > 0 {
-			parsedLine.Token = tokens
+			parsedLine.Cue = tokens
 		}
 		parsedLine = hydrateLineTimingFromTokens(parsedLine)
 
@@ -261,20 +261,20 @@ func (p *ttmlParser) parseMetadataText(start xml.StartElement, parent ttmlTiming
 		line.End = &endMs
 	}
 	if len(tokens) > 0 {
-		line.Token = tokens
+		line.Cue = tokens
 	}
 	line = hydrateLineTimingFromTokens(line)
 
-	if line.Value == "" && len(line.Token) == 0 {
+	if line.Value == "" && len(line.Cue) == 0 {
 		return ttmlMetadataEntry{}, false, nil
 	}
 
 	return ttmlMetadataEntry{key: forKey, line: line}, true, nil
 }
 
-func (p *ttmlParser) parseParagraph(parent ttmlTimingContext) (string, []model.Token, error) {
+func (p *ttmlParser) parseParagraph(parent ttmlTimingContext) (string, []model.Cue, error) {
 	var text strings.Builder
-	var tokens []model.Token
+	var tokens []model.Cue
 
 	for {
 		token, err := p.decoder.Token()
@@ -300,7 +300,7 @@ func (p *ttmlParser) parseParagraph(parent ttmlTimingContext) (string, []model.T
 	}
 }
 
-func (p *ttmlParser) parseInlineElement(start xml.StartElement, parent ttmlTimingContext) (string, []model.Token, error) {
+func (p *ttmlParser) parseInlineElement(start xml.StartElement, parent ttmlTimingContext) (string, []model.Cue, error) {
 	local := strings.ToLower(start.Name.Local)
 	if local == "br" {
 		return "\n", nil, nil
@@ -313,7 +313,7 @@ func (p *ttmlParser) parseInlineElement(start xml.StartElement, parent ttmlTimin
 	hasOwnTiming := hasBegin || hasEnd || hasDur
 
 	var text strings.Builder
-	var tokens []model.Token
+	var tokens []model.Cue
 
 	for {
 		token, err := p.decoder.Token()
@@ -337,7 +337,7 @@ func (p *ttmlParser) parseInlineElement(start xml.StartElement, parent ttmlTimin
 			value := text.String()
 			tokenText := sanitizeTTMLText(value)
 			if local == "span" && hasOwnTiming && !ctx.invalid && tokenText != "" && len(tokens) == 0 {
-				parsedToken := model.Token{
+				parsedToken := model.Cue{
 					Value: tokenText,
 					Role:  ctx.role,
 				}
@@ -413,7 +413,7 @@ func (p *ttmlParser) buildMetadataLyrics(kind string, langOrder []string, entrie
 			}
 			line = hydrateLineTimingFromTokens(line)
 
-			if line.Value == "" && len(line.Token) == 0 {
+			if line.Value == "" && len(line.Cue) == 0 {
 				continue
 			}
 
@@ -830,8 +830,8 @@ func linesAreSynced(lines []model.Line) bool {
 		if lines[i].Start != nil {
 			return true
 		}
-		for j := range lines[i].Token {
-			if lines[i].Token[j].Start != nil {
+		for j := range lines[i].Cue {
+			if lines[i].Cue[j].Start != nil {
 				return true
 			}
 		}
@@ -840,14 +840,14 @@ func linesAreSynced(lines []model.Line) bool {
 }
 
 func hydrateLineTimingFromTokens(line model.Line) model.Line {
-	if len(line.Token) == 0 {
+	if len(line.Cue) == 0 {
 		return line
 	}
 
 	var earliestStart *int64
 	var latestEnd *int64
-	for i := range line.Token {
-		token := line.Token[i]
+	for i := range line.Cue {
+		token := line.Cue[i]
 		if token.Start != nil {
 			if earliestStart == nil || *token.Start < *earliestStart {
 				v := *token.Start
diff --git a/core/lyrics/ttml_test.go b/core/lyrics/ttml_test.go
index c8596243b..8ec16f679 100644
--- a/core/lyrics/ttml_test.go
+++ b/core/lyrics/ttml_test.go
@@ -135,11 +135,11 @@ var _ = Describe("parseTTML", func() {
 			Expect(line.Start).To(Equal(gg.P(int64(1000))))
 			Expect(line.Value).To(Equal("Hello\necho"))
 			Expect(line.End).To(Equal(gg.P(int64(3000))))
-			Expect(line.Token).To(HaveLen(3))
+			Expect(line.Cue).To(HaveLen(3))
 
-			Expect(line.Token[0]).To(Equal(model.Token{Start: gg.P(int64(1000)), End: gg.P(int64(1400)), Value: "He"}))
-			Expect(line.Token[1]).To(Equal(model.Token{Start: gg.P(int64(1400)), End: gg.P(int64(1800)), Value: "llo"}))
-			Expect(line.Token[2]).To(Equal(model.Token{Start: gg.P(int64(2000)), End: gg.P(int64(2500)), Value: "echo", Role: "x-bg"}))
+			Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(1000)), End: gg.P(int64(1400)), Value: "He"}))
+			Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(1400)), End: gg.P(int64(1800)), Value: "llo"}))
+			Expect(line.Cue[2]).To(Equal(model.Cue{Start: gg.P(int64(2000)), End: gg.P(int64(2500)), Value: "echo", Role: "x-bg"}))
 		})
 	})
 
@@ -166,9 +166,9 @@ var _ = Describe("parseTTML", func() {
 			Expect(line.Start).To(Equal(gg.P(int64(43444))))
 			Expect(line.Value).To(Equal("go\ngo"))
 			Expect(line.End).To(Equal(gg.P(int64(45570))))
-			Expect(line.Token).To(HaveLen(2))
-			Expect(line.Token[0]).To(Equal(model.Token{Start: gg.P(int64(43444)), End: gg.P(int64(43716)), Value: "go"}))
-			Expect(line.Token[1]).To(Equal(model.Token{Start: gg.P(int64(43716)), End: gg.P(int64(43887)), Value: "go"}))
+			Expect(line.Cue).To(HaveLen(2))
+			Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(43444)), End: gg.P(int64(43716)), Value: "go"}))
+			Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(43716)), End: gg.P(int64(43887)), Value: "go"}))
 		})
 	})
 
@@ -250,9 +250,9 @@ var _ = Describe("parseTTML", func() {
 			Expect(pronunciation.Line[0].Start).To(Equal(gg.P(int64(2000))))
 			Expect(pronunciation.Line[0].Value).To(Equal("konni"))
 			Expect(pronunciation.Line[0].End).To(Equal(gg.P(int64(2600))))
-			Expect(pronunciation.Line[0].Token).To(HaveLen(2))
-			Expect(pronunciation.Line[0].Token[0]).To(Equal(model.Token{Start: gg.P(int64(2000)), End: gg.P(int64(2300)), Value: "ko"}))
-			Expect(pronunciation.Line[0].Token[1]).To(Equal(model.Token{Start: gg.P(int64(2300)), End: gg.P(int64(2600)), Value: "nni"}))
+			Expect(pronunciation.Line[0].Cue).To(HaveLen(2))
+			Expect(pronunciation.Line[0].Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(2000)), End: gg.P(int64(2300)), Value: "ko"}))
+			Expect(pronunciation.Line[0].Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(2300)), End: gg.P(int64(2600)), Value: "nni"}))
 		})
 	})
 
@@ -294,10 +294,10 @@ var _ = Describe("parseTTML", func() {
 			line := pronunciation.Line[0]
 			Expect(line.Start).To(Equal(gg.P(int64(2747))))
 			Expect(line.Value).To(Equal("I woke up"))
-			Expect(line.Token).To(HaveLen(3))
-			Expect(line.Token[0]).To(Equal(model.Token{Start: gg.P(int64(2747)), End: gg.P(int64(3018)), Value: "I"}))
-			Expect(line.Token[1]).To(Equal(model.Token{Start: gg.P(int64(3018)), End: gg.P(int64(3179)), Value: "woke"}))
-			Expect(line.Token[2]).To(Equal(model.Token{Start: gg.P(int64(3179)), End: gg.P(int64(3582)), Value: "up"}))
+			Expect(line.Cue).To(HaveLen(3))
+			Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(2747)), End: gg.P(int64(3018)), Value: "I"}))
+			Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(3018)), End: gg.P(int64(3179)), Value: "woke"}))
+			Expect(line.Cue[2]).To(Equal(model.Cue{Start: gg.P(int64(3179)), End: gg.P(int64(3582)), Value: "up"}))
 		})
 	})
 })
diff --git a/model/lyrics.go b/model/lyrics.go
index 220eec7b5..3cb1cb715 100644
--- a/model/lyrics.go
+++ b/model/lyrics.go
@@ -11,7 +11,7 @@ import (
 	"github.com/navidrome/navidrome/utils/str"
 )
 
-type Token struct {
+type Cue struct {
 	Start *int64 `structs:"start,omitempty" json:"start,omitempty"`
 	End   *int64 `structs:"end,omitempty"   json:"end,omitempty"`
 	Value string `structs:"value"           json:"value"`
@@ -19,10 +19,10 @@ type Token struct {
 }
 
 type Line struct {
-	Start *int64  `structs:"start,omitempty" json:"start,omitempty"`
-	End   *int64  `structs:"end,omitempty"   json:"end,omitempty"`
-	Value string  `structs:"value"           json:"value"`
-	Token []Token `structs:"token,omitempty" json:"token,omitempty"`
+	Start *int64 `structs:"start,omitempty" json:"start,omitempty"`
+	End   *int64 `structs:"end,omitempty"   json:"end,omitempty"`
+	Value string `structs:"value"           json:"value"`
+	Cue   []Cue  `structs:"cue,omitempty"   json:"cue,omitempty"`
 }
 
 type Lyrics struct {
diff --git a/server/subsonic/helpers.go b/server/subsonic/helpers.go
index 3b9412fb1..6922f0683 100644
--- a/server/subsonic/helpers.go
+++ b/server/subsonic/helpers.go
@@ -493,35 +493,49 @@ func mapExplicitStatus(explicitStatus string) string {
 	return ""
 }
 
-func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics) responses.StructuredLyric {
+func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced bool) responses.StructuredLyric {
 	lines := make([]responses.Line, len(lyrics.Line))
-	tokenLines := make([]responses.TokenLine, 0, len(lyrics.Line))
+	var cueLines []responses.CueLine
 
 	for i, line := range lyrics.Line {
 		lines[i] = responses.Line{
 			Start: line.Start,
 			Value: line.Value,
 		}
-		if len(line.Token) == 0 {
+		if !enhanced || len(line.Cue) == 0 {
 			continue
 		}
 
-		tokens := make([]responses.LyricToken, len(line.Token))
-		for j, token := range line.Token {
-			tokens[j] = responses.LyricToken{
-				Start: token.Start,
-				End:   token.End,
-				Value: token.Value,
-				Role:  token.Role,
+		// Group cues by role, preserving order of first appearance
+		roleOrder := make([]string, 0, 2)
+		cuesByRole := make(map[string][]responses.LyricCue)
+		for _, cue := range line.Cue {
+			role := cue.Role
+			if _, exists := cuesByRole[role]; !exists {
+				roleOrder = append(roleOrder, role)
 			}
+			cuesByRole[role] = append(cuesByRole[role], responses.LyricCue{
+				Start: cue.Start,
+				End:   cue.End,
+				Value: cue.Value,
+			})
+		}
+
+		// Create a separate CueLine for each role group
+		for _, role := range roleOrder {
+			cues := cuesByRole[role]
+			cueLine := responses.CueLine{
+				Index: int32(i),
+				Start: line.Start,
+				End:   line.End,
+				Value: line.Value,
+				Cue:   cues,
+			}
+			if role != "" {
+				cueLine.Role = role
+			}
+			cueLines = append(cueLines, cueLine)
 		}
-		tokenLines = append(tokenLines, responses.TokenLine{
-			Index: int32(i),
-			Start: line.Start,
-			End:   line.End,
-			Value: line.Value,
-			Token: tokens,
-		})
 	}
 
 	kind := strings.TrimSpace(lyrics.Kind)
@@ -535,7 +549,7 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics) responses.St
 		Kind:          kind,
 		Lang:          lyrics.Lang,
 		Line:          lines,
-		TokenLine:     tokenLines,
+		CueLine:       cueLines,
 		Offset:        lyrics.Offset,
 		Synced:        lyrics.Synced,
 	}
@@ -550,11 +564,23 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics) responses.St
 	return structured
 }
 
-func buildLyricsList(mf *model.MediaFile, lyricsList model.LyricList) *responses.LyricsList {
-	lyricList := make(responses.StructuredLyrics, len(lyricsList))
+func buildLyricsList(mf *model.MediaFile, lyricsList model.LyricList, enhanced bool) *responses.LyricsList {
+	var filtered model.LyricList
+	if enhanced {
+		filtered = lyricsList
+	} else {
+		// Without enhanced, only return "main" kind entries
+		for _, l := range lyricsList {
+			kind := strings.TrimSpace(l.Kind)
+			if kind == "" || kind == "main" {
+				filtered = append(filtered, l)
+			}
+		}
+	}
 
-	for i, lyrics := range lyricsList {
-		lyricList[i] = buildStructuredLyric(mf, lyrics)
+	lyricList := make(responses.StructuredLyrics, len(filtered))
+	for i, lyrics := range filtered {
+		lyricList[i] = buildStructuredLyric(mf, lyrics, enhanced)
 	}
 
 	res := &responses.LyricsList{
diff --git a/server/subsonic/media_retrieval.go b/server/subsonic/media_retrieval.go
index 963db067c..de88849a2 100644
--- a/server/subsonic/media_retrieval.go
+++ b/server/subsonic/media_retrieval.go
@@ -149,8 +149,10 @@ func (api *Router) GetLyricsBySongId(r *http.Request) (*responses.Subsonic, erro
 		return nil, err
 	}
 
+	enhanced, _ := req.Params(r).Bool("enhanced")
+
 	response := newResponse()
-	response.LyricsList = buildLyricsList(mediaFile, structuredLyrics)
+	response.LyricsList = buildLyricsList(mediaFile, structuredLyrics, enhanced)
 
 	return response, nil
 }
diff --git a/server/subsonic/media_retrieval_test.go b/server/subsonic/media_retrieval_test.go
index 6c52d38bc..7cf96fee5 100644
--- a/server/subsonic/media_retrieval_test.go
+++ b/server/subsonic/media_retrieval_test.go
@@ -258,36 +258,36 @@ var _ = Describe("MediaRetrievalController", func() {
 					}
 				}
 
-				Expect(realLyric.TokenLine).To(HaveLen(len(expectedLyric.TokenLine)))
-				for j, realTokenLine := range realLyric.TokenLine {
-					expectedTokenLine := expectedLyric.TokenLine[j]
-					Expect(realTokenLine.Index).To(Equal(expectedTokenLine.Index))
-					Expect(realTokenLine.Value).To(Equal(expectedTokenLine.Value))
-					if expectedTokenLine.Start == nil {
-						Expect(realTokenLine.Start).To(BeNil())
+				Expect(realLyric.CueLine).To(HaveLen(len(expectedLyric.CueLine)))
+				for j, realCueLine := range realLyric.CueLine {
+					expectedCueLine := expectedLyric.CueLine[j]
+					Expect(realCueLine.Index).To(Equal(expectedCueLine.Index))
+					Expect(realCueLine.Value).To(Equal(expectedCueLine.Value))
+					Expect(realCueLine.Role).To(Equal(expectedCueLine.Role))
+					if expectedCueLine.Start == nil {
+						Expect(realCueLine.Start).To(BeNil())
 					} else {
-						Expect(*realTokenLine.Start).To(Equal(*expectedTokenLine.Start))
+						Expect(*realCueLine.Start).To(Equal(*expectedCueLine.Start))
 					}
-					if expectedTokenLine.End == nil {
-						Expect(realTokenLine.End).To(BeNil())
+					if expectedCueLine.End == nil {
+						Expect(realCueLine.End).To(BeNil())
 					} else {
-						Expect(*realTokenLine.End).To(Equal(*expectedTokenLine.End))
+						Expect(*realCueLine.End).To(Equal(*expectedCueLine.End))
 					}
 
-					Expect(realTokenLine.Token).To(HaveLen(len(expectedTokenLine.Token)))
-					for k, realToken := range realTokenLine.Token {
-						expectedToken := expectedTokenLine.Token[k]
-						Expect(realToken.Value).To(Equal(expectedToken.Value))
-						Expect(realToken.Role).To(Equal(expectedToken.Role))
-						if expectedToken.Start == nil {
-							Expect(realToken.Start).To(BeNil())
+					Expect(realCueLine.Cue).To(HaveLen(len(expectedCueLine.Cue)))
+					for k, realCue := range realCueLine.Cue {
+						expectedCue := expectedCueLine.Cue[k]
+						Expect(realCue.Value).To(Equal(expectedCue.Value))
+						if expectedCue.Start == nil {
+							Expect(realCue.Start).To(BeNil())
 						} else {
-							Expect(*realToken.Start).To(Equal(*expectedToken.Start))
+							Expect(*realCue.Start).To(Equal(*expectedCue.Start))
 						}
-						if expectedToken.End == nil {
-							Expect(realToken.End).To(BeNil())
+						if expectedCue.End == nil {
+							Expect(realCue.End).To(BeNil())
 						} else {
-							Expect(*realToken.End).To(Equal(*expectedToken.End))
+							Expect(*realCue.End).To(Equal(*expectedCue.End))
 						}
 					}
 				}
@@ -448,7 +448,7 @@ var _ = Describe("MediaRetrievalController", func() {
 
 		It("should return metadata-linked translation and pronunciation tracks from TTML", func() {
 			conf.Server.LyricsPriority = ".ttml,embedded"
-			r := newGetRequest("id=1")
+			r := newGetRequest("id=1&enhanced=true")
 
 			mockRepo.SetData(model.MediaFiles{
 				{
@@ -513,13 +513,13 @@ var _ = Describe("MediaRetrievalController", func() {
 								Value: "konni",
 							},
 						},
-						TokenLine: []responses.TokenLine{
+						CueLine: []responses.CueLine{
 							{
 								Index: 0,
 								Start: &mainStartB,
 								End:   &tokenEndB,
 								Value: "konni",
-								Token: []responses.LyricToken{
+								Cue: []responses.LyricCue{
 									{
 										Start: &tokenStartA,
 										End:   &tokenEndA,
@@ -538,8 +538,8 @@ var _ = Describe("MediaRetrievalController", func() {
 			})
 		})
 
-		It("should return tokenized lines for songLyrics v2 clients", func() {
-			r := newGetRequest("id=1")
+		It("should return cue lines for songLyrics v2 clients with enhanced=true", func() {
+			r := newGetRequest("id=1&enhanced=true")
 
 			lineStart := int64(1000)
 			lineEnd := int64(3000)
@@ -556,7 +556,7 @@ var _ = Describe("MediaRetrievalController", func() {
 							Start: &lineStart,
 							End:   &lineEnd,
 							Value: "Hello echo",
-							Token: []model.Token{
+							Cue: []model.Cue{
 								{
 									Start: &tokenStartA,
 									End:   &tokenEndA,
@@ -599,23 +599,31 @@ var _ = Describe("MediaRetrievalController", func() {
 								Value: "Hello echo",
 							},
 						},
-						TokenLine: []responses.TokenLine{
+						CueLine: []responses.CueLine{
 							{
 								Index: 0,
 								Start: &lineStart,
 								End:   &lineEnd,
 								Value: "Hello echo",
-								Token: []responses.LyricToken{
+								Cue: []responses.LyricCue{
 									{
 										Start: &tokenStartA,
 										End:   &tokenEndA,
 										Value: "Hello",
 									},
+								},
+							},
+							{
+								Index: 0,
+								Start: &lineStart,
+								End:   &lineEnd,
+								Value: "Hello echo",
+								Role:  "x-bg",
+								Cue: []responses.LyricCue{
 									{
 										Start: &tokenStartB,
 										End:   &tokenEndB,
 										Value: "echo",
-										Role:  "x-bg",
 									},
 								},
 							},
diff --git a/server/subsonic/responses/responses.go b/server/subsonic/responses/responses.go
index ff5ae0d3b..d19f99ca6 100644
--- a/server/subsonic/responses/responses.go
+++ b/server/subsonic/responses/responses.go
@@ -537,30 +537,30 @@ type Line struct {
 	Value string `xml:",chardata"            json:"value"`
 }
 
-type LyricToken struct {
+type LyricCue struct {
 	Start *int64 `xml:"start,attr,omitempty" json:"start,omitempty"`
 	End   *int64 `xml:"end,attr,omitempty"   json:"end,omitempty"`
 	Value string `xml:"value,attr"           json:"value"`
-	Role  string `xml:"role,attr,omitempty"  json:"role,omitempty"`
 }
 
-type TokenLine struct {
-	Index int32        `xml:"index,attr"                    json:"index"`
-	Start *int64       `xml:"start,attr,omitempty"         json:"start,omitempty"`
-	End   *int64       `xml:"end,attr,omitempty"           json:"end,omitempty"`
-	Value string       `xml:"value,attr,omitempty"         json:"value,omitempty"`
-	Token []LyricToken `xml:"token,omitempty"        json:"token,omitempty"`
+type CueLine struct {
+	Index int32      `xml:"index,attr"                    json:"index"`
+	Start *int64     `xml:"start,attr,omitempty"         json:"start,omitempty"`
+	End   *int64     `xml:"end,attr,omitempty"           json:"end,omitempty"`
+	Value string     `xml:"value,attr,omitempty"         json:"value,omitempty"`
+	Role  string     `xml:"role,attr,omitempty"          json:"role,omitempty"`
+	Cue   []LyricCue `xml:"cue,omitempty"        json:"cue,omitempty"`
 }
 
 type StructuredLyric struct {
-	DisplayArtist string      `xml:"displayArtist,attr,omitempty" json:"displayArtist,omitempty"`
-	DisplayTitle  string      `xml:"displayTitle,attr,omitempty"  json:"displayTitle,omitempty"`
-	Kind          string      `xml:"kind,attr,omitempty"          json:"kind,omitempty"`
-	Lang          string      `xml:"lang,attr"                    json:"lang"`
-	Line          []Line      `xml:"line"                         json:"line"`
-	TokenLine     []TokenLine `xml:"tokenLine,omitempty"     json:"tokenLine,omitempty"`
-	Offset        *int64      `xml:"offset,attr,omitempty"        json:"offset,omitempty"`
-	Synced        bool        `xml:"synced,attr"                  json:"synced"`
+	DisplayArtist string    `xml:"displayArtist,attr,omitempty" json:"displayArtist,omitempty"`
+	DisplayTitle  string    `xml:"displayTitle,attr,omitempty"  json:"displayTitle,omitempty"`
+	Kind          string    `xml:"kind,attr,omitempty"          json:"kind,omitempty"`
+	Lang          string    `xml:"lang,attr"                    json:"lang"`
+	Line          []Line    `xml:"line"                         json:"line"`
+	CueLine       []CueLine `xml:"cueLine,omitempty"     json:"cueLine,omitempty"`
+	Offset        *int64    `xml:"offset,attr,omitempty"        json:"offset,omitempty"`
+	Synced        bool      `xml:"synced,attr"                  json:"synced"`
 }
 
 type StructuredLyrics []StructuredLyric
diff --git a/ui/src/audioplayer/lyrics.js b/ui/src/audioplayer/lyrics.js
index 3dcf9b0f9..111ded02e 100644
--- a/ui/src/audioplayer/lyrics.js
+++ b/ui/src/audioplayer/lyrics.js
@@ -74,25 +74,25 @@ const normalizeToken = (token) => {
     start: toTime(token.start),
     end: toTime(token.end),
     value,
-    role: typeof token.role === 'string' ? token.role : '',
   }
 }
 
-const normalizeTokenLine = (tokenLine, fallbackIndex) => {
-  const index = Number.isFinite(Number(tokenLine?.index))
-    ? Number(tokenLine.index)
+const normalizeCueLine = (cueLine, fallbackIndex) => {
+  const index = Number.isFinite(Number(cueLine?.index))
+    ? Number(cueLine.index)
     : fallbackIndex
   const tokens = sortTokensByStart(
-    Array.isArray(tokenLine?.token)
-      ? tokenLine.token.map(normalizeToken).filter(Boolean)
+    Array.isArray(cueLine?.cue)
+      ? cueLine.cue.map(normalizeToken).filter(Boolean)
       : [],
   )
 
   return {
     index,
-    start: toTime(tokenLine?.start),
-    end: toTime(tokenLine?.end),
-    value: typeof tokenLine?.value === 'string' ? tokenLine.value : '',
+    start: toTime(cueLine?.start),
+    end: toTime(cueLine?.end),
+    value: typeof cueLine?.value === 'string' ? cueLine.value : '',
+    role: typeof cueLine?.role === 'string' ? cueLine.role : '',
     tokens,
   }
 }
@@ -197,14 +197,14 @@ const buildSyntheticWordTokens = (line, token) => {
   }))
 }
 
-export const hasTokenTiming = (structuredLyric) =>
+export const hasCueTiming = (structuredLyric) =>
   Boolean(
     structuredLyric &&
-    Array.isArray(structuredLyric.tokenLine) &&
-    structuredLyric.tokenLine.some(
-      (tokenLine) =>
-        Array.isArray(tokenLine?.token) &&
-        tokenLine.token.some((token) => Number.isFinite(Number(token?.start))),
+    Array.isArray(structuredLyric.cueLine) &&
+    structuredLyric.cueLine.some(
+      (cueLine) =>
+        Array.isArray(cueLine?.cue) &&
+        cueLine.cue.some((cue) => Number.isFinite(Number(cue?.start))),
     ),
   )
 
@@ -215,7 +215,7 @@ export const hasStructuredLyricContent = (structuredLyric) =>
       structuredLyric.line.some(
         (line) => typeof line?.value === 'string' && line.value.trim() !== '',
       )) ||
-      hasTokenTiming(structuredLyric)),
+      hasCueTiming(structuredLyric)),
   )
 
 export const getPreferredLyricLanguage = () => {
@@ -319,34 +319,57 @@ export const buildKaraokeLines = (structuredLyric) => {
   const baseLines = Array.isArray(structuredLyric.line)
     ? structuredLyric.line
     : []
-  const rawTokenLines = Array.isArray(structuredLyric.tokenLine)
-    ? structuredLyric.tokenLine
+  const rawCueLines = Array.isArray(structuredLyric.cueLine)
+    ? structuredLyric.cueLine
     : []
 
   const lines =
-    rawTokenLines.length > 0
-      ? rawTokenLines.map((tokenLine, fallbackIndex) => {
-          const normalized = normalizeTokenLine(tokenLine, fallbackIndex)
-          const baseLine = baseLines[normalized.index] || {}
-          const tokens = normalized.tokens
-          const fallbackStart =
-            tokens.find((token) => token.start != null)?.start ?? null
-          const fallbackEnd =
-            [...tokens].reverse().find((token) => token.end != null)?.end ??
-            null
-          const value =
-            normalized.value ||
-            (typeof baseLine.value === 'string' ? baseLine.value : '') ||
-            tokens.map((token) => token.value).join('')
+    rawCueLines.length > 0
+      ? (() => {
+          const normalizedCueLines = rawCueLines.map(
+            (cueLine, fallbackIndex) => {
+              const normalized = normalizeCueLine(cueLine, fallbackIndex)
+              return {
+                ...normalized,
+                tokens: normalized.tokens.map((token) => ({
+                  ...token,
+                  role: normalized.role,
+                })),
+              }
+            },
+          )
 
-          return {
-            index: normalized.index,
-            start: normalized.start ?? toTime(baseLine.start) ?? fallbackStart,
-            end: normalized.end ?? toTime(baseLine.end) ?? fallbackEnd,
-            value,
-            tokens,
+          const byIndex = new Map()
+          for (const cl of normalizedCueLines) {
+            if (!byIndex.has(cl.index)) {
+              byIndex.set(cl.index, [])
+            }
+            byIndex.get(cl.index).push(cl)
           }
-        })
+
+          return Array.from(byIndex.entries()).map(([index, group]) => {
+            const first = group[0]
+            const baseLine = baseLines[index] || {}
+            const tokens = sortTokensByStart(group.flatMap((cl) => cl.tokens))
+            const fallbackStart =
+              tokens.find((token) => token.start != null)?.start ?? null
+            const fallbackEnd =
+              [...tokens].reverse().find((token) => token.end != null)?.end ??
+              null
+            const value =
+              first.value ||
+              (typeof baseLine.value === 'string' ? baseLine.value : '') ||
+              tokens.map((token) => token.value).join('')
+
+            return {
+              index,
+              start: first.start ?? toTime(baseLine.start) ?? fallbackStart,
+              end: first.end ?? toTime(baseLine.end) ?? fallbackEnd,
+              value,
+              tokens,
+            }
+          })
+        })()
       : baseLines.map((line, index) => ({
           index,
           start: toTime(line.start),
diff --git a/ui/src/audioplayer/lyrics.test.js b/ui/src/audioplayer/lyrics.test.js
index c60605a6f..7e0b0d105 100644
--- a/ui/src/audioplayer/lyrics.test.js
+++ b/ui/src/audioplayer/lyrics.test.js
@@ -1,15 +1,15 @@
 import {
   buildKaraokeLines,
   findLayerLineIndexForMain,
-  getPreferredLyricLanguage,
   getActiveKaraokeState,
+  getPreferredLyricLanguage,
   hasStructuredLyricContent,
   pickStructuredLyric,
   resolveKaraokeTokenWindow,
   resolveLayerLineForMain,
   selectLyricLayers,
-  structuredLyricToLrc,
   structuredLyricsToLrc,
+  structuredLyricToLrc,
 } from './lyrics'
 
 describe('lyrics helpers', () => {
@@ -200,21 +200,27 @@ describe('lyrics helpers', () => {
     expect(getPreferredLyricLanguage()).toBe('pt-BR')
   })
 
-  it('builds karaoke lines from tokenLine payload', () => {
+  it('builds karaoke lines from cueLine payload', () => {
     const lines = buildKaraokeLines({
       lang: 'eng',
       synced: true,
       line: [{ start: 1000, end: 3000, value: 'Hello world' }],
-      tokenLine: [
+      cueLine: [
         {
           index: 0,
           start: 1000,
           end: 3000,
           value: 'Hello world',
-          token: [
-            { start: 1000, end: 1500, value: 'Hello' },
-            { start: 2000, end: 2500, value: 'world', role: 'x-bg' },
-          ],
+          role: '',
+          cue: [{ start: 1000, end: 1500, value: 'Hello' }],
+        },
+        {
+          index: 0,
+          start: 1000,
+          end: 3000,
+          value: 'Hello world',
+          role: 'x-bg',
+          cue: [{ start: 2000, end: 2500, value: 'world' }],
         },
       ],
     })
@@ -238,15 +244,16 @@ describe('lyrics helpers', () => {
       lang: 'eng',
       synced: true,
       line: [{ start: 1000, end: 3000, value: 'Hello world' }],
-      tokenLine: [
+      cueLine: [
         {
           index: 0,
           start: 1000,
           end: 3000,
           value: 'Hello world',
-          token: [
-            { start: 2000, end: 2500, value: 'world', role: '' },
-            { start: 1000, end: 1500, value: 'Hello', role: '' },
+          role: '',
+          cue: [
+            { start: 2000, end: 2500, value: 'world' },
+            { start: 1000, end: 1500, value: 'Hello' },
           ],
         },
       ],
@@ -263,13 +270,14 @@ describe('lyrics helpers', () => {
       lang: 'ko-Latn',
       synced: true,
       line: [{ start: 1000, end: 2000, value: 'Da-la-lun, dun' }],
-      tokenLine: [
+      cueLine: [
         {
           index: 0,
           start: 1000,
           end: 2000,
           value: 'Da-la-lun, dun',
-          token: [{ start: 1000, end: 2000, value: 'Da-la-lun, dun' }],
+          role: '',
+          cue: [{ start: 1000, end: 2000, value: 'Da-la-lun, dun' }],
         },
       ],
     })
@@ -409,7 +417,7 @@ describe('lyrics helpers', () => {
   it('reports structured lyric content when token timing exists', () => {
     expect(
       hasStructuredLyricContent({
-        tokenLine: [{ token: [{ start: 100, value: 'a' }] }],
+        cueLine: [{ cue: [{ start: 100, value: 'a' }] }],
       }),
     ).toBe(true)
   })
diff --git a/ui/src/subsonic/index.js b/ui/src/subsonic/index.js
index b311d5e14..47ebabe99 100644
--- a/ui/src/subsonic/index.js
+++ b/ui/src/subsonic/index.js
@@ -1,5 +1,5 @@
-import { baseUrl } from '../utils'
 import { httpClient } from '../dataProvider'
+import { baseUrl } from '../utils'
 
 const url = (command, id, options) => {
   const username = localStorage.getItem('username')
@@ -121,7 +121,7 @@ const getTopSongs = (artist, count = 50) => {
 }
 
 const getLyricsBySongId = (id) => {
-  return httpClient(url('getLyricsBySongId', id))
+  return httpClient(url('getLyricsBySongId', id, { enhanced: true }))
 }
 
 const streamUrl = (id, options) => {

From 944401cae3f21c982ecf5b69b6004db68a147764 Mon Sep 17 00:00:00 2001
From: ranokay <github@ranokay.com>
Date: Sun, 22 Feb 2026 22:13:22 +0200
Subject: [PATCH 04/14] refactor: address Tolriq feedback on roles and cue
 timing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Drop x- prefix from role values (x-bg → bg, x-voice1 → voice1, x-group → group)
- Clarify voiceN has no upper bound (voice1, voice100, voice1000 all valid)
- Make cue.start required (non-pointer int64) in API response
- Keep cue.end optional with defined fallback semantics
- Strip x- prefix from TTML role values when mapping to API output
---
 server/subsonic/helpers.go              | 13 +++++++++++--
 server/subsonic/media_retrieval_test.go | 16 ++++++----------
 server/subsonic/responses/responses.go  |  2 +-
 ui/src/audioplayer/lyrics.test.js       |  4 ++--
 4 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/server/subsonic/helpers.go b/server/subsonic/helpers.go
index 6922f0683..d8ec8451b 100644
--- a/server/subsonic/helpers.go
+++ b/server/subsonic/helpers.go
@@ -493,6 +493,11 @@ func mapExplicitStatus(explicitStatus string) string {
 	return ""
 }
 
+// sanitizeRole strips the TTML x- prefix from role values for the API.
+func sanitizeRole(role string) string {
+	return strings.TrimPrefix(role, "x-")
+}
+
 func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced bool) responses.StructuredLyric {
 	lines := make([]responses.Line, len(lyrics.Line))
 	var cueLines []responses.CueLine
@@ -510,12 +515,16 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced boo
 		roleOrder := make([]string, 0, 2)
 		cuesByRole := make(map[string][]responses.LyricCue)
 		for _, cue := range line.Cue {
-			role := cue.Role
+			role := sanitizeRole(cue.Role)
 			if _, exists := cuesByRole[role]; !exists {
 				roleOrder = append(roleOrder, role)
 			}
+			var start int64
+			if cue.Start != nil {
+				start = *cue.Start
+			}
 			cuesByRole[role] = append(cuesByRole[role], responses.LyricCue{
-				Start: cue.Start,
+				Start: start,
 				End:   cue.End,
 				Value: cue.Value,
 			})
diff --git a/server/subsonic/media_retrieval_test.go b/server/subsonic/media_retrieval_test.go
index 7cf96fee5..fa3f20e2d 100644
--- a/server/subsonic/media_retrieval_test.go
+++ b/server/subsonic/media_retrieval_test.go
@@ -279,11 +279,7 @@ var _ = Describe("MediaRetrievalController", func() {
 					for k, realCue := range realCueLine.Cue {
 						expectedCue := expectedCueLine.Cue[k]
 						Expect(realCue.Value).To(Equal(expectedCue.Value))
-						if expectedCue.Start == nil {
-							Expect(realCue.Start).To(BeNil())
-						} else {
-							Expect(*realCue.Start).To(Equal(*expectedCue.Start))
-						}
+						Expect(realCue.Start).To(Equal(expectedCue.Start))
 						if expectedCue.End == nil {
 							Expect(realCue.End).To(BeNil())
 						} else {
@@ -521,12 +517,12 @@ var _ = Describe("MediaRetrievalController", func() {
 								Value: "konni",
 								Cue: []responses.LyricCue{
 									{
-										Start: &tokenStartA,
+										Start: tokenStartA,
 										End:   &tokenEndA,
 										Value: "ko",
 									},
 									{
-										Start: &tokenStartB,
+										Start: tokenStartB,
 										End:   &tokenEndB,
 										Value: "nni",
 									},
@@ -607,7 +603,7 @@ var _ = Describe("MediaRetrievalController", func() {
 								Value: "Hello echo",
 								Cue: []responses.LyricCue{
 									{
-										Start: &tokenStartA,
+										Start: tokenStartA,
 										End:   &tokenEndA,
 										Value: "Hello",
 									},
@@ -618,10 +614,10 @@ var _ = Describe("MediaRetrievalController", func() {
 								Start: &lineStart,
 								End:   &lineEnd,
 								Value: "Hello echo",
-								Role:  "x-bg",
+								Role:  "bg",
 								Cue: []responses.LyricCue{
 									{
-										Start: &tokenStartB,
+										Start: tokenStartB,
 										End:   &tokenEndB,
 										Value: "echo",
 									},
diff --git a/server/subsonic/responses/responses.go b/server/subsonic/responses/responses.go
index d19f99ca6..d74c118b3 100644
--- a/server/subsonic/responses/responses.go
+++ b/server/subsonic/responses/responses.go
@@ -538,7 +538,7 @@ type Line struct {
 }
 
 type LyricCue struct {
-	Start *int64 `xml:"start,attr,omitempty" json:"start,omitempty"`
+	Start int64  `xml:"start,attr"           json:"start"`
 	End   *int64 `xml:"end,attr,omitempty"   json:"end,omitempty"`
 	Value string `xml:"value,attr"           json:"value"`
 }
diff --git a/ui/src/audioplayer/lyrics.test.js b/ui/src/audioplayer/lyrics.test.js
index 7e0b0d105..6cb3a1b87 100644
--- a/ui/src/audioplayer/lyrics.test.js
+++ b/ui/src/audioplayer/lyrics.test.js
@@ -219,7 +219,7 @@ describe('lyrics helpers', () => {
           start: 1000,
           end: 3000,
           value: 'Hello world',
-          role: 'x-bg',
+          role: 'bg',
           cue: [{ start: 2000, end: 2500, value: 'world' }],
         },
       ],
@@ -233,7 +233,7 @@ describe('lyrics helpers', () => {
         value: 'Hello world',
         tokens: [
           { start: 1000, end: 1500, value: 'Hello', role: '' },
-          { start: 2000, end: 2500, value: 'world', role: 'x-bg' },
+          { start: 2000, end: 2500, value: 'world', role: 'bg' },
         ],
       },
     ])

From 92793386648e1c4205d3aad27056006c25268c36 Mon Sep 17 00:00:00 2001
From: ranokay <github@ranokay.com>
Date: Mon, 23 Feb 2026 11:22:00 +0200
Subject: [PATCH 05/14] fix: guarantee main-first cueLine ordering for same
 index

Add stable sort to ensure the main vocals cueLine (empty role)
always appears before other roles when multiple cueLines share
the same line index. Previously this relied on source document order,
which is not guaranteed across all TTML files.
---
 server/subsonic/helpers.go | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/server/subsonic/helpers.go b/server/subsonic/helpers.go
index d8ec8451b..056ca89f1 100644
--- a/server/subsonic/helpers.go
+++ b/server/subsonic/helpers.go
@@ -530,6 +530,11 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced boo
 			})
 		}
 
+		// Ensure main vocals (empty role) always comes first
+		sort.SliceStable(roleOrder, func(i, j int) bool {
+			return roleOrder[i] == "" && roleOrder[j] != ""
+		})
+
 		// Create a separate CueLine for each role group
 		for _, role := range roleOrder {
 			cues := cuesByRole[role]

From 4e8f363e818db764360939a14d2c7326a1ec0028 Mon Sep 17 00:00:00 2001
From: ranokay <github@ranokay.com>
Date: Thu, 5 Mar 2026 22:20:32 +0200
Subject: [PATCH 06/14] fix: align songLyrics v2 with spec, add Enhanced LRC
 parser and bg role UI styling

- Fix LyricCue.Value XML tag: chardata instead of attribute
- Fix Kind field leaking to non-enhanced (v1) responses
- Guard against nil cue.Start values
- Add Enhanced LRC parser for word-level inline timing markers
- Add role-based UI styling: bg tokens render italic at 72% opacity
- Add integration test for Enhanced LRC file reading
- Add unit tests for Enhanced LRC parser
---
 core/lyrics/sources_test.go                 |  37 +++++++
 model/lyrics.go                             | 105 +++++++++++++++++++-
 model/lyrics_test.go                        |  59 +++++++++++
 server/subsonic/helpers.go                  |  23 +++--
 server/subsonic/media_retrieval_test.go     |   7 +-
 server/subsonic/responses/responses.go      |   2 +-
 tests/fixtures/test-enhanced.lrc            |   6 ++
 ui/src/audioplayer/KaraokeLyricsOverlay.jsx |  44 ++++----
 8 files changed, 244 insertions(+), 39 deletions(-)
 create mode 100644 tests/fixtures/test-enhanced.lrc

diff --git a/core/lyrics/sources_test.go b/core/lyrics/sources_test.go
index 8823a3175..3dd2825e6 100644
--- a/core/lyrics/sources_test.go
+++ b/core/lyrics/sources_test.go
@@ -88,6 +88,43 @@ var _ = Describe("sources", func() {
 			}))
 		})
 
+		It("should return Enhanced LRC lyrics with word-level cues from a file", func() {
+			mf := model.MediaFile{Path: "tests/fixtures/test-enhanced.mp3"}
+			lyrics, err := fromExternalFile(ctx, &mf, ".lrc")
+
+			Expect(err).To(BeNil())
+			Expect(lyrics).To(HaveLen(1))
+			Expect(lyrics[0].DisplayArtist).To(Equal("Test Artist"))
+			Expect(lyrics[0].DisplayTitle).To(Equal("Enhanced Test"))
+			Expect(lyrics[0].Lang).To(Equal("eng"))
+			Expect(lyrics[0].Synced).To(BeTrue())
+			Expect(lyrics[0].Line).To(HaveLen(3))
+
+			// Line 1: has inline markers → Cue array populated
+			Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(1000))))
+			Expect(lyrics[0].Line[0].Value).To(Equal("Some lyrics here"))
+			Expect(lyrics[0].Line[0].Cue).To(HaveLen(3))
+			Expect(*lyrics[0].Line[0].Cue[0].Start).To(Equal(int64(1000)))
+			Expect(lyrics[0].Line[0].Cue[0].Value).To(Equal("Some "))
+			Expect(*lyrics[0].Line[0].Cue[0].End).To(Equal(int64(1500)))
+			Expect(*lyrics[0].Line[0].Cue[1].Start).To(Equal(int64(1500)))
+			Expect(lyrics[0].Line[0].Cue[1].Value).To(Equal("lyrics "))
+			Expect(*lyrics[0].Line[0].Cue[1].End).To(Equal(int64(2000)))
+			Expect(*lyrics[0].Line[0].Cue[2].Start).To(Equal(int64(2000)))
+			Expect(lyrics[0].Line[0].Cue[2].Value).To(Equal("here"))
+			Expect(lyrics[0].Line[0].Cue[2].End).To(BeNil())
+
+			// Line 2: has inline markers
+			Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(3000))))
+			Expect(lyrics[0].Line[1].Value).To(Equal("More words"))
+			Expect(lyrics[0].Line[1].Cue).To(HaveLen(2))
+
+			// Line 3: plain line, no cues
+			Expect(lyrics[0].Line[2].Start).To(Equal(gg.P(int64(5000))))
+			Expect(lyrics[0].Line[2].Value).To(Equal("Plain line without inline markers"))
+			Expect(lyrics[0].Line[2].Cue).To(BeNil())
+		})
+
 		It("should return unsynchronized lyrics from a file", func() {
 			mf := model.MediaFile{Path: "tests/fixtures/test.mp3"}
 			lyrics, err := fromExternalFile(ctx, &mf, ".txt")
diff --git a/model/lyrics.go b/model/lyrics.go
index 3cb1cb715..9fcd4992e 100644
--- a/model/lyrics.go
+++ b/model/lyrics.go
@@ -43,6 +43,10 @@ var (
 	syncRegex  = regexp.MustCompile(`(^|\n)\s*` + timeRegexString)
 	timeRegex  = regexp.MustCompile(timeRegexString)
 	lrcIdRegex = regexp.MustCompile(`\[(ar|ti|offset|lang):([^]]+)]`)
+
+	// Enhanced LRC: inline word-level timing markers like <00:12.34>
+	enhancedLRCTimeString = `<([0-9]{1,2}:)?([0-9]{1,2}):([0-9]{1,2})(.[0-9]{1,3})?>`
+	enhancedLRCRegex      = regexp.MustCompile(enhancedLRCTimeString)
 )
 
 func (l Lyrics) IsEmpty() bool {
@@ -116,9 +120,15 @@ func ToLyrics(language, text string) (*Lyrics, error) {
 
 			if validLine {
 				for idx := range timestamps {
+					cues := parseEnhancedCues(priorLine)
+					value := priorLine
+					if cues != nil {
+						value = stripEnhancedMarkers(value)
+					}
 					structuredLines = append(structuredLines, Line{
 						Start: &timestamps[idx],
-						Value: strings.TrimSpace(priorLine),
+						Value: strings.TrimSpace(value),
+						Cue:   cues,
 					})
 				}
 				timestamps = nil
@@ -164,9 +174,15 @@ func ToLyrics(language, text string) (*Lyrics, error) {
 
 	if validLine {
 		for idx := range timestamps {
+			cues := parseEnhancedCues(priorLine)
+			value := priorLine
+			if cues != nil {
+				value = stripEnhancedMarkers(value)
+			}
 			structuredLines = append(structuredLines, Line{
 				Start: &timestamps[idx],
-				Value: strings.TrimSpace(priorLine),
+				Value: strings.TrimSpace(value),
+				Cue:   cues,
 			})
 		}
 	}
@@ -190,6 +206,91 @@ func ToLyrics(language, text string) (*Lyrics, error) {
 	return &lyrics, nil
 }
 
+// parseEnhancedCues extracts word-level timing cues from Enhanced LRC inline markers
+// (<mm:ss.xx>word <mm:ss.xx>word ...); text before the first marker belongs to no cue.
+// Returns nil when no marker is found or when no marker yields a cue.
+func parseEnhancedCues(text string) []Cue {
+	matches := enhancedLRCRegex.FindAllStringSubmatchIndex(text, -1)
+	if len(matches) == 0 {
+		return nil
+	}
+
+	type segment struct {
+		start int64
+		text  string
+	}
+
+	segments := make([]segment, 0, len(matches))
+	for i, match := range matches {
+		timeMs, err := parseTime(
+			// Rewrite <...> as [...] so parseTime can handle it with the same logic.
+			"["+text[match[0]+1:match[1]-1]+"]",
+			// Re-base the full match and the four capture groups (start/end pairs) onto the rewritten string.
+			[]int{
+				0, match[1] - match[0],
+				adjustGroup(match, 2), adjustGroup(match, 3),
+				adjustGroup(match, 4), adjustGroup(match, 5),
+				adjustGroup(match, 6), adjustGroup(match, 7),
+				adjustGroup(match, 8), adjustGroup(match, 9),
+			},
+		)
+		if err != nil {
+			continue // timestamp did not parse: this marker yields no cue
+		}
+
+		// Text runs from after this marker to the start of the next marker (or end of string).
+		textStart := match[1]
+		var textEnd int
+		if i+1 < len(matches) {
+			textEnd = matches[i+1][0]
+		} else {
+			textEnd = len(text)
+		}
+
+		word := text[textStart:textEnd]
+		if word == "" {
+			continue // a marker directly followed by another marker (or the end of text) carries no text
+		}
+		segments = append(segments, segment{start: timeMs, text: word})
+	}
+
+	if len(segments) == 0 {
+		return nil
+	}
+
+	cues := make([]Cue, len(segments))
+	for i, seg := range segments {
+		start := seg.start
+		cues[i] = Cue{
+			Start: &start,
+			Value: seg.text,
+		}
+		// Derive End from the next cue's Start; the last cue keeps a nil End.
+		if i+1 < len(segments) {
+			end := segments[i+1].start
+			cues[i].End = &end
+		}
+	}
+	return cues
+}
+
+// adjustGroup remaps a capture-group index from the original text to the rewritten "[...]" string.
+// That string starts where the marker's '<' was (now '['), so an index becomes its distance from match[0].
+func adjustGroup(match []int, groupIdx int) int {
+	orig := match[groupIdx]
+	if orig == -1 {
+		return -1
+	}
+	// Groups that did not participate stay -1 (handled above); the rest are made relative to the marker start.
+	return orig - match[0]
+}
+
+// stripEnhancedMarkers deletes every inline timing marker matched by enhancedLRCRegex
+// and returns the remaining text; whitespace around the markers is left as it was.
+func stripEnhancedMarkers(text string) string {
+	return enhancedLRCRegex.ReplaceAllString(text, "")
+}
+
 func parseTime(line string, match []int) (int64, error) {
 	var hours, millis int64
 	var err error
diff --git a/model/lyrics_test.go b/model/lyrics_test.go
index 382976872..2228306d0 100644
--- a/model/lyrics_test.go
+++ b/model/lyrics_test.go
@@ -116,4 +116,63 @@ var _ = Describe("ToLyrics", func() {
 			{Start: &e, Value: "Test"},
 		}))
 	})
+
+	It("should parse Enhanced LRC with word-level timing", func() {
+		lyrics, err := ToLyrics("xxx", "[00:01.00]<00:01.00>Some <00:01.50>lyrics <00:02.00>here\n[00:03.00]<00:03.00>More <00:03.50>words")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(lyrics.Synced).To(BeTrue())
+		Expect(lyrics.Line).To(HaveLen(2))
+
+		t1000, t1500, t2000, t3000, t3500 := int64(1000), int64(1500), int64(2000), int64(3000), int64(3500)
+
+		line0 := lyrics.Line[0]
+		Expect(line0.Start).To(Equal(&t1000))
+		Expect(line0.Value).To(Equal("Some lyrics here"))
+		Expect(line0.Cue).To(Equal([]Cue{
+			{Start: &t1000, End: &t1500, Value: "Some "},
+			{Start: &t1500, End: &t2000, Value: "lyrics "},
+			{Start: &t2000, Value: "here"},
+		}))
+
+		line1 := lyrics.Line[1]
+		Expect(line1.Start).To(Equal(&t3000))
+		Expect(line1.Value).To(Equal("More words"))
+		Expect(line1.Cue).To(Equal([]Cue{
+			{Start: &t3000, End: &t3500, Value: "More "},
+			{Start: &t3500, Value: "words"},
+		}))
+	})
+
+	It("should ignore Enhanced LRC markers and return plain lines when no markers present", func() {
+		a, b := int64(1000), int64(3000)
+		lyrics, err := ToLyrics("xxx", "[00:01.00]Plain line\n[00:03.00]Another plain line")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(lyrics.Line).To(Equal([]Line{
+			{Start: &a, Value: "Plain line"},
+			{Start: &b, Value: "Another plain line"},
+		}))
+	})
+
+	It("should handle mixed Enhanced and plain LRC lines", func() {
+		lyrics, err := ToLyrics("xxx", "[00:01.00]<00:01.00>Some <00:01.50>lyrics\n[00:03.00]Plain line\n[00:05.00]<00:05.00>More <00:05.50>words")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(lyrics.Line).To(HaveLen(3))
+
+		t1000, t1500, t5000, t5500 := int64(1000), int64(1500), int64(5000), int64(5500)
+
+		Expect(lyrics.Line[0].Cue).To(Equal([]Cue{
+			{Start: &t1000, End: &t1500, Value: "Some "},
+			{Start: &t1500, Value: "lyrics"},
+		}))
+		Expect(lyrics.Line[0].Value).To(Equal("Some lyrics"))
+
+		Expect(lyrics.Line[1].Cue).To(BeNil())
+		Expect(lyrics.Line[1].Value).To(Equal("Plain line"))
+
+		Expect(lyrics.Line[2].Cue).To(Equal([]Cue{
+			{Start: &t5000, End: &t5500, Value: "More "},
+			{Start: &t5500, Value: "words"},
+		}))
+		Expect(lyrics.Line[2].Value).To(Equal("More words"))
+	})
 })
diff --git a/server/subsonic/helpers.go b/server/subsonic/helpers.go
index 056ca89f1..b881fa169 100644
--- a/server/subsonic/helpers.go
+++ b/server/subsonic/helpers.go
@@ -515,16 +515,15 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced boo
 		roleOrder := make([]string, 0, 2)
 		cuesByRole := make(map[string][]responses.LyricCue)
 		for _, cue := range line.Cue {
+			if cue.Start == nil {
+				continue
+			}
 			role := sanitizeRole(cue.Role)
 			if _, exists := cuesByRole[role]; !exists {
 				roleOrder = append(roleOrder, role)
 			}
-			var start int64
-			if cue.Start != nil {
-				start = *cue.Start
-			}
 			cuesByRole[role] = append(cuesByRole[role], responses.LyricCue{
-				Start: start,
+				Start: *cue.Start,
 				End:   cue.End,
 				Value: cue.Value,
 			})
@@ -552,15 +551,9 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced boo
 		}
 	}
 
-	kind := strings.TrimSpace(lyrics.Kind)
-	if kind == "" {
-		kind = "main"
-	}
-
 	structured := responses.StructuredLyric{
 		DisplayArtist: lyrics.DisplayArtist,
 		DisplayTitle:  lyrics.DisplayTitle,
-		Kind:          kind,
 		Lang:          lyrics.Lang,
 		Line:          lines,
 		CueLine:       cueLines,
@@ -568,6 +561,14 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced boo
 		Synced:        lyrics.Synced,
 	}
 
+	if enhanced {
+		kind := strings.TrimSpace(lyrics.Kind)
+		if kind == "" {
+			kind = "main"
+		}
+		structured.Kind = kind
+	}
+
 	if structured.DisplayArtist == "" {
 		structured.DisplayArtist = mf.Artist
 	}
diff --git a/server/subsonic/media_retrieval_test.go b/server/subsonic/media_retrieval_test.go
index fa3f20e2d..0fdbb3854 100644
--- a/server/subsonic/media_retrieval_test.go
+++ b/server/subsonic/media_retrieval_test.go
@@ -232,11 +232,7 @@ var _ = Describe("MediaRetrievalController", func() {
 
 				Expect(realLyric.DisplayArtist).To(Equal(expectedLyric.DisplayArtist))
 				Expect(realLyric.DisplayTitle).To(Equal(expectedLyric.DisplayTitle))
-				expectedKind := expectedLyric.Kind
-				if expectedKind == "" {
-					expectedKind = "main"
-				}
-				Expect(realLyric.Kind).To(Equal(expectedKind))
+				Expect(realLyric.Kind).To(Equal(expectedLyric.Kind))
 				Expect(realLyric.Lang).To(Equal(expectedLyric.Lang))
 				Expect(realLyric.Synced).To(Equal(expectedLyric.Synced))
 
@@ -587,6 +583,7 @@ var _ = Describe("MediaRetrievalController", func() {
 					{
 						DisplayArtist: "Rick Astley",
 						DisplayTitle:  "Never Gonna Give You Up",
+						Kind:          "main",
 						Lang:          "eng",
 						Synced:        true,
 						Line: []responses.Line{
diff --git a/server/subsonic/responses/responses.go b/server/subsonic/responses/responses.go
index d74c118b3..f5446a961 100644
--- a/server/subsonic/responses/responses.go
+++ b/server/subsonic/responses/responses.go
@@ -540,7 +540,7 @@ type Line struct {
 type LyricCue struct {
 	Start int64  `xml:"start,attr"           json:"start"`
 	End   *int64 `xml:"end,attr,omitempty"   json:"end,omitempty"`
-	Value string `xml:"value,attr"           json:"value"`
+	Value string `xml:",chardata"            json:"value"`
 }
 
 type CueLine struct {
diff --git a/tests/fixtures/test-enhanced.lrc b/tests/fixtures/test-enhanced.lrc
new file mode 100644
index 000000000..8f7b60f8c
--- /dev/null
+++ b/tests/fixtures/test-enhanced.lrc
@@ -0,0 +1,6 @@
+[ar:Test Artist]
+[ti:Enhanced Test]
+[lang:eng]
+[00:01.00]<00:01.00>Some <00:01.50>lyrics <00:02.00>here
+[00:03.00]<00:03.00>More <00:03.50>words
+[00:05.00]Plain line without inline markers
diff --git a/ui/src/audioplayer/KaraokeLyricsOverlay.jsx b/ui/src/audioplayer/KaraokeLyricsOverlay.jsx
index 3814cbee6..a44e50bf6 100644
--- a/ui/src/audioplayer/KaraokeLyricsOverlay.jsx
+++ b/ui/src/audioplayer/KaraokeLyricsOverlay.jsx
@@ -1,3 +1,12 @@
+import Button from '@material-ui/core/Button'
+import IconButton from '@material-ui/core/IconButton'
+import Popover from '@material-ui/core/Popover'
+import Slider from '@material-ui/core/Slider'
+import { makeStyles } from '@material-ui/core/styles'
+import Typography from '@material-ui/core/Typography'
+import CloseIcon from '@material-ui/icons/Close'
+import TuneIcon from '@material-ui/icons/Tune'
+import clsx from 'clsx'
 import React, {
   memo,
   useCallback,
@@ -6,21 +15,12 @@ import React, {
   useRef,
   useState,
 } from 'react'
-import clsx from 'clsx'
-import Button from '@material-ui/core/Button'
-import IconButton from '@material-ui/core/IconButton'
-import Popover from '@material-ui/core/Popover'
-import Slider from '@material-ui/core/Slider'
-import Typography from '@material-ui/core/Typography'
-import CloseIcon from '@material-ui/icons/Close'
-import TuneIcon from '@material-ui/icons/Tune'
-import { makeStyles } from '@material-ui/core/styles'
 import {
   buildKaraokeLines,
   getActiveKaraokeState,
   hasStructuredLyricContent,
-  resolveLayerLineForMain,
   resolveKaraokeTokenWindow,
+  resolveLayerLineForMain,
 } from './lyrics'
 
 const KARAOKE_RENDER_LEAD_MS = 24
@@ -421,9 +421,7 @@ const LyricsSettingsPopover = ({ settings, onChange }) => {
 
 const easeInOut = (v) => {
   const clamped = clamp(v, 0, 1)
-  return clamped < 0.5
-    ? 2 * clamped * clamped
-    : 1 - Math.pow(-2 * clamped + 2, 2) / 2
+  return clamped < 0.5 ? 2 * clamped * clamped : 1 - (-2 * clamped + 2) ** 2 / 2
 }
 
 const getMaxHeightPx = () => {
@@ -716,17 +714,23 @@ const KaraokeLineRow = memo(
           }
           alpha = clamp(alpha, TOKEN_FUTURE_ALPHA, TOKEN_ACTIVE_ALPHA)
           const fillProgress = isDone ? 1 : isActive ? progress : 0
+          const isBgRole = segment.token?.role === 'bg'
 
           return (
             <span
               key={`token-${idx}-${tokenStart ?? 'na'}`}
               className={tokenClassName}
-              style={buildTokenWipeStyle({
-                fillProgress,
-                highlightAlpha: alpha,
-                futureAlpha: TOKEN_FUTURE_ALPHA,
-                rgb: tokenRGB,
-              })}
+              style={{
+                ...buildTokenWipeStyle({
+                  fillProgress,
+                  highlightAlpha: isBgRole ? alpha * 0.72 : alpha,
+                  futureAlpha: isBgRole
+                    ? TOKEN_FUTURE_ALPHA * 0.72
+                    : TOKEN_FUTURE_ALPHA,
+                  rgb: tokenRGB,
+                }),
+                ...(isBgRole ? { fontStyle: 'italic' } : undefined),
+              }}
             >
               {segment.text}
             </span>
@@ -1066,7 +1070,7 @@ const KaraokeLyricsOverlay = ({
     const isActive = delta === 0
     let opacity = isActive ? 1 : delta < 0 ? 0.6 : 0.72
     const [r, g, b] = parseColorRGB(getColorValue(lyricsSettings.main.colorKey))
-    let color = isActive
+    const color = isActive
       ? `rgba(${r}, ${g}, ${b}, 0.98)`
       : delta < 0
         ? `rgba(${r}, ${g}, ${b}, 0.4)`

From 1aac92bc1401529b4586972b4b80f2a627db63f9 Mon Sep 17 00:00:00 2001
From: ranokay <github@ranokay.com>
Date: Fri, 20 Mar 2026 23:41:29 +0200
Subject: [PATCH 07/14] feat(lyrics): support agent-based lyric layers

---
 conf/configuration.go                   |   2 +-
 core/lyrics/lyrics_test.go              |  53 +++-
 core/lyrics/sources.go                  |  11 +-
 core/lyrics/sources_test.go             |  56 ++++-
 core/lyrics/srt.go                      | 161 +++++++++++++
 core/lyrics/ttml.go                     | 307 ++++++++++++++++++++----
 core/lyrics/ttml_test.go                |  42 +++-
 model/lyrics.go                         | 160 ++++++++++--
 model/lyrics_test.go                    |  10 +-
 server/subsonic/helpers.go              | 131 ++++++++--
 server/subsonic/media_retrieval_test.go |  42 ++--
 server/subsonic/responses/responses.go  |  19 +-
 tests/fixtures/test.elrc                |   5 +
 tests/fixtures/test.srt                 |   7 +
 ui/src/audioplayer/PlayerToolbar.jsx    |   2 +-
 ui/src/audioplayer/lyrics.js            |  73 +++++-
 ui/src/audioplayer/lyrics.test.js       | 132 ++++++++--
 17 files changed, 1059 insertions(+), 154 deletions(-)
 create mode 100644 core/lyrics/srt.go
 create mode 100644 tests/fixtures/test.elrc
 create mode 100644 tests/fixtures/test.srt

diff --git a/conf/configuration.go b/conf/configuration.go
index af9f6c283..6370d5cb8 100644
--- a/conf/configuration.go
+++ b/conf/configuration.go
@@ -730,7 +730,7 @@ func setViperDefaults() {
 	viper.SetDefault("artistartpriority", "artist.*, album/artist.*, external")
 	viper.SetDefault("artistimagefolder", "")
 	viper.SetDefault("discartpriority", "disc*.*, cd*.*, cover.*, folder.*, front.*, discsubtitle, embedded")
-	viper.SetDefault("lyricspriority", ".lrc,.ttml,.txt,embedded")
+	viper.SetDefault("lyricspriority", ".ttml,.elrc,.lrc,.srt,.txt,embedded")
 	viper.SetDefault("enablegravatar", false)
 	viper.SetDefault("enablefavourites", true)
 	viper.SetDefault("enablestarrating", true)
diff --git a/core/lyrics/lyrics_test.go b/core/lyrics/lyrics_test.go
index d5f79a4d0..58e8ba82b 100644
--- a/core/lyrics/lyrics_test.go
+++ b/core/lyrics/lyrics_test.go
@@ -44,6 +44,36 @@ var _ = Describe("sources", func() {
 		},
 	}
 
+	elrcLyrics := model.LyricList{
+		model.Lyrics{
+			DisplayArtist: "ELRC Artist",
+			DisplayTitle:  "ELRC Song",
+			Lang:          "eng",
+			Line: []model.Line{
+				{
+					Start: gg.P(int64(1000)),
+					End:   gg.P(int64(1500)),
+					Value: "Lead words",
+					Cue: []model.Cue{
+						{
+							Start: gg.P(int64(1000)),
+							Value: "Lead ",
+						},
+						{
+							Start: gg.P(int64(1500)),
+							Value: "words",
+						},
+					},
+				},
+				{
+					Start: gg.P(int64(3000)),
+					Value: "Fallback line",
+				},
+			},
+			Synced: true,
+		},
+	}
+
 	ttmlLyrics := model.LyricList{
 		model.Lyrics{
 			Kind: "main",
@@ -88,6 +118,25 @@ var _ = Describe("sources", func() {
 		},
 	}
 
+	srtLyrics := model.LyricList{
+		model.Lyrics{
+			Lang: "xxx",
+			Line: []model.Line{
+				{
+					Start: gg.P(int64(18800)),
+					End:   gg.P(int64(22800)),
+					Value: "We're from subtitles",
+				},
+				{
+					Start: gg.P(int64(22801)),
+					End:   gg.P(int64(26000)),
+					Value: "Another subtitle line",
+				},
+			},
+			Synced: true,
+		},
+	}
+
 	BeforeEach(func() {
 		DeferCleanup(configtest.SetupConfig())
 
@@ -109,8 +158,10 @@ var _ = Describe("sources", func() {
 	},
 		Entry("embedded > lrc > txt", "embedded,.lrc,.txt", embeddedLyrics),
 		Entry("lrc > embedded > txt", ".lrc,embedded,.txt", syncedLyrics),
+		Entry("elrc > lrc > embedded", ".elrc,.lrc,embedded", elrcLyrics),
+		Entry("srt > txt > embedded", ".srt,.txt,embedded", srtLyrics),
 		Entry("txt > lrc > embedded", ".txt,.lrc,embedded", unsyncedLyrics),
-		Entry("ttml > lrc > embedded", ".ttml,.lrc,embedded", ttmlLyrics))
+		Entry("ttml > elrc > lrc > srt > embedded", ".ttml,.elrc,.lrc,.srt,embedded", ttmlLyrics))
 
 	Context("Errors", func() {
 		var RegularUserContext = XContext
diff --git a/core/lyrics/sources.go b/core/lyrics/sources.go
index 38a71cb8a..7586c944f 100644
--- a/core/lyrics/sources.go
+++ b/core/lyrics/sources.go
@@ -38,13 +38,20 @@ func fromExternalFile(ctx context.Context, mf *model.MediaFile, suffix string) (
 	}
 
 	var list model.LyricList
-	if strings.EqualFold(suffix, ".ttml") {
+	switch {
+	case strings.EqualFold(suffix, ".ttml"):
 		list, err = parseTTML(contents)
 		if err != nil {
 			log.Error(ctx, "error parsing ttml external file", "path", externalLyric, err)
 			return nil, err
 		}
-	} else {
+	case strings.EqualFold(suffix, ".srt"):
+		list, err = parseSRT(contents)
+		if err != nil {
+			log.Error(ctx, "error parsing srt external file", "path", externalLyric, err)
+			return nil, err
+		}
+	default:
 		lyrics, err := model.ToLyrics("xxx", string(contents))
 		if err != nil {
 			log.Error(ctx, "error parsing lyric external file", "path", externalLyric, err)
diff --git a/core/lyrics/sources_test.go b/core/lyrics/sources_test.go
index 3dd2825e6..a110390d8 100644
--- a/core/lyrics/sources_test.go
+++ b/core/lyrics/sources_test.go
@@ -106,10 +106,10 @@ var _ = Describe("sources", func() {
 			Expect(lyrics[0].Line[0].Cue).To(HaveLen(3))
 			Expect(*lyrics[0].Line[0].Cue[0].Start).To(Equal(int64(1000)))
 			Expect(lyrics[0].Line[0].Cue[0].Value).To(Equal("Some "))
-			Expect(*lyrics[0].Line[0].Cue[0].End).To(Equal(int64(1500)))
+			Expect(lyrics[0].Line[0].Cue[0].End).To(BeNil())
 			Expect(*lyrics[0].Line[0].Cue[1].Start).To(Equal(int64(1500)))
 			Expect(lyrics[0].Line[0].Cue[1].Value).To(Equal("lyrics "))
-			Expect(*lyrics[0].Line[0].Cue[1].End).To(Equal(int64(2000)))
+			Expect(lyrics[0].Line[0].Cue[1].End).To(BeNil())
 			Expect(*lyrics[0].Line[0].Cue[2].Start).To(Equal(int64(2000)))
 			Expect(lyrics[0].Line[0].Cue[2].Value).To(Equal("here"))
 			Expect(lyrics[0].Line[0].Cue[2].End).To(BeNil())
@@ -125,6 +125,33 @@ var _ = Describe("sources", func() {
 			Expect(lyrics[0].Line[2].Cue).To(BeNil())
 		})
 
+		It("should return Enhanced LRC lyrics from an ELRC file", func() {
+			mf := model.MediaFile{Path: "tests/fixtures/test.mp3"}
+			lyrics, err := fromExternalFile(ctx, &mf, ".elrc")
+
+			Expect(err).To(BeNil())
+			Expect(lyrics).To(HaveLen(1))
+			Expect(lyrics[0].DisplayArtist).To(Equal("ELRC Artist"))
+			Expect(lyrics[0].DisplayTitle).To(Equal("ELRC Song"))
+			Expect(lyrics[0].Lang).To(Equal("eng"))
+			Expect(lyrics[0].Synced).To(BeTrue())
+			Expect(lyrics[0].Line).To(HaveLen(2))
+
+			Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(1000))))
+			Expect(lyrics[0].Line[0].Value).To(Equal("Lead words"))
+			Expect(lyrics[0].Line[0].Cue).To(HaveLen(2))
+			Expect(*lyrics[0].Line[0].Cue[0].Start).To(Equal(int64(1000)))
+			Expect(lyrics[0].Line[0].Cue[0].Value).To(Equal("Lead "))
+			Expect(lyrics[0].Line[0].Cue[0].End).To(BeNil())
+			Expect(*lyrics[0].Line[0].Cue[1].Start).To(Equal(int64(1500)))
+			Expect(lyrics[0].Line[0].Cue[1].Value).To(Equal("words"))
+			Expect(lyrics[0].Line[0].Cue[1].End).To(BeNil())
+
+			Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(3000))))
+			Expect(lyrics[0].Line[1].Value).To(Equal("Fallback line"))
+			Expect(lyrics[0].Line[1].Cue).To(BeNil())
+		})
+
 		It("should return unsynchronized lyrics from a file", func() {
 			mf := model.MediaFile{Path: "tests/fixtures/test.mp3"}
 			lyrics, err := fromExternalFile(ctx, &mf, ".txt")
@@ -146,6 +173,31 @@ var _ = Describe("sources", func() {
 			}))
 		})
 
+		It("should return synchronized lyrics from an SRT file", func() {
+			mf := model.MediaFile{Path: "tests/fixtures/test.mp3"}
+			lyrics, err := fromExternalFile(ctx, &mf, ".srt")
+
+			Expect(err).To(BeNil())
+			Expect(lyrics).To(Equal(model.LyricList{
+				model.Lyrics{
+					Lang: "xxx",
+					Line: []model.Line{
+						{
+							Start: gg.P(int64(18800)),
+							End:   gg.P(int64(22800)),
+							Value: "We're from subtitles",
+						},
+						{
+							Start: gg.P(int64(22801)),
+							End:   gg.P(int64(26000)),
+							Value: "Another subtitle line",
+						},
+					},
+					Synced: true,
+				},
+			}))
+		})
+
 		It("should return synchronized multilingual lyrics from a TTML file", func() {
 			mf := model.MediaFile{Path: "tests/fixtures/test.mp3"}
 			lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
diff --git a/core/lyrics/srt.go b/core/lyrics/srt.go
new file mode 100644
index 000000000..8fd77abb4
--- /dev/null
+++ b/core/lyrics/srt.go
@@ -0,0 +1,161 @@
+package lyrics
+
+import (
+	"bytes"
+	"regexp"
+	"strconv"
+	"strings"
+
+	"github.com/navidrome/navidrome/model"
+	"github.com/navidrome/navidrome/utils/str"
+)
+
+var srtTimeRegex = regexp.MustCompile(`^\s*(\d{1,2}):(\d{2}):(\d{2})[,.](\d{1,3})\s*$`)
+
+// parseSRT parses SubRip (.srt) contents into a single synced lyric track, or
+// nil when no usable cue is found. A leading UTF-8 BOM is stripped first: it
+// would otherwise hide the first cue's counter line and drop that cue.
+func parseSRT(contents []byte) (model.LyricList, error) {
+	raw := strings.TrimPrefix(string(contents), "\ufeff")
+	raw = strings.ReplaceAll(raw, "\r\n", "\n")
+	raw = strings.ReplaceAll(raw, "\r", "\n")
+	blocks := splitSRTBlocks(raw)
+	lines := make([]model.Line, 0, len(blocks))
+	for _, block := range blocks {
+		line, ok, err := parseSRTBlock(block)
+		if err != nil {
+			return nil, err
+		}
+		if ok {
+			lines = append(lines, line)
+		}
+	}
+	if len(lines) == 0 {
+		return nil, nil
+	}
+
+	return model.LyricList{model.NormalizeLyrics(model.Lyrics{
+		Lang:   "xxx",
+		Line:   lines,
+		Synced: true,
+	})}, nil
+}
+
+// splitSRTBlocks groups consecutive non-blank lines of raw into cue blocks. A line
+// holding only whitespace separates blocks too, so cues divided by " \n" are not merged.
+func splitSRTBlocks(raw string) []string {
+	var blocks, current []string
+	for _, line := range append(strings.Split(raw, "\n"), "") { // trailing "" flushes the last block
+		if strings.TrimSpace(line) != "" {
+			current = append(current, line)
+			continue
+		}
+		if len(current) > 0 {
+			blocks = append(blocks, strings.TrimSpace(strings.Join(current, "\n")))
+			current = nil
+		}
+	}
+	return blocks
+}
+
+// parseSRTBlock converts one SRT cue block into a model.Line. A block is an
+// optional all-digits counter line, a "start --> end" timing line and the cue
+// text. The bool result is false (with a nil error) when the timing line or
+// the text is missing; an error is returned only when a timestamp on the
+// timing line does not parse.
+func parseSRTBlock(block string) (model.Line, bool, error) {
+	rawLines := bytes.Split([]byte(block), []byte("\n"))
+	// bytes.Split with a non-empty separator always yields at least one element,
+	// so lines[0] below is safe even for an empty block.
+	lines := make([]string, len(rawLines))
+	for i, rawLine := range rawLines {
+		lines[i] = strings.TrimSpace(string(rawLine))
+	}
+
+	// Skip the optional cue counter that precedes the timing line.
+	timingIdx := 0
+	if digitsOnly(lines[0]) {
+		timingIdx = 1
+	}
+	if timingIdx >= len(lines) {
+		return model.Line{}, false, nil
+	}
+
+	bounds := strings.Split(lines[timingIdx], "-->")
+	if len(bounds) != 2 {
+		return model.Line{}, false, nil
+	}
+
+	startMs, err := parseSRTTime(bounds[0])
+	if err != nil {
+		return model.Line{}, false, err
+	}
+	endMs, err := parseSRTTime(bounds[1])
+	if err != nil {
+		return model.Line{}, false, err
+	}
+
+	// Keep only the non-empty text lines and join them with newlines.
+	textLines := make([]string, 0, len(lines)-timingIdx-1)
+	for _, line := range lines[timingIdx+1:] {
+		if line != "" {
+			textLines = append(textLines, line)
+		}
+	}
+
+	value := str.SanitizeText(strings.Join(textLines, "\n"))
+	if value == "" {
+		return model.Line{}, false, nil
+	}
+
+	return model.Line{
+		Start: &startMs,
+		End:   &endMs,
+		Value: value,
+	}, true, nil
+}
+
+// parseSRTTime converts an SRT timestamp such as "00:01:02,500" (',' or '.'
+// before the fraction) into milliseconds. A fraction with fewer than three
+// digits is read as a decimal fraction, so "1:02:03,5" ends in 500 ms.
+// Surrounding whitespace is ignored. strconv.ErrSyntax is returned when value
+// does not match srtTimeRegex.
+func parseSRTTime(value string) (int64, error) {
+	groups := srtTimeRegex.FindStringSubmatch(strings.TrimSpace(value))
+	if groups == nil {
+		return 0, strconv.ErrSyntax
+	}
+
+	// fields holds hours, minutes, seconds and the raw fraction, in that order.
+	// srtTimeRegex only admits digits there; a ParseInt error is still propagated.
+	var fields [4]int64
+	for i := range fields {
+		parsed, err := strconv.ParseInt(groups[i+1], 10, 64)
+		if err != nil {
+			return 0, err
+		}
+		fields[i] = parsed
+	}
+
+	hours, minutes, seconds, millis := fields[0], fields[1], fields[2], fields[3]
+
+	// Scale a one- or two-digit fraction up to milliseconds.
+	for digits := len(groups[4]); digits < 3; digits++ {
+		millis *= 10
+	}
+
+	totalSeconds := (hours*60+minutes)*60 + seconds
+	return totalSeconds*1000 + millis, nil
+}
+
+// digitsOnly reports whether value is non-empty and made up solely of the
+// ASCII digits '0' through '9'.
+func digitsOnly(value string) bool {
+	for i := 0; i < len(value); i++ {
+		if c := value[i]; c < '0' || c > '9' {
+			return false
+		}
+	}
+	// An empty string has no digits at all and is rejected.
+	return value != ""
+}
diff --git a/core/lyrics/ttml.go b/core/lyrics/ttml.go
index a0bdcac5a..e79dfe846 100644
--- a/core/lyrics/ttml.go
+++ b/core/lyrics/ttml.go
@@ -46,6 +46,7 @@ type ttmlTimingParams struct {
 type ttmlTimingContext struct {
 	lang     string
 	role     string
+	agentID  string
 	begin    int64
 	hasBegin bool
 	end      int64
@@ -70,6 +71,12 @@ type ttmlResolvedMetadataLine struct {
 	line  model.Line
 }
 
+type ttmlDefinedAgent struct {
+	ID   string // trimmed value of the agent element's "id" attribute
+	Type string // trimmed, lower-cased "type" attribute ("group" is treated specially)
+	Name string // first non-empty <name> child text; "" when none was found
+}
+
 type ttmlParser struct {
 	decoder *xml.Decoder
 	params  ttmlTimingParams
@@ -86,6 +93,8 @@ type ttmlParser struct {
 	pronunciationLangOrder   []string
 	pronunciationEntriesByLg map[string][]ttmlMetadataEntry
 
+	definedAgents map[string]ttmlDefinedAgent
+
 	metadataSeq int
 }
 
@@ -103,6 +112,7 @@ func parseTTML(contents []byte) (model.LyricList, error) {
 		mainLineRefsByKey:        make(map[string]ttmlLineRef),
 		translationEntriesByLg:   make(map[string][]ttmlMetadataEntry),
 		pronunciationEntriesByLg: make(map[string][]ttmlMetadataEntry),
+		definedAgents:            make(map[string]ttmlDefinedAgent),
 	}
 
 	root := ttmlTimingContext{lang: "xxx"}
@@ -140,6 +150,8 @@ func (p *ttmlParser) parseElement(start xml.StartElement, parent ttmlTimingConte
 		return p.parseMetadataTrack(start, parent, ttmlLyricKindTranslation)
 	case "transliteration":
 		return p.parseMetadataTrack(start, parent, ttmlLyricKindPronunciation)
+	case "agent":
+		return p.parseAgentDefinition(start)
 	}
 
 	ctx := p.childContext(start.Attr, parent)
@@ -234,6 +246,49 @@ func (p *ttmlParser) parseMetadataTrack(start xml.StartElement, parent ttmlTimin
 	}
 }
 
+// parseAgentDefinition reads an <agent> element and stores it in p.definedAgents
+// under its id. The name is the first non-empty <name> child; other child
+// elements are skipped, and an agent without a usable id is skipped entirely.
+func (p *ttmlParser) parseAgentDefinition(start xml.StartElement) error {
+	agentID, found := attrValue(start.Attr, "id")
+	agentID = strings.TrimSpace(agentID)
+	if !found || agentID == "" {
+		return p.skipElement(start)
+	}
+	agent := ttmlDefinedAgent{
+		ID:   agentID,
+		Type: strings.ToLower(strings.TrimSpace(attrOrEmpty(start.Attr, "type"))),
+	}
+
+	for {
+		tok, err := p.decoder.Token()
+		if err != nil {
+			return err
+		}
+		switch el := tok.(type) {
+		case xml.EndElement:
+			if strings.EqualFold(el.Name.Local, start.Name.Local) {
+				p.definedAgents[agent.ID] = agent
+				return nil
+			}
+		case xml.StartElement:
+			if !strings.EqualFold(el.Name.Local, "name") {
+				if err := p.skipElement(el); err != nil {
+					return err
+				}
+				continue
+			}
+			name, err := p.collectElementText(el)
+			if err != nil {
+				return err
+			}
+			if name = sanitizeTTMLText(name); name != "" && agent.Name == "" {
+				agent.Name = name
+			}
+		}
+	}
+}
+
 func (p *ttmlParser) parseMetadataText(start xml.StartElement, parent ttmlTimingContext) (ttmlMetadataEntry, bool, error) {
 	forKey, hasFor := attrValue(start.Attr, "for")
 	forKey = strings.TrimSpace(forKey)
@@ -338,8 +393,8 @@ func (p *ttmlParser) parseInlineElement(start xml.StartElement, parent ttmlTimin
 			tokenText := sanitizeTTMLText(value)
 			if local == "span" && hasOwnTiming && !ctx.invalid && tokenText != "" && len(tokens) == 0 {
 				parsedToken := model.Cue{
-					Value: tokenText,
-					Role:  ctx.role,
+					Value:   tokenText,
+					AgentID: p.resolveCueAgentID(ctx),
 				}
 				if ctx.hasBegin {
 					startMs := ctx.begin
@@ -366,12 +421,12 @@ func (p *ttmlParser) toLyricList() model.LyricList {
 		if len(lines) == 0 {
 			continue
 		}
-		res = append(res, model.Lyrics{
+		res = append(res, p.finalizeLyrics(model.Lyrics{
 			Kind:   ttmlLyricKindMain,
 			Lang:   lang,
 			Line:   lines,
 			Synced: linesAreSynced(lines),
-		})
+		}))
 	}
 
 	res = append(res, p.buildMetadataLyrics(ttmlLyricKindTranslation, p.translationLangOrder, p.translationEntriesByLg)...)
@@ -440,17 +495,168 @@ func (p *ttmlParser) buildMetadataLyrics(kind string, langOrder []string, entrie
 			lines[i] = resolved[i].line
 		}
 
-		res = append(res, model.Lyrics{
+		res = append(res, p.finalizeLyrics(model.Lyrics{
 			Kind:   kind,
 			Lang:   lang,
 			Line:   lines,
 			Synced: linesAreSynced(lines),
-		})
+		}))
 	}
 
 	return res
 }
 
+func (p *ttmlParser) finalizeLyrics(lyrics model.Lyrics) model.Lyrics {
+	lyrics.Line = model.NormalizeCueLines(lyrics.Line)
+	lyrics.Line, lyrics.Agents = p.resolveAgents(lyrics.Line) // fills empty cue AgentIDs and builds the agent list
+	return model.NormalizeLyrics(lyrics)
+}
+
+// resolveAgents passes lines through model.NormalizeCueLines, chooses the agent
+// that plays the "main" role, assigns that agent to every cue without an
+// AgentID, and returns the lines together with one model.Agent per agent id in
+// first-seen order (a synthetic "main" agent, when needed, comes first). The
+// agent list is nil when lines is empty or no cue carries an AgentID.
+func (p *ttmlParser) resolveAgents(lines []model.Line) ([]model.Line, []model.Agent) {
+	if len(lines) == 0 {
+		return lines, nil
+	}
+
+	normalized := model.NormalizeCueLines(lines)
+	usedOrder := make([]string, 0, 4)
+	usedSet := make(map[string]struct{}, 4)
+	sawEmptyCue := false
+	for i := range normalized {
+		for j := range normalized[i].Cue {
+			agentID := strings.TrimSpace(normalized[i].Cue[j].AgentID)
+			if agentID == "" {
+				sawEmptyCue = true
+			} else if _, seen := usedSet[agentID]; !seen {
+				usedSet[agentID] = struct{}{}
+				usedOrder = append(usedOrder, agentID)
+			}
+		}
+	}
+	if len(usedOrder) == 0 {
+		return normalized, nil
+	}
+
+	// Main agent preference: (1) the first agent that is neither background nor
+	// group, (2) a synthetic "main" when some cues have no agent, (3) the first
+	// agent that is not background, (4) the first agent seen.
+	mainID := ""
+	for _, agentID := range usedOrder {
+		if role := p.baseRoleForAgent(agentID); role != "bg" && role != "group" {
+			mainID = agentID
+			break
+		}
+	}
+	if mainID == "" && sawEmptyCue {
+		mainID = "main"
+	}
+	if mainID == "" {
+		for _, agentID := range usedOrder {
+			if p.baseRoleForAgent(agentID) != "bg" {
+				mainID = agentID
+				break
+			}
+		}
+	}
+	if mainID == "" {
+		mainID = usedOrder[0]
+	}
+
+	// A synthetic main agent was not seen on any cue, so list it first.
+	if _, seen := usedSet[mainID]; !seen {
+		usedSet[mainID] = struct{}{}
+		usedOrder = append([]string{mainID}, usedOrder...)
+	}
+
+	// Cues without an agent are attributed to the main agent.
+	for i := range normalized {
+		for j := range normalized[i].Cue {
+			if cue := &normalized[i].Cue[j]; strings.TrimSpace(cue.AgentID) == "" {
+				cue.AgentID = mainID
+			}
+		}
+	}
+
+	agents := make([]model.Agent, 0, len(usedOrder))
+	for _, agentID := range usedOrder {
+		role := p.baseRoleForAgent(agentID)
+		if agentID == mainID {
+			role = "main"
+		}
+		agents = append(agents, model.Agent{ID: agentID, Role: role, Name: p.agentNameForID(agentID)})
+	}
+
+	return normalized, agents
+}
+
+func (p *ttmlParser) resolveCueAgentID(ctx ttmlTimingContext) string {
+	agentID := strings.TrimSpace(ctx.agentID)
+	if contextHasRole(ctx.role, "x-bg") { // cues under an "x-bg" role get the derived "<agent>__bg" id
+		if agentID == "" {
+			agentID = "main" // no agent in scope: derive the background id from "main"
+		}
+		return backgroundAgentID(agentID)
+	}
+	return agentID
+}
+
+// baseRoleForAgent classifies agentID before any "main" promotion: "bg" for a
+// background id, "group" for a defined agent whose type is "group", and
+// "voice" for everything else, including ids that were never defined.
+func (p *ttmlParser) baseRoleForAgent(agentID string) string {
+	if isBackgroundAgentID(agentID) {
+		return "bg"
+	}
+
+	// A missing map entry yields the zero ttmlDefinedAgent, whose Type is "",
+	// so undefined agents fall through to "voice".
+	if p.definedAgents[agentID].Type == "group" {
+		return "group"
+	}
+
+	return "voice"
+}
+
+// agentNameForID returns the name of the defined agent behind agentID, or ""
+// when there is none. A background id resolves to the name of its base agent;
+// the background of the "main" agent never has a name.
+func (p *ttmlParser) agentNameForID(agentID string) string {
+	lookupID := agentID
+	if isBackgroundAgentID(agentID) {
+		lookupID = strings.TrimSuffix(agentID, "__bg")
+		if lookupID == "main" {
+			return ""
+		}
+	}
+
+	agent, defined := p.definedAgents[lookupID]
+	if !defined {
+		return ""
+	}
+	return agent.Name
+}
+
+func backgroundAgentID(agentID string) string {
+	return agentID + "__bg" // the "__bg" suffix marks the background layer derived from agentID
+}
+
+func isBackgroundAgentID(agentID string) bool {
+	return strings.HasSuffix(agentID, "__bg") // same suffix that backgroundAgentID appends
+}
+
+func contextHasRole(roles string, role string) bool {
+	for _, candidate := range strings.Fields(strings.ToLower(roles)) { // roles is whitespace-separated; both sides are lower-cased
+		if candidate == strings.ToLower(role) {
+			return true
+		}
+	}
+	return false
+}
+
 func (p *ttmlParser) addMainLine(lang string, lineKey string, line model.Line) {
 	lang = normalizeTTMLLang(lang)
 	if _, ok := p.mainLinesByLang[lang]; !ok {
@@ -495,6 +701,9 @@ func (p *ttmlParser) childContext(attrs []xml.Attr, parent ttmlTimingContext) tt
 	if lang, ok := attrValue(attrs, "lang"); ok {
 		ctx.lang = normalizeTTMLLang(lang)
 	}
+	if agentID, ok := attrValue(attrs, "agent"); ok {
+		ctx.agentID = strings.TrimSpace(agentID)
+	}
 	if role, ok := attrValue(attrs, "role"); ok {
 		role = strings.TrimSpace(role)
 		if role != "" {
@@ -805,6 +1014,55 @@ func attrValue(attrs []xml.Attr, key string) (string, bool) {
 	return "", false
 }
 
+func attrOrEmpty(attrs []xml.Attr, key string) string {
+	value, _ := attrValue(attrs, key) // found flag dropped on purpose: a missing attribute reads as ""
+	return value
+}
+
+// collectElementText reads tokens until the end tag matching start (local name,
+// case-insensitive) and returns all character data seen, including in children.
+func (p *ttmlParser) collectElementText(start xml.StartElement) (string, error) {
+	var sb strings.Builder
+	for {
+		tok, err := p.decoder.Token()
+		if err != nil {
+			return "", err
+		}
+		switch node := tok.(type) {
+		case xml.CharData:
+			sb.Write(node)
+		case xml.EndElement:
+			if strings.EqualFold(node.Name.Local, start.Name.Local) {
+				return sb.String(), nil
+			}
+		case xml.StartElement:
+			nested, err := p.collectElementText(node)
+			if err != nil {
+				return "", err
+			}
+			sb.WriteString(nested)
+		}
+	}
+}
+
+// skipElement discards tokens until the end tag balancing an already-consumed
+// start tag has been read; the start element itself is not inspected.
+func (p *ttmlParser) skipElement(_ xml.StartElement) error {
+	for open := 1; open > 0; {
+		tok, err := p.decoder.Token()
+		if err != nil {
+			return err
+		}
+		switch tok.(type) {
+		case xml.EndElement:
+			open--
+		case xml.StartElement:
+			open++
+		}
+	}
+	return nil
+}
+
 func normalizeTTMLLang(lang string) string {
 	lang = strings.ToLower(strings.TrimSpace(lang))
 	if lang == "" {
@@ -840,42 +1098,7 @@ func linesAreSynced(lines []model.Line) bool {
 }
 
 func hydrateLineTimingFromTokens(line model.Line) model.Line {
-	if len(line.Cue) == 0 {
-		return line
-	}
-
-	var earliestStart *int64
-	var latestEnd *int64
-	for i := range line.Cue {
-		token := line.Cue[i]
-		if token.Start != nil {
-			if earliestStart == nil || *token.Start < *earliestStart {
-				v := *token.Start
-				earliestStart = &v
-			}
-		}
-
-		candidateEnd := token.End
-		if candidateEnd == nil {
-			candidateEnd = token.Start
-		}
-		if candidateEnd != nil {
-			if latestEnd == nil || *candidateEnd > *latestEnd {
-				v := *candidateEnd
-				latestEnd = &v
-			}
-		}
-	}
-
-	if line.Start == nil && earliestStart != nil {
-		v := *earliestStart
-		line.Start = &v
-	}
-	if line.End == nil && latestEnd != nil {
-		v := *latestEnd
-		line.End = &v
-	}
-	return line
+	return model.NormalizeLineTiming(line)
 }
 
 func max(v float64, fallback float64) float64 {
diff --git a/core/lyrics/ttml_test.go b/core/lyrics/ttml_test.go
index 8ec16f679..5fc484a3b 100644
--- a/core/lyrics/ttml_test.go
+++ b/core/lyrics/ttml_test.go
@@ -129,6 +129,10 @@ var _ = Describe("parseTTML", func() {
 			list, err := parseTTML(content)
 			Expect(err).ToNot(HaveOccurred())
 			Expect(list).To(HaveLen(1))
+			Expect(list[0].Agents).To(Equal([]model.Agent{
+				{ID: "main", Role: "main"},
+				{ID: "main__bg", Role: "bg"},
+			}))
 			Expect(list[0].Line).To(HaveLen(1))
 
 			line := list[0].Line[0]
@@ -137,9 +141,41 @@ var _ = Describe("parseTTML", func() {
 			Expect(line.End).To(Equal(gg.P(int64(3000))))
 			Expect(line.Cue).To(HaveLen(3))
 
-			Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(1000)), End: gg.P(int64(1400)), Value: "He"}))
-			Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(1400)), End: gg.P(int64(1800)), Value: "llo"}))
-			Expect(line.Cue[2]).To(Equal(model.Cue{Start: gg.P(int64(2000)), End: gg.P(int64(2500)), Value: "echo", Role: "x-bg"}))
+			Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(1000)), End: gg.P(int64(1400)), Value: "He", AgentID: "main"}))
+			Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(1400)), End: gg.P(int64(1800)), Value: "llo", AgentID: "main"}))
+			Expect(line.Cue[2]).To(Equal(model.Cue{Start: gg.P(int64(2000)), End: gg.P(int64(2500)), Value: "echo", AgentID: "main__bg"}))
+		})
+
+		It("should parse named TTML agents into main, voice, and group roles", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttm="http://www.w3.org/ns/ttml#metadata">
+  <head>
+    <metadata>
+      <ttm:agent xml:id="v1" type="person"><ttm:name>Chris Martin</ttm:name></ttm:agent>
+      <ttm:agent xml:id="v2" type="person"><ttm:name>Jin</ttm:name></ttm:agent>
+      <ttm:agent xml:id="v1000" type="group"><ttm:name>All</ttm:name></ttm:agent>
+    </metadata>
+  </head>
+  <body xml:lang="eng">
+    <div>
+      <p begin="1s" end="2s" ttm:agent="v1"><span begin="1s" end="1.5s">You</span></p>
+      <p begin="2s" end="3s" ttm:agent="v2"><span begin="2s" end="2.5s">and</span></p>
+      <p begin="3s" end="4s" ttm:agent="v1000"><span begin="3s" end="3.5s">All</span></p>
+    </div>
+  </body>
+</tt>`)
+
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(1))
+			Expect(list[0].Agents).To(Equal([]model.Agent{
+				{ID: "v1", Role: "main", Name: "Chris Martin"},
+				{ID: "v2", Role: "voice", Name: "Jin"},
+				{ID: "v1000", Role: "group", Name: "All"},
+			}))
+			Expect(list[0].Line[0].Cue[0].AgentID).To(Equal("v1"))
+			Expect(list[0].Line[1].Cue[0].AgentID).To(Equal("v2"))
+			Expect(list[0].Line[2].Cue[0].AgentID).To(Equal("v1000"))
 		})
 	})
 
diff --git a/model/lyrics.go b/model/lyrics.go
index 9fcd4992e..725c3aa94 100644
--- a/model/lyrics.go
+++ b/model/lyrics.go
@@ -12,10 +12,16 @@ import (
 )
 
 type Cue struct {
-	Start *int64 `structs:"start,omitempty" json:"start,omitempty"`
-	End   *int64 `structs:"end,omitempty"   json:"end,omitempty"`
-	Value string `structs:"value"           json:"value"`
-	Role  string `structs:"role,omitempty"  json:"role,omitempty"`
+	Start   *int64 `structs:"start,omitempty"   json:"start,omitempty"`
+	End     *int64 `structs:"end,omitempty"     json:"end,omitempty"`
+	Value   string `structs:"value"             json:"value"`
+	AgentID string `structs:"agentId,omitempty" json:"agentId,omitempty"`
+}
+
+type Agent struct {
+	ID   string `structs:"id"             json:"id"`             // agent identifier; Cue.AgentID refers to it
+	Role string `structs:"role"           json:"role"`           // the TTML parser assigns "main", "bg", "group" or "voice"
+	Name string `structs:"name,omitempty" json:"name,omitempty"` // optional display name, omitted when empty
 }
 
 type Line struct {
@@ -26,13 +32,14 @@ type Line struct {
 }
 
 type Lyrics struct {
-	DisplayArtist string `structs:"displayArtist,omitempty" json:"displayArtist,omitempty"`
-	DisplayTitle  string `structs:"displayTitle,omitempty"  json:"displayTitle,omitempty"`
-	Kind          string `structs:"kind,omitempty"          json:"kind,omitempty"`
-	Lang          string `structs:"lang"                    json:"lang"`
-	Line          []Line `structs:"line"                    json:"line"`
-	Offset        *int64 `structs:"offset,omitempty"        json:"offset,omitempty"`
-	Synced        bool   `structs:"synced"                  json:"synced"`
+	DisplayArtist string  `structs:"displayArtist,omitempty" json:"displayArtist,omitempty"`
+	DisplayTitle  string  `structs:"displayTitle,omitempty"  json:"displayTitle,omitempty"`
+	Kind          string  `structs:"kind,omitempty"          json:"kind,omitempty"`
+	Lang          string  `structs:"lang"                    json:"lang"`
+	Agents        []Agent `structs:"agents,omitempty"       json:"agents,omitempty"`
+	Line          []Line  `structs:"line"                    json:"line"`
+	Offset        *int64  `structs:"offset,omitempty"        json:"offset,omitempty"`
+	Synced        bool    `structs:"synced"                  json:"synced"`
 }
 
 // support the standard [mm:ss.mm], as well as [hh:*] and [*.mmm]
@@ -199,7 +206,7 @@ func ToLyrics(language, text string) (*Lyrics, error) {
 		DisplayArtist: artist,
 		DisplayTitle:  title,
 		Lang:          language,
-		Line:          structuredLines,
+		Line:          NormalizeCueLines(structuredLines),
 		Offset:        offset,
 		Synced:        synced,
 	}
@@ -265,11 +272,6 @@ func parseEnhancedCues(text string) []Cue {
 			Start: &start,
 			Value: seg.text,
 		}
-		// Derive End from the next cue's Start
-		if i+1 < len(segments) {
-			end := segments[i+1].start
-			cues[i].End = &end
-		}
 	}
 	return cues
 }
@@ -338,3 +340,127 @@ func parseTime(line string, match []int) (int64, error) {
 }
 
 type LyricList []Lyrics
+
+func NormalizeLyrics(lyrics Lyrics) Lyrics {
+	lyrics.Line = NormalizeCueLines(lyrics.Line) // cue End fill-or-clear plus line timing hydration
+	if len(lyrics.Agents) == 0 {
+		lyrics.Agents = nil // collapse an empty, non-nil slice to nil
+	}
+	return lyrics
+}
+
+// NormalizeCueLines returns a new slice in which every line has been passed
+// through normalizeCueLine. The fallback end handed to each line is its own End
+// or, failing that, the still-unnormalized Start of the following line.
+func NormalizeCueLines(lines []Line) []Line {
+	if len(lines) == 0 {
+		return lines
+	}
+
+	out := make([]Line, 0, len(lines))
+	for idx, current := range lines {
+		var fallback *int64
+		switch {
+		case current.End != nil:
+			end := *current.End
+			fallback = &end
+		case idx+1 < len(lines) && lines[idx+1].Start != nil:
+			nextStart := *lines[idx+1].Start
+			fallback = &nextStart
+		}
+		out = append(out, normalizeCueLine(current, fallback))
+	}
+	return out
+}
+
+// NormalizeLineTiming fills in a line's missing Start and End from its cues.
+// Start becomes the earliest cue Start; End becomes the latest cue End, where a
+// cue without an End contributes its Start instead. Values already set on the
+// line are kept, and a line without cues is returned unchanged.
+func NormalizeLineTiming(line Line) Line {
+	if len(line.Cue) == 0 {
+		return line
+	}
+
+	var (
+		minStart, maxEnd   int64
+		haveStart, haveEnd bool
+	)
+	for _, cue := range line.Cue {
+		if cue.Start != nil && (!haveStart || *cue.Start < minStart) {
+			minStart, haveStart = *cue.Start, true
+		}
+
+		// A cue lacking an End still bounds the line by its own Start.
+		bound := cue.End
+		if bound == nil {
+			bound = cue.Start
+		}
+		if bound != nil && (!haveEnd || *bound > maxEnd) {
+			maxEnd, haveEnd = *bound, true
+		}
+	}
+
+	// Only gaps are filled. Pointing at the locals gives the line storage of its
+	// own rather than aliasing a cue's Start or End.
+	if line.Start == nil && haveStart {
+		line.Start = &minStart
+	}
+	if line.End == nil && haveEnd {
+		line.End = &maxEnd
+	}
+	return line
+}
+
+// normalizeCueLine makes cue End values all-or-nothing, then hydrates the line
+// timing. If any cue has an End, each missing one is taken from the next cue's
+// Start, else from fallbackEnd; if none has an End, or a gap stays unfilled,
+// every End is cleared. Ends are filled on a copy because line.Cue shares its
+// backing array with the caller's slice and must not be modified through it.
+func normalizeCueLine(line Line, fallbackEnd *int64) Line {
+	if len(line.Cue) == 0 {
+		return line
+	}
+
+	hasAnyEnd := false
+	for i := range line.Cue {
+		if line.Cue[i].End != nil {
+			hasAnyEnd = true
+			break
+		}
+	}
+	if !hasAnyEnd {
+		line.Cue = clearCueEnds(line.Cue)
+		return NormalizeLineTiming(line)
+	}
+
+	filled := make([]Cue, len(line.Cue))
+	copy(filled, line.Cue)
+	for i := range filled {
+		switch {
+		case filled[i].End != nil: // already timed
+		case i+1 < len(filled) && filled[i+1].Start != nil:
+			v := *filled[i+1].Start
+			filled[i].End = &v
+		case fallbackEnd != nil:
+			v := *fallbackEnd
+			filled[i].End = &v
+		default:
+			// Unfillable gap: fall back to the untimed form of the original cues.
+			line.Cue = clearCueEnds(line.Cue)
+			return NormalizeLineTiming(line)
+		}
+	}
+
+	line.Cue = filled
+	return NormalizeLineTiming(line)
+}
+
+// clearCueEnds returns a copy of cues with every End reset to nil; the input slice is not modified.
+func clearCueEnds(cues []Cue) []Cue {
+	stripped := append(make([]Cue, 0, len(cues)), cues...)
+	for i := range stripped {
+		stripped[i].End = nil
+	}
+	return stripped
+}
diff --git a/model/lyrics_test.go b/model/lyrics_test.go
index 2228306d0..9aad7d968 100644
--- a/model/lyrics_test.go
+++ b/model/lyrics_test.go
@@ -129,8 +129,8 @@ var _ = Describe("ToLyrics", func() {
 		Expect(line0.Start).To(Equal(&t1000))
 		Expect(line0.Value).To(Equal("Some lyrics here"))
 		Expect(line0.Cue).To(Equal([]Cue{
-			{Start: &t1000, End: &t1500, Value: "Some "},
-			{Start: &t1500, End: &t2000, Value: "lyrics "},
+			{Start: &t1000, Value: "Some "},
+			{Start: &t1500, Value: "lyrics "},
 			{Start: &t2000, Value: "here"},
 		}))
 
@@ -138,7 +138,7 @@ var _ = Describe("ToLyrics", func() {
 		Expect(line1.Start).To(Equal(&t3000))
 		Expect(line1.Value).To(Equal("More words"))
 		Expect(line1.Cue).To(Equal([]Cue{
-			{Start: &t3000, End: &t3500, Value: "More "},
+			{Start: &t3000, Value: "More "},
 			{Start: &t3500, Value: "words"},
 		}))
 	})
@@ -161,7 +161,7 @@ var _ = Describe("ToLyrics", func() {
 		t1000, t1500, t5000, t5500 := int64(1000), int64(1500), int64(5000), int64(5500)
 
 		Expect(lyrics.Line[0].Cue).To(Equal([]Cue{
-			{Start: &t1000, End: &t1500, Value: "Some "},
+			{Start: &t1000, Value: "Some "},
 			{Start: &t1500, Value: "lyrics"},
 		}))
 		Expect(lyrics.Line[0].Value).To(Equal("Some lyrics"))
@@ -170,7 +170,7 @@ var _ = Describe("ToLyrics", func() {
 		Expect(lyrics.Line[1].Value).To(Equal("Plain line"))
 
 		Expect(lyrics.Line[2].Cue).To(Equal([]Cue{
-			{Start: &t5000, End: &t5500, Value: "More "},
+			{Start: &t5000, Value: "More "},
 			{Start: &t5500, Value: "words"},
 		}))
 		Expect(lyrics.Line[2].Value).To(Equal("More words"))
diff --git a/server/subsonic/helpers.go b/server/subsonic/helpers.go
index b881fa169..ad769ee94 100644
--- a/server/subsonic/helpers.go
+++ b/server/subsonic/helpers.go
@@ -493,14 +493,22 @@ func mapExplicitStatus(explicitStatus string) string {
 	return ""
 }
 
-// sanitizeRole strips the TTML x- prefix from role values for the API.
-func sanitizeRole(role string) string {
-	return strings.TrimPrefix(role, "x-")
-}
-
 func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced bool) responses.StructuredLyric {
 	lines := make([]responses.Line, len(lyrics.Line))
 	var cueLines []responses.CueLine
+	agentOrderByID := make(map[string]int, len(lyrics.Agents))
+	agentRoleByID := make(map[string]string, len(lyrics.Agents))
+	responseAgents := make([]responses.Agent, 0, len(lyrics.Agents))
+
+	for i, agent := range lyrics.Agents {
+		agentOrderByID[agent.ID] = i
+		agentRoleByID[agent.ID] = agent.Role
+		responseAgents = append(responseAgents, responses.Agent{
+			ID:   agent.ID,
+			Role: agent.Role,
+			Name: agent.Name,
+		})
+	}
 
 	for i, line := range lyrics.Line {
 		lines[i] = responses.Line{
@@ -511,41 +519,50 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced boo
 			continue
 		}
 
-		// Group cues by role, preserving order of first appearance
-		roleOrder := make([]string, 0, 2)
-		cuesByRole := make(map[string][]responses.LyricCue)
+		agentOrder := make([]string, 0, 2)
+		cuesByAgent := make(map[string][]model.Cue)
 		for _, cue := range line.Cue {
 			if cue.Start == nil {
 				continue
 			}
-			role := sanitizeRole(cue.Role)
-			if _, exists := cuesByRole[role]; !exists {
-				roleOrder = append(roleOrder, role)
+			agentID := strings.TrimSpace(cue.AgentID)
+			if _, exists := cuesByAgent[agentID]; !exists {
+				agentOrder = append(agentOrder, agentID)
 			}
-			cuesByRole[role] = append(cuesByRole[role], responses.LyricCue{
-				Start: *cue.Start,
-				End:   cue.End,
-				Value: cue.Value,
-			})
+			cuesByAgent[agentID] = append(cuesByAgent[agentID], cue)
 		}
 
-		// Ensure main vocals (empty role) always comes first
-		sort.SliceStable(roleOrder, func(i, j int) bool {
-			return roleOrder[i] == "" && roleOrder[j] != ""
+		sort.SliceStable(agentOrder, func(i, j int) bool {
+			leftRole := agentRoleByID[agentOrder[i]]
+			rightRole := agentRoleByID[agentOrder[j]]
+			if leftRole == "main" && rightRole != "main" {
+				return true
+			}
+			if rightRole == "main" && leftRole != "main" {
+				return false
+			}
+
+			leftOrder, leftOK := agentOrderByID[agentOrder[i]]
+			rightOrder, rightOK := agentOrderByID[agentOrder[j]]
+			if leftOK && rightOK && leftOrder != rightOrder {
+				return leftOrder < rightOrder
+			}
+			if leftOK != rightOK {
+				return leftOK
+			}
+			return i < j
 		})
 
-		// Create a separate CueLine for each role group
-		for _, role := range roleOrder {
-			cues := cuesByRole[role]
+		for _, agentID := range agentOrder {
 			cueLine := responses.CueLine{
 				Index: int32(i),
 				Start: line.Start,
 				End:   line.End,
 				Value: line.Value,
-				Cue:   cues,
+				Cue:   buildLyricCues(cuesByAgent[agentID], line.End),
 			}
-			if role != "" {
-				cueLine.Role = role
+			if agentID != "" {
+				cueLine.AgentID = agentID
 			}
 			cueLines = append(cueLines, cueLine)
 		}
@@ -567,6 +584,9 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced boo
 			kind = "main"
 		}
 		structured.Kind = kind
+		if len(cueLines) > 0 && len(responseAgents) > 0 {
+			structured.Agents = responseAgents
+		}
 	}
 
 	if structured.DisplayArtist == "" {
@@ -579,6 +599,67 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced boo
 	return structured
 }
 
+// buildLyricCues converts one agent's cues into response cues, dropping cues
+// without a Start. Ends are emitted only if some input cue has one: a missing
+// End falls back to the next cue's Start, then lineEnd; Ends are capped at the
+// next cue's Start, floored at the cue's own Start, and all dropped if any is missing.
+func buildLyricCues(cues []model.Cue, lineEnd *int64) []responses.LyricCue {
+	if len(cues) == 0 {
+		return nil
+	}
+
+	withEnds := false
+	for _, c := range cues {
+		if c.End != nil {
+			withEnds = true
+			break
+		}
+	}
+
+	out := make([]responses.LyricCue, 0, len(cues))
+	missingEnd := false
+	for idx, src := range cues {
+		if src.Start == nil {
+			continue
+		}
+		item := responses.LyricCue{Start: *src.Start, Value: src.Value}
+		if withEnds {
+			var nextStart *int64
+			if idx+1 < len(cues) {
+				nextStart = cues[idx+1].Start
+			}
+			end := src.End
+			switch {
+			case end != nil: // the cue's own End is the starting point
+			case nextStart != nil:
+				v := *nextStart
+				end = &v
+			case lineEnd != nil:
+				v := *lineEnd
+				end = &v
+			}
+			if end != nil && nextStart != nil && *end > *nextStart {
+				v := *nextStart
+				end = &v
+			}
+			if end != nil && *end < item.Start {
+				v := item.Start
+				end = &v
+			}
+			item.End = end
+			missingEnd = missingEnd || end == nil
+		}
+		out = append(out, item)
+	}
+
+	if missingEnd {
+		for i := range out {
+			out[i].End = nil
+		}
+	}
+	return out
+}
+
 func buildLyricsList(mf *model.MediaFile, lyricsList model.LyricList, enhanced bool) *responses.LyricsList {
 	var filtered model.LyricList
 	if enhanced {
diff --git a/server/subsonic/media_retrieval_test.go b/server/subsonic/media_retrieval_test.go
index 0fdbb3854..5489492ce 100644
--- a/server/subsonic/media_retrieval_test.go
+++ b/server/subsonic/media_retrieval_test.go
@@ -235,6 +235,7 @@ var _ = Describe("MediaRetrievalController", func() {
 				Expect(realLyric.Kind).To(Equal(expectedLyric.Kind))
 				Expect(realLyric.Lang).To(Equal(expectedLyric.Lang))
 				Expect(realLyric.Synced).To(Equal(expectedLyric.Synced))
+				Expect(realLyric.Agents).To(Equal(expectedLyric.Agents))
 
 				if expectedLyric.Offset == nil {
 					Expect(realLyric.Offset).To(BeNil())
@@ -259,7 +260,7 @@ var _ = Describe("MediaRetrievalController", func() {
 					expectedCueLine := expectedLyric.CueLine[j]
 					Expect(realCueLine.Index).To(Equal(expectedCueLine.Index))
 					Expect(realCueLine.Value).To(Equal(expectedCueLine.Value))
-					Expect(realCueLine.Role).To(Equal(expectedCueLine.Role))
+					Expect(realCueLine.AgentID).To(Equal(expectedCueLine.AgentID))
 					if expectedCueLine.Start == nil {
 						Expect(realCueLine.Start).To(BeNil())
 					} else {
@@ -542,6 +543,7 @@ var _ = Describe("MediaRetrievalController", func() {
 			lyricsJson, err := json.Marshal(model.LyricList{
 				{
 					Lang:   "eng",
+					Agents: []model.Agent{{ID: "lead", Role: "main"}, {ID: "lead__bg", Role: "bg"}},
 					Synced: true,
 					Line: []model.Line{
 						{
@@ -550,15 +552,16 @@ var _ = Describe("MediaRetrievalController", func() {
 							Value: "Hello echo",
 							Cue: []model.Cue{
 								{
-									Start: &tokenStartA,
-									End:   &tokenEndA,
-									Value: "Hello",
+									Start:   &tokenStartA,
+									End:     &tokenEndA,
+									Value:   "Hello",
+									AgentID: "lead",
 								},
 								{
-									Start: &tokenStartB,
-									End:   &tokenEndB,
-									Value: "echo",
-									Role:  "x-bg",
+									Start:   &tokenStartB,
+									End:     &tokenEndB,
+									Value:   "echo",
+									AgentID: "lead__bg",
 								},
 							},
 						},
@@ -586,6 +589,10 @@ var _ = Describe("MediaRetrievalController", func() {
 						Kind:          "main",
 						Lang:          "eng",
 						Synced:        true,
+						Agents: []responses.Agent{
+							{ID: "lead", Role: "main"},
+							{ID: "lead__bg", Role: "bg"},
+						},
 						Line: []responses.Line{
 							{
 								Start: &lineStart,
@@ -594,10 +601,11 @@ var _ = Describe("MediaRetrievalController", func() {
 						},
 						CueLine: []responses.CueLine{
 							{
-								Index: 0,
-								Start: &lineStart,
-								End:   &lineEnd,
-								Value: "Hello echo",
+								Index:   0,
+								Start:   &lineStart,
+								End:     &lineEnd,
+								Value:   "Hello echo",
+								AgentID: "lead",
 								Cue: []responses.LyricCue{
 									{
 										Start: tokenStartA,
@@ -607,11 +615,11 @@ var _ = Describe("MediaRetrievalController", func() {
 								},
 							},
 							{
-								Index: 0,
-								Start: &lineStart,
-								End:   &lineEnd,
-								Value: "Hello echo",
-								Role:  "bg",
+								Index:   0,
+								Start:   &lineStart,
+								End:     &lineEnd,
+								Value:   "Hello echo",
+								AgentID: "lead__bg",
 								Cue: []responses.LyricCue{
 									{
 										Start: tokenStartB,
diff --git a/server/subsonic/responses/responses.go b/server/subsonic/responses/responses.go
index f5446a961..344dd9999 100644
--- a/server/subsonic/responses/responses.go
+++ b/server/subsonic/responses/responses.go
@@ -543,13 +543,19 @@ type LyricCue struct {
 	Value string `xml:",chardata"            json:"value"`
 }
 
+type Agent struct {
+	ID   string `xml:"id,attr"                 json:"id"`             // agent identifier; CueLine.AgentID refers to it
+	Role string `xml:"role,attr"               json:"role"`           // copied from model.Agent.Role by buildStructuredLyric
+	Name string `xml:"name,attr,omitempty"     json:"name,omitempty"` // optional display name, omitted when empty
+}
+
 type CueLine struct {
-	Index int32      `xml:"index,attr"                    json:"index"`
-	Start *int64     `xml:"start,attr,omitempty"         json:"start,omitempty"`
-	End   *int64     `xml:"end,attr,omitempty"           json:"end,omitempty"`
-	Value string     `xml:"value,attr,omitempty"         json:"value,omitempty"`
-	Role  string     `xml:"role,attr,omitempty"          json:"role,omitempty"`
-	Cue   []LyricCue `xml:"cue,omitempty"        json:"cue,omitempty"`
+	Index   int32      `xml:"index,attr"                    json:"index"`
+	Start   *int64     `xml:"start,attr,omitempty"          json:"start,omitempty"`
+	End     *int64     `xml:"end,attr,omitempty"            json:"end,omitempty"`
+	Value   string     `xml:"value,attr,omitempty"          json:"value,omitempty"`
+	AgentID string     `xml:"agentId,attr,omitempty"        json:"agentId,omitempty"`
+	Cue     []LyricCue `xml:"cue,omitempty"                 json:"cue,omitempty"`
 }
 
 type StructuredLyric struct {
@@ -558,6 +564,7 @@ type StructuredLyric struct {
 	Kind          string    `xml:"kind,attr,omitempty"          json:"kind,omitempty"`
 	Lang          string    `xml:"lang,attr"                    json:"lang"`
 	Line          []Line    `xml:"line"                         json:"line"`
+	Agents        []Agent   `xml:"agent,omitempty"              json:"agents,omitempty"`
 	CueLine       []CueLine `xml:"cueLine,omitempty"     json:"cueLine,omitempty"`
 	Offset        *int64    `xml:"offset,attr,omitempty"        json:"offset,omitempty"`
 	Synced        bool      `xml:"synced,attr"                  json:"synced"`
diff --git a/tests/fixtures/test.elrc b/tests/fixtures/test.elrc
new file mode 100644
index 000000000..01c3d2cdd
--- /dev/null
+++ b/tests/fixtures/test.elrc
@@ -0,0 +1,5 @@
+[ar:ELRC Artist]
+[ti:ELRC Song]
+[lang:eng]
+[00:01.00]<00:01.00>Lead <00:01.50>words
+[00:03.00]Fallback line
diff --git a/tests/fixtures/test.srt b/tests/fixtures/test.srt
new file mode 100644
index 000000000..3c9c09a39
--- /dev/null
+++ b/tests/fixtures/test.srt
@@ -0,0 +1,7 @@
+1
+00:00:18,800 --> 00:00:22,800
+We're from subtitles
+
+2
+00:00:22,801 --> 00:00:26,000
+Another subtitle line
diff --git a/ui/src/audioplayer/PlayerToolbar.jsx b/ui/src/audioplayer/PlayerToolbar.jsx
index 869df475d..8487b0655 100644
--- a/ui/src/audioplayer/PlayerToolbar.jsx
+++ b/ui/src/audioplayer/PlayerToolbar.jsx
@@ -108,7 +108,7 @@ const PlayerToolbar = ({
   )
 
   const toggleLyricsButton = (
-    <Tooltip title="Toggle synchronized lyrics">
+    <Tooltip title="Toggle lyrics">
       <span>
         <IconButton
           size={isDesktop ? 'small' : undefined}
diff --git a/ui/src/audioplayer/lyrics.js b/ui/src/audioplayer/lyrics.js
index 111ded02e..cd5248096 100644
--- a/ui/src/audioplayer/lyrics.js
+++ b/ui/src/audioplayer/lyrics.js
@@ -62,6 +62,11 @@ const hasTimedLines = (lyric) =>
   Array.isArray(lyric.line) &&
   lyric.line.some((line) => Number.isFinite(Number(line.start)))
 
+// Narrows a list of lyric tracks to those with line timing (per hasTimedLines);
+// when none is timed the list is returned unchanged.
+const preferTimedLyrics = (lyrics) =>
+  lyrics.some(hasTimedLines) ? lyrics.filter(hasTimedLines) : lyrics
+
 const normalizeToken = (token) => {
   if (!token) {
     return null
@@ -77,10 +82,38 @@ const normalizeToken = (token) => {
   }
 }
 
-const normalizeCueLine = (cueLine, fallbackIndex) => {
+// Indexes structuredLyric.agents by id in a Map. Entries whose id is not a
+// non-empty string are skipped, the first entry wins for a repeated id, and
+// non-string role/name values are stored as ''.
+const buildAgentLookup = (structuredLyric) => {
+  const text = (value) => (typeof value === 'string' ? value : '')
+  const declared = Array.isArray(structuredLyric?.agents)
+    ? structuredLyric.agents
+    : []
+  return declared.reduce((lookup, agent) => {
+    const id = text(agent?.id)
+    if (id && !lookup.has(id)) {
+      lookup.set(id, { id, role: text(agent?.role), name: text(agent?.name) })
+    }
+    return lookup
+  }, new Map())
+}
+
+// Maps an agent to the role label used by the UI: '' when the agent has no
+// role or its role is 'main', otherwise the agent's role unchanged.
+const deriveUiRole = (agent) => {
+  const role = agent?.role
+  return !role || role === 'main' ? '' : role
+}
+
+const normalizeCueLine = (cueLine, fallbackIndex, agentLookup) => {
   const index = Number.isFinite(Number(cueLine?.index))
     ? Number(cueLine.index)
     : fallbackIndex
+  const agentId = typeof cueLine?.agentId === 'string' ? cueLine.agentId : ''
+  const agent = agentId ? agentLookup.get(agentId) || null : null
+  const fallbackRole =
+    typeof cueLine?.role === 'string' ? cueLine.role : ''
   const tokens = sortTokensByStart(
     Array.isArray(cueLine?.cue)
       ? cueLine.cue.map(normalizeToken).filter(Boolean)
@@ -92,7 +125,10 @@ const normalizeCueLine = (cueLine, fallbackIndex) => {
     start: toTime(cueLine?.start),
     end: toTime(cueLine?.end),
     value: typeof cueLine?.value === 'string' ? cueLine.value : '',
-    role: typeof cueLine?.role === 'string' ? cueLine.role : '',
+    role: agent ? deriveUiRole(agent) : fallbackRole,
+    agentId,
+    agentRole: agent?.role || fallbackRole,
+    agentName: agent?.name || '',
     tokens,
   }
 }
@@ -194,6 +230,9 @@ const buildSyntheticWordTokens = (line, token) => {
     end: baseStart + (duration * (idx + 1)) / chunks.length,
     value: chunk,
     role: typeof token?.role === 'string' ? token.role : '',
+    agentId: typeof token?.agentId === 'string' ? token.agentId : '',
+    agentName: typeof token?.agentName === 'string' ? token.agentName : '',
+    agentRole: typeof token?.agentRole === 'string' ? token.agentRole : '',
   }))
 }
 
@@ -240,8 +279,8 @@ export const selectLyricLayers = (structuredLyrics, preferredLanguage) => {
     }
   }
 
-  const synced = structuredLyrics.filter(hasTimedLines)
-  if (synced.length === 0) {
+  const available = structuredLyrics.filter(hasStructuredLyricContent)
+  if (available.length === 0) {
     return {
       main: null,
       translation: null,
@@ -255,22 +294,25 @@ export const selectLyricLayers = (structuredLyrics, preferredLanguage) => {
     [LYRIC_KIND_PRONUNCIATION]: [],
   }
 
-  for (const lyric of synced) {
+  for (const lyric of available) {
     grouped[normalizeLyricKind(lyric?.kind)].push(lyric)
   }
 
   const mainCandidates = grouped[LYRIC_KIND_MAIN].length
     ? grouped[LYRIC_KIND_MAIN]
-    : synced
+    : available
 
   return {
-    main: pickLyricByLanguage(mainCandidates, preferredLanguage),
+    main: pickLyricByLanguage(
+      preferTimedLyrics(mainCandidates),
+      preferredLanguage,
+    ),
     translation: pickLyricByLanguage(
-      grouped[LYRIC_KIND_TRANSLATION],
+      preferTimedLyrics(grouped[LYRIC_KIND_TRANSLATION]),
       preferredLanguage,
     ),
     pronunciation: pickLyricByLanguage(
-      grouped[LYRIC_KIND_PRONUNCIATION],
+      preferTimedLyrics(grouped[LYRIC_KIND_PRONUNCIATION]),
       preferredLanguage,
     ),
   }
@@ -316,6 +358,7 @@ export const buildKaraokeLines = (structuredLyric) => {
     return []
   }
 
+  const agentLookup = buildAgentLookup(structuredLyric)
   const baseLines = Array.isArray(structuredLyric.line)
     ? structuredLyric.line
     : []
@@ -328,12 +371,19 @@ export const buildKaraokeLines = (structuredLyric) => {
       ? (() => {
           const normalizedCueLines = rawCueLines.map(
             (cueLine, fallbackIndex) => {
-              const normalized = normalizeCueLine(cueLine, fallbackIndex)
+              const normalized = normalizeCueLine(
+                cueLine,
+                fallbackIndex,
+                agentLookup,
+              )
               return {
                 ...normalized,
                 tokens: normalized.tokens.map((token) => ({
                   ...token,
                   role: normalized.role,
+                  agentId: normalized.agentId,
+                  agentName: normalized.agentName,
+                  agentRole: normalized.agentRole,
                 })),
               }
             },
@@ -366,6 +416,9 @@ export const buildKaraokeLines = (structuredLyric) => {
               start: first.start ?? toTime(baseLine.start) ?? fallbackStart,
               end: first.end ?? toTime(baseLine.end) ?? fallbackEnd,
               value,
+              agentId: first.agentId,
+              agentName: first.agentName,
+              agentRole: first.agentRole,
               tokens,
             }
           })
diff --git a/ui/src/audioplayer/lyrics.test.js b/ui/src/audioplayer/lyrics.test.js
index 6cb3a1b87..3a5f83b2d 100644
--- a/ui/src/audioplayer/lyrics.test.js
+++ b/ui/src/audioplayer/lyrics.test.js
@@ -124,6 +124,49 @@ describe('lyrics helpers', () => {
     expect(layers.pronunciation).toBeNull()
   })
 
+  it('falls back to unsynced lyric content when no timed track exists', () => {
+    const layers = selectLyricLayers(
+      [
+        {
+          lang: 'eng',
+          synced: false,
+          line: [{ value: 'Plain embedded lyric' }],
+        },
+      ],
+      'eng',
+    )
+
+    expect(layers.main).toEqual({
+      lang: 'eng',
+      synced: false,
+      line: [{ value: 'Plain embedded lyric' }],
+    })
+  })
+
+  it('still prefers timed lyrics when both timed and untimed tracks exist', () => {
+    const layers = selectLyricLayers(
+      [
+        {
+          lang: 'eng',
+          synced: false,
+          line: [{ value: 'Plain lyric' }],
+        },
+        {
+          lang: 'eng',
+          synced: true,
+          line: [{ start: 1000, value: 'Timed lyric' }],
+        },
+      ],
+      'eng',
+    )
+
+    expect(layers.main).toEqual({
+      lang: 'eng',
+      synced: true,
+      line: [{ start: 1000, value: 'Timed lyric' }],
+    })
+  })
+
   it('matches layer line by timing for the active main line', () => {
     const mainLines = [
       { index: 0, start: 1000, end: 1800, value: 'Line A', tokens: [] },
@@ -200,43 +243,88 @@ describe('lyrics helpers', () => {
     expect(getPreferredLyricLanguage()).toBe('pt-BR')
   })
 
-  it('builds karaoke lines from cueLine payload', () => {
+  it('builds karaoke lines from agent-based cueLine payload', () => {
+    const lines = buildKaraokeLines({
+      lang: 'eng',
+      synced: true,
+      line: [{ start: 1000, end: 3000, value: 'Hello world' }],
+      agents: [
+        { id: 'lead', role: 'main', name: 'Lead Vocal' },
+        { id: 'backing', role: 'bg' },
+      ],
+      cueLine: [
+        {
+          index: 0,
+          start: 1000,
+          end: 3000,
+          value: 'Hello world',
+          agentId: 'lead',
+          cue: [{ start: 1000, end: 1500, value: 'Hello' }],
+        },
+        {
+          index: 0,
+          start: 1000,
+          end: 3000,
+          value: 'Hello world',
+          agentId: 'backing',
+          cue: [{ start: 2000, end: 2500, value: 'world' }],
+        },
+      ],
+    })
+
+    expect(lines).toEqual([
+      {
+        agentId: 'lead',
+        agentName: 'Lead Vocal',
+        agentRole: 'main',
+        index: 0,
+        start: 1000,
+        end: 3000,
+        value: 'Hello world',
+        tokens: [
+          {
+            start: 1000,
+            end: 1500,
+            value: 'Hello',
+            role: '',
+            agentId: 'lead',
+            agentName: 'Lead Vocal',
+            agentRole: 'main',
+          },
+          {
+            start: 2000,
+            end: 2500,
+            value: 'world',
+            role: 'bg',
+            agentId: 'backing',
+            agentName: '',
+            agentRole: 'bg',
+          },
+        ],
+      },
+    ])
+  })
+
+  it('falls back to legacy cueLine role values when agents are absent', () => {
     const lines = buildKaraokeLines({
       lang: 'eng',
       synced: true,
       line: [{ start: 1000, end: 3000, value: 'Hello world' }],
       cueLine: [
-        {
-          index: 0,
-          start: 1000,
-          end: 3000,
-          value: 'Hello world',
-          role: '',
-          cue: [{ start: 1000, end: 1500, value: 'Hello' }],
-        },
         {
           index: 0,
           start: 1000,
           end: 3000,
           value: 'Hello world',
           role: 'bg',
-          cue: [{ start: 2000, end: 2500, value: 'world' }],
+          cue: [{ start: 1000, end: 1500, value: 'Hello' }],
         },
       ],
     })
 
-    expect(lines).toEqual([
-      {
-        index: 0,
-        start: 1000,
-        end: 3000,
-        value: 'Hello world',
-        tokens: [
-          { start: 1000, end: 1500, value: 'Hello', role: '' },
-          { start: 2000, end: 2500, value: 'world', role: 'bg' },
-        ],
-      },
-    ])
+    expect(lines[0].tokens[0].role).toBe('bg')
+    expect(lines[0].tokens[0].agentId).toBe('')
+    expect(lines[0].tokens[0].agentName).toBe('')
   })
 
   it('sorts token timing by start to keep playback stable', () => {

From 554074b12052a58e122ed68e8cd359fcc9f13631 Mon Sep 17 00:00:00 2001
From: ranokay <github@ranokay.com>
Date: Sat, 21 Mar 2026 00:32:01 +0200
Subject: [PATCH 08/14] fix(lyrics): avoid derived TTML agent id collisions

---
 README.md                               |  2 +-
 core/lyrics/ttml.go                     |  7 +++--
 core/lyrics/ttml_test.go                | 42 +++++++++++++++++++++++--
 server/subsonic/media_retrieval_test.go |  8 ++---
 4 files changed, 49 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 6b9aff799..645f1580d 100644
--- a/README.md
+++ b/README.md
@@ -52,7 +52,7 @@ A share of the revenue helps fund the development of Navidrome at no additional
  - **Multi-platform**, runs on macOS, Linux and Windows. **Docker** images are also provided
  - Ready to use binaries for all major platforms, including **Raspberry Pi**
  - Automatically **monitors your library** for changes, importing new files and reloading new metadata 
- - Supports synchronized lyrics from sidecar **.lrc** and **.ttml** files (via `lyricspriority`)
+ - Supports lyrics from sidecar **.ttml**, **.elrc**, **.lrc**, **.srt**, **.txt** files and embedded tags (via `lyricspriority`)
  - **Themeable**, modern and responsive **Web interface** based on [Material UI](https://material-ui.com)
  - **Compatible** with all Subsonic/Madsonic/Airsonic [clients](https://www.navidrome.org/docs/overview/#apps)
  - **Transcoding** on the fly. Can be set per user/player. **Opus encoding is supported**
diff --git a/core/lyrics/ttml.go b/core/lyrics/ttml.go
index e79dfe846..adbc0c054 100644
--- a/core/lyrics/ttml.go
+++ b/core/lyrics/ttml.go
@@ -24,6 +24,7 @@ const (
 	ttmlLyricKindMain          = "main"
 	ttmlLyricKindTranslation   = "translation"
 	ttmlLyricKindPronunciation = "pronunciation"
+	ttmlBackgroundAgentPrefix  = "__nd_bg__|"
 )
 
 var offsetTimeRegex = regexp.MustCompile(`^([0-9]+(?:\.[0-9]+)?)(h|m|s|ms|f|t)$`)
@@ -623,7 +624,7 @@ func (p *ttmlParser) baseRoleForAgent(agentID string) string {
 
 func (p *ttmlParser) agentNameForID(agentID string) string {
 	if isBackgroundAgentID(agentID) {
-		baseID := strings.TrimSuffix(agentID, "__bg")
+		baseID := strings.TrimPrefix(agentID, ttmlBackgroundAgentPrefix)
 		if baseID == "main" {
 			return ""
 		}
@@ -641,11 +642,11 @@ func (p *ttmlParser) agentNameForID(agentID string) string {
 }
 
 func backgroundAgentID(agentID string) string {
-	return agentID + "__bg"
+	return ttmlBackgroundAgentPrefix + agentID
 }
 
 func isBackgroundAgentID(agentID string) bool {
-	return strings.HasSuffix(agentID, "__bg")
+	return strings.HasPrefix(agentID, ttmlBackgroundAgentPrefix)
 }
 
 func contextHasRole(roles string, role string) bool {
diff --git a/core/lyrics/ttml_test.go b/core/lyrics/ttml_test.go
index 5fc484a3b..4e81197d4 100644
--- a/core/lyrics/ttml_test.go
+++ b/core/lyrics/ttml_test.go
@@ -131,7 +131,7 @@ var _ = Describe("parseTTML", func() {
 			Expect(list).To(HaveLen(1))
 			Expect(list[0].Agents).To(Equal([]model.Agent{
 				{ID: "main", Role: "main"},
-				{ID: "main__bg", Role: "bg"},
+				{ID: "__nd_bg__|main", Role: "bg"},
 			}))
 			Expect(list[0].Line).To(HaveLen(1))
 
@@ -143,7 +143,7 @@ var _ = Describe("parseTTML", func() {
 
 			Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(1000)), End: gg.P(int64(1400)), Value: "He", AgentID: "main"}))
 			Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(1400)), End: gg.P(int64(1800)), Value: "llo", AgentID: "main"}))
-			Expect(line.Cue[2]).To(Equal(model.Cue{Start: gg.P(int64(2000)), End: gg.P(int64(2500)), Value: "echo", AgentID: "main__bg"}))
+			Expect(line.Cue[2]).To(Equal(model.Cue{Start: gg.P(int64(2000)), End: gg.P(int64(2500)), Value: "echo", AgentID: "__nd_bg__|main"}))
 		})
 
 		It("should parse named TTML agents into main, voice, and group roles", func() {
@@ -177,6 +177,44 @@ var _ = Describe("parseTTML", func() {
 			Expect(list[0].Line[1].Cue[0].AgentID).To(Equal("v2"))
 			Expect(list[0].Line[2].Cue[0].AgentID).To(Equal("v1000"))
 		})
+
+		It("should avoid collisions between derived background agents and explicit TTML agent ids", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttm="http://www.w3.org/ns/ttml#metadata">
+  <head>
+    <metadata>
+      <ttm:agent xml:id="lead" type="person"><ttm:name>Lead</ttm:name></ttm:agent>
+      <ttm:agent xml:id="lead__bg" type="person"><ttm:name>Existing Background Id</ttm:name></ttm:agent>
+    </metadata>
+  </head>
+  <body xml:lang="eng">
+    <div>
+      <p begin="1s" end="2s" ttm:agent="lead">
+        <span begin="1s" end="1.4s">Lead</span>
+        <span ttm:role="x-bg"><span begin="1.5s" end="1.8s">Echo</span></span>
+      </p>
+      <p begin="2s" end="3s" ttm:agent="lead__bg">
+        <span begin="2s" end="2.5s">Named</span>
+      </p>
+    </div>
+  </body>
+</tt>`)
+
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(1))
+			Expect(list[0].Agents).To(Equal([]model.Agent{
+				{ID: "lead", Role: "main", Name: "Lead"},
+				{ID: "__nd_bg__|lead", Role: "bg", Name: "Lead"},
+				{ID: "lead__bg", Role: "voice", Name: "Existing Background Id"},
+			}))
+			Expect(list[0].Line).To(HaveLen(2))
+			Expect(list[0].Line[0].Cue).To(HaveLen(2))
+			Expect(list[0].Line[0].Cue[0].AgentID).To(Equal("lead"))
+			Expect(list[0].Line[0].Cue[1].AgentID).To(Equal("__nd_bg__|lead"))
+			Expect(list[0].Line[1].Cue).To(HaveLen(1))
+			Expect(list[0].Line[1].Cue[0].AgentID).To(Equal("lead__bg"))
+		})
 	})
 
 	Describe("Ambiguous decimal timing", func() {
diff --git a/server/subsonic/media_retrieval_test.go b/server/subsonic/media_retrieval_test.go
index 5489492ce..e4f6a21d4 100644
--- a/server/subsonic/media_retrieval_test.go
+++ b/server/subsonic/media_retrieval_test.go
@@ -543,7 +543,7 @@ var _ = Describe("MediaRetrievalController", func() {
 			lyricsJson, err := json.Marshal(model.LyricList{
 				{
 					Lang:   "eng",
-					Agents: []model.Agent{{ID: "lead", Role: "main"}, {ID: "lead__bg", Role: "bg"}},
+					Agents: []model.Agent{{ID: "lead", Role: "main"}, {ID: "__nd_bg__|lead", Role: "bg"}},
 					Synced: true,
 					Line: []model.Line{
 						{
@@ -561,7 +561,7 @@ var _ = Describe("MediaRetrievalController", func() {
 									Start:   &tokenStartB,
 									End:     &tokenEndB,
 									Value:   "echo",
-									AgentID: "lead__bg",
+									AgentID: "__nd_bg__|lead",
 								},
 							},
 						},
@@ -591,7 +591,7 @@ var _ = Describe("MediaRetrievalController", func() {
 						Synced:        true,
 						Agents: []responses.Agent{
 							{ID: "lead", Role: "main"},
-							{ID: "lead__bg", Role: "bg"},
+							{ID: "__nd_bg__|lead", Role: "bg"},
 						},
 						Line: []responses.Line{
 							{
@@ -619,7 +619,7 @@ var _ = Describe("MediaRetrievalController", func() {
 								Start:   &lineStart,
 								End:     &lineEnd,
 								Value:   "Hello echo",
-								AgentID: "lead__bg",
+								AgentID: "__nd_bg__|lead",
 								Cue: []responses.LyricCue{
 									{
 										Start: tokenStartB,

From 2ffb63477cabc2b82295b2278487ecd6ed048fe9 Mon Sep 17 00:00:00 2001
From: ranokay <github@ranokay.com>
Date: Fri, 27 Mar 2026 07:55:08 +0200
Subject: [PATCH 09/14] chore(lyrics): polish rebased TTML branch

---
 core/lyrics/ttml.go                | 8 ++++----
 server/subsonic/media_retrieval.go | 2 ++
 ui/src/audioplayer/lyrics.js       | 7 ++++---
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/core/lyrics/ttml.go b/core/lyrics/ttml.go
index adbc0c054..a02fa52d8 100644
--- a/core/lyrics/ttml.go
+++ b/core/lyrics/ttml.go
@@ -814,9 +814,9 @@ func (p *ttmlParser) updateTimingParams(attrs []xml.Attr) {
 		}
 	}
 
-	p.params.frameRate = max(frameRate, defaultTTMLFrameRate)
-	p.params.subFrameRate = max(subFrameRate, defaultTTMLSubFrameRate)
-	p.params.tickRate = max(tickRate, defaultTTMLTickRate)
+	p.params.frameRate = positiveOrDefault(frameRate, defaultTTMLFrameRate)
+	p.params.subFrameRate = positiveOrDefault(subFrameRate, defaultTTMLSubFrameRate)
+	p.params.tickRate = positiveOrDefault(tickRate, defaultTTMLTickRate)
 }
 
 func parseTTMLDurationExpression(expr string, params ttmlTimingParams) (int64, bool) {
@@ -1102,7 +1102,7 @@ func hydrateLineTimingFromTokens(line model.Line) model.Line {
 	return model.NormalizeLineTiming(line)
 }
 
-func max(v float64, fallback float64) float64 {
+func positiveOrDefault(v float64, fallback float64) float64 {
 	if v <= 0 {
 		return fallback
 	}
diff --git a/server/subsonic/media_retrieval.go b/server/subsonic/media_retrieval.go
index de88849a2..16d0d2666 100644
--- a/server/subsonic/media_retrieval.go
+++ b/server/subsonic/media_retrieval.go
@@ -99,6 +99,8 @@ func (api *Router) GetLyrics(r *http.Request) (*responses.Subsonic, error) {
 	lyricsResponse := responses.Lyrics{}
 	response.Lyrics = &lyricsResponse
 	opts := filter.SongsByArtistTitleWithLyricsFirst(artist, title)
+	// Max = 0 removes the result cap, keeping the search exhaustive so an older duplicate
+	// can still supply the matching sidecar lyrics when the newest candidate only has embedded data.
 	opts.Max = 0
 	mediaFiles, err := api.ds.MediaFile(r.Context()).GetAll(opts)
 
diff --git a/ui/src/audioplayer/lyrics.js b/ui/src/audioplayer/lyrics.js
index cd5248096..87b218d05 100644
--- a/ui/src/audioplayer/lyrics.js
+++ b/ui/src/audioplayer/lyrics.js
@@ -84,7 +84,9 @@ const normalizeToken = (token) => {
 
 const buildAgentLookup = (structuredLyric) => {
   const lookup = new Map()
-  const agents = Array.isArray(structuredLyric?.agents) ? structuredLyric.agents : []
+  const agents = Array.isArray(structuredLyric?.agents)
+    ? structuredLyric.agents
+    : []
   for (const agent of agents) {
     const id = typeof agent?.id === 'string' ? agent.id : ''
     if (!id || lookup.has(id)) {
@@ -112,8 +114,7 @@ const normalizeCueLine = (cueLine, fallbackIndex, agentLookup) => {
     : fallbackIndex
   const agentId = typeof cueLine?.agentId === 'string' ? cueLine.agentId : ''
   const agent = agentId ? agentLookup.get(agentId) || null : null
-  const fallbackRole =
-    typeof cueLine?.role === 'string' ? cueLine.role : ''
+  const fallbackRole = typeof cueLine?.role === 'string' ? cueLine.role : ''
   const tokens = sortTokensByStart(
     Array.isArray(cueLine?.cue)
       ? cueLine.cue.map(normalizeToken).filter(Boolean)

From 73d94962e01543b6a2340efdecea6e6cfe81320b Mon Sep 17 00:00:00 2001
From: ranokay <github@ranokay.com>
Date: Fri, 27 Mar 2026 12:41:27 +0200
Subject: [PATCH 10/14] feat(lyrics): refine karaoke overlay timing and state

---
 core/lyrics/lyrics_test.go                    |   4 +-
 core/lyrics/sources_test.go                   |  15 +-
 model/lyrics.go                               |  12 -
 model/lyrics_test.go                          |  16 +-
 ui/src/audioplayer/KaraokeLyricsOverlay.jsx   | 535 ++++++++++++++----
 .../audioplayer/KaraokeLyricsOverlay.test.jsx | 344 +++++++++++
 ui/src/audioplayer/Player.jsx                 |  51 +-
 .../audioplayer/Player.lyricsState.test.jsx   |  77 +++
 ui/src/audioplayer/lyrics.js                  |  90 +--
 ui/src/audioplayer/lyrics.test.js             | 134 ++++-
 ui/src/audioplayer/lyricsOverlayState.js      |  27 +
 11 files changed, 1070 insertions(+), 235 deletions(-)
 create mode 100644 ui/src/audioplayer/KaraokeLyricsOverlay.test.jsx
 create mode 100644 ui/src/audioplayer/Player.lyricsState.test.jsx
 create mode 100644 ui/src/audioplayer/lyricsOverlayState.js

diff --git a/core/lyrics/lyrics_test.go b/core/lyrics/lyrics_test.go
index 58e8ba82b..917c530ac 100644
--- a/core/lyrics/lyrics_test.go
+++ b/core/lyrics/lyrics_test.go
@@ -52,15 +52,17 @@ var _ = Describe("sources", func() {
 			Line: []model.Line{
 				{
 					Start: gg.P(int64(1000)),
-					End:   gg.P(int64(1500)),
+					End:   gg.P(int64(3000)),
 					Value: "Lead words",
 					Cue: []model.Cue{
 						{
 							Start: gg.P(int64(1000)),
+							End:   gg.P(int64(1500)),
 							Value: "Lead ",
 						},
 						{
 							Start: gg.P(int64(1500)),
+							End:   gg.P(int64(3000)),
 							Value: "words",
 						},
 					},
diff --git a/core/lyrics/sources_test.go b/core/lyrics/sources_test.go
index a110390d8..a86c84cd0 100644
--- a/core/lyrics/sources_test.go
+++ b/core/lyrics/sources_test.go
@@ -102,22 +102,26 @@ var _ = Describe("sources", func() {
 
 			// Line 1: has inline markers → Cue array populated
 			Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(1000))))
+			Expect(lyrics[0].Line[0].End).To(Equal(gg.P(int64(3000))))
 			Expect(lyrics[0].Line[0].Value).To(Equal("Some lyrics here"))
 			Expect(lyrics[0].Line[0].Cue).To(HaveLen(3))
 			Expect(*lyrics[0].Line[0].Cue[0].Start).To(Equal(int64(1000)))
 			Expect(lyrics[0].Line[0].Cue[0].Value).To(Equal("Some "))
-			Expect(lyrics[0].Line[0].Cue[0].End).To(BeNil())
+			Expect(lyrics[0].Line[0].Cue[0].End).To(Equal(gg.P(int64(1500))))
 			Expect(*lyrics[0].Line[0].Cue[1].Start).To(Equal(int64(1500)))
 			Expect(lyrics[0].Line[0].Cue[1].Value).To(Equal("lyrics "))
-			Expect(lyrics[0].Line[0].Cue[1].End).To(BeNil())
+			Expect(lyrics[0].Line[0].Cue[1].End).To(Equal(gg.P(int64(2000))))
 			Expect(*lyrics[0].Line[0].Cue[2].Start).To(Equal(int64(2000)))
 			Expect(lyrics[0].Line[0].Cue[2].Value).To(Equal("here"))
-			Expect(lyrics[0].Line[0].Cue[2].End).To(BeNil())
+			Expect(lyrics[0].Line[0].Cue[2].End).To(Equal(gg.P(int64(3000))))
 
 			// Line 2: has inline markers
 			Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(3000))))
+			Expect(lyrics[0].Line[1].End).To(Equal(gg.P(int64(5000))))
 			Expect(lyrics[0].Line[1].Value).To(Equal("More words"))
 			Expect(lyrics[0].Line[1].Cue).To(HaveLen(2))
+			Expect(lyrics[0].Line[1].Cue[0].End).To(Equal(gg.P(int64(3500))))
+			Expect(lyrics[0].Line[1].Cue[1].End).To(Equal(gg.P(int64(5000))))
 
 			// Line 3: plain line, no cues
 			Expect(lyrics[0].Line[2].Start).To(Equal(gg.P(int64(5000))))
@@ -138,14 +142,15 @@ var _ = Describe("sources", func() {
 			Expect(lyrics[0].Line).To(HaveLen(2))
 
 			Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(1000))))
+			Expect(lyrics[0].Line[0].End).To(Equal(gg.P(int64(3000))))
 			Expect(lyrics[0].Line[0].Value).To(Equal("Lead words"))
 			Expect(lyrics[0].Line[0].Cue).To(HaveLen(2))
 			Expect(*lyrics[0].Line[0].Cue[0].Start).To(Equal(int64(1000)))
 			Expect(lyrics[0].Line[0].Cue[0].Value).To(Equal("Lead "))
-			Expect(lyrics[0].Line[0].Cue[0].End).To(BeNil())
+			Expect(lyrics[0].Line[0].Cue[0].End).To(Equal(gg.P(int64(1500))))
 			Expect(*lyrics[0].Line[0].Cue[1].Start).To(Equal(int64(1500)))
 			Expect(lyrics[0].Line[0].Cue[1].Value).To(Equal("words"))
-			Expect(lyrics[0].Line[0].Cue[1].End).To(BeNil())
+			Expect(lyrics[0].Line[0].Cue[1].End).To(Equal(gg.P(int64(3000))))
 
 			Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(3000))))
 			Expect(lyrics[0].Line[1].Value).To(Equal("Fallback line"))
diff --git a/model/lyrics.go b/model/lyrics.go
index 725c3aa94..ec0df9f34 100644
--- a/model/lyrics.go
+++ b/model/lyrics.go
@@ -417,18 +417,6 @@ func normalizeCueLine(line Line, fallbackEnd *int64) Line {
 		return line
 	}
 
-	hasAnyEnd := false
-	for i := range line.Cue {
-		if line.Cue[i].End != nil {
-			hasAnyEnd = true
-			break
-		}
-	}
-	if !hasAnyEnd {
-		line.Cue = clearCueEnds(line.Cue)
-		return NormalizeLineTiming(line)
-	}
-
 	for i := range line.Cue {
 		if line.Cue[i].End != nil {
 			continue
diff --git a/model/lyrics_test.go b/model/lyrics_test.go
index 9aad7d968..6f189f024 100644
--- a/model/lyrics_test.go
+++ b/model/lyrics_test.go
@@ -127,20 +127,24 @@ var _ = Describe("ToLyrics", func() {
 
 		line0 := lyrics.Line[0]
 		Expect(line0.Start).To(Equal(&t1000))
+		Expect(line0.End).To(Equal(&t3000))
 		Expect(line0.Value).To(Equal("Some lyrics here"))
 		Expect(line0.Cue).To(Equal([]Cue{
-			{Start: &t1000, Value: "Some "},
-			{Start: &t1500, Value: "lyrics "},
-			{Start: &t2000, Value: "here"},
+			{Start: &t1000, End: &t1500, Value: "Some "},
+			{Start: &t1500, End: &t2000, Value: "lyrics "},
+			{Start: &t2000, End: &t3000, Value: "here"},
 		}))
 
 		line1 := lyrics.Line[1]
 		Expect(line1.Start).To(Equal(&t3000))
+		Expect(line1.End).To(Equal(&t3500))
 		Expect(line1.Value).To(Equal("More words"))
 		Expect(line1.Cue).To(Equal([]Cue{
 			{Start: &t3000, Value: "More "},
 			{Start: &t3500, Value: "words"},
 		}))
+
+		Expect(line1.Cue[1].End).To(BeNil())
 	})
 
 	It("should ignore Enhanced LRC markers and return plain lines when no markers present", func() {
@@ -159,12 +163,14 @@ var _ = Describe("ToLyrics", func() {
 		Expect(lyrics.Line).To(HaveLen(3))
 
 		t1000, t1500, t5000, t5500 := int64(1000), int64(1500), int64(5000), int64(5500)
+		t3000 := int64(3000)
 
 		Expect(lyrics.Line[0].Cue).To(Equal([]Cue{
-			{Start: &t1000, Value: "Some "},
-			{Start: &t1500, Value: "lyrics"},
+			{Start: &t1000, End: &t1500, Value: "Some "},
+			{Start: &t1500, End: &t3000, Value: "lyrics"},
 		}))
 		Expect(lyrics.Line[0].Value).To(Equal("Some lyrics"))
+		Expect(lyrics.Line[0].End).To(Equal(&t3000))
 
 		Expect(lyrics.Line[1].Cue).To(BeNil())
 		Expect(lyrics.Line[1].Value).To(Equal("Plain line"))
diff --git a/ui/src/audioplayer/KaraokeLyricsOverlay.jsx b/ui/src/audioplayer/KaraokeLyricsOverlay.jsx
index a44e50bf6..cd1484e41 100644
--- a/ui/src/audioplayer/KaraokeLyricsOverlay.jsx
+++ b/ui/src/audioplayer/KaraokeLyricsOverlay.jsx
@@ -3,8 +3,10 @@ import IconButton from '@material-ui/core/IconButton'
 import Popover from '@material-ui/core/Popover'
 import Slider from '@material-ui/core/Slider'
 import { makeStyles } from '@material-ui/core/styles'
+import Tooltip from '@material-ui/core/Tooltip'
 import Typography from '@material-ui/core/Typography'
 import CloseIcon from '@material-ui/icons/Close'
+import RestoreIcon from '@material-ui/icons/Restore'
 import TuneIcon from '@material-ui/icons/Tune'
 import clsx from 'clsx'
 import React, {
@@ -16,8 +18,11 @@ import React, {
   useState,
 } from 'react'
 import {
+  buildHighlightedAuxLine,
+  buildHighlightedMainLine,
   buildKaraokeLines,
   getActiveKaraokeState,
+  hasUsableKaraokeTiming,
   hasStructuredLyricContent,
   resolveKaraokeTokenWindow,
   resolveLayerLineForMain,
@@ -36,6 +41,12 @@ const KARAOKE_MAX_HEIGHT_RATIO = 0.72
 const KARAOKE_MAX_HEIGHT_PX = 760
 const KARAOKE_CENTER_SPACER_RATIO = 0.5
 const KARAOKE_CENTER_SPACER_MIN_PX = 132
+const KARAOKE_DEFAULT_LINE_HEIGHT = 1.3
+const KARAOKE_MIN_LINE_HEIGHT = 1
+const KARAOKE_MAX_LINE_HEIGHT = 2.2
+const KARAOKE_LINE_HEIGHT_STEP = 0.02
+const KARAOKE_GROUP_SPACING_BASE_PX = 14
+const KARAOKE_AUX_LINE_HEIGHT = 1.2
 
 const TOKEN_DONE_ALPHA = 1
 const TOKEN_FUTURE_ALPHA = 0.34
@@ -55,33 +66,65 @@ const COLOR_PRESETS = [
 ]
 
 const DEFAULT_LYRICS_SETTINGS = {
-  tr: { fontSize: 14, colorKey: 'blue' },
-  main: { fontSize: 24, colorKey: 'white' },
-  pr: { fontSize: 14, colorKey: 'green' },
+  lineHeight: KARAOKE_DEFAULT_LINE_HEIGHT,
+  overlayHeight: KARAOKE_DEFAULT_HEIGHT_PX,
+  tr: { fontSize: 18, colorKey: 'blue' },
+  main: { fontSize: 30, colorKey: 'white' },
+  pr: { fontSize: 18, colorKey: 'green' },
 }
 
 const SETTINGS_STORAGE_KEY = 'karaoke-lyrics-settings'
 
+const createDefaultLyricsSettings = () => ({
+  lineHeight: KARAOKE_DEFAULT_LINE_HEIGHT,
+  overlayHeight: KARAOKE_DEFAULT_HEIGHT_PX,
+  tr: { ...DEFAULT_LYRICS_SETTINGS.tr },
+  main: { ...DEFAULT_LYRICS_SETTINGS.main },
+  pr: { ...DEFAULT_LYRICS_SETTINGS.pr },
+})
+
+const clampLineHeight = (value) => {
+  const numeric = Number(value)
+  if (!Number.isFinite(numeric)) {
+    return KARAOKE_DEFAULT_LINE_HEIGHT
+  }
+  return clamp(numeric, KARAOKE_MIN_LINE_HEIGHT, KARAOKE_MAX_LINE_HEIGHT)
+}
+
+const clampOverlayHeightPreference = (value) => {
+  const numeric = Number(value)
+  if (!Number.isFinite(numeric)) {
+    return KARAOKE_DEFAULT_HEIGHT_PX
+  }
+  return clamp(numeric, KARAOKE_MIN_HEIGHT_PX, KARAOKE_MAX_HEIGHT_PX)
+}
+
+const normalizeLyricsSettings = (settings) => ({
+  lineHeight: clampLineHeight(settings?.lineHeight),
+  overlayHeight: clampOverlayHeightPreference(settings?.overlayHeight),
+  tr: { ...DEFAULT_LYRICS_SETTINGS.tr, ...settings?.tr },
+  main: { ...DEFAULT_LYRICS_SETTINGS.main, ...settings?.main },
+  pr: { ...DEFAULT_LYRICS_SETTINGS.pr, ...settings?.pr },
+})
+
 const loadLyricsSettings = () => {
   try {
     const raw = localStorage.getItem(SETTINGS_STORAGE_KEY)
     if (raw) {
-      const parsed = JSON.parse(raw)
-      return {
-        tr: { ...DEFAULT_LYRICS_SETTINGS.tr, ...parsed.tr },
-        main: { ...DEFAULT_LYRICS_SETTINGS.main, ...parsed.main },
-        pr: { ...DEFAULT_LYRICS_SETTINGS.pr, ...parsed.pr },
-      }
+      return normalizeLyricsSettings(JSON.parse(raw))
     }
   } catch {
     /* ignore */
   }
-  return { ...DEFAULT_LYRICS_SETTINGS }
+  return normalizeLyricsSettings()
 }
 
 const saveLyricsSettings = (settings) => {
   try {
-    localStorage.setItem(SETTINGS_STORAGE_KEY, JSON.stringify(settings))
+    localStorage.setItem(
+      SETTINGS_STORAGE_KEY,
+      JSON.stringify(normalizeLyricsSettings(settings)),
+    )
   } catch {
     /* ignore */
   }
@@ -97,7 +140,7 @@ const useStyles = makeStyles((theme) => ({
     bottom: 100,
     transform: 'translateX(-50%)',
     zIndex: 1400,
-    width: 'min(900px, calc(100vw - 32px))',
+    width: 'min(1000px, calc(100vw - 32px))',
     minHeight: KARAOKE_MIN_HEIGHT_PX,
     background: 'rgba(6, 8, 12, 0.9)',
     borderRadius: 12,
@@ -149,13 +192,39 @@ const useStyles = makeStyles((theme) => ({
     gap: theme.spacing(1),
     minWidth: 0,
   },
-  language: {
-    fontSize: 11,
-    letterSpacing: '0.08em',
-    opacity: 0.72,
-    textTransform: 'uppercase',
+  languageBadges: {
+    display: 'flex',
+    alignItems: 'center',
+    gap: theme.spacing(0.5),
+    flexWrap: 'wrap',
+    minWidth: 0,
+  },
+  languageBadge: {
+    display: 'inline-flex',
+    alignItems: 'center',
+    gap: theme.spacing(0.35),
+    padding: theme.spacing(0.2, 0.7),
+    borderRadius: 999,
+    border: '1px solid rgba(148, 163, 184, 0.28)',
+    background: 'rgba(15, 23, 42, 0.42)',
+    color: 'rgba(226, 232, 240, 0.8)',
+    fontSize: 10,
+    letterSpacing: '0.04em',
     whiteSpace: 'nowrap',
   },
+  languageBadgeActive: {
+    borderColor: 'rgba(148, 163, 184, 0.46)',
+    background: 'rgba(30, 41, 59, 0.56)',
+    color: 'rgba(248, 250, 252, 0.94)',
+  },
+  languageBadgeLabel: {
+    fontWeight: 700,
+    textTransform: 'uppercase',
+    opacity: 0.78,
+  },
+  languageBadgeValue: {
+    opacity: 0.9,
+  },
   layerControls: {
     display: 'flex',
     alignItems: 'center',
@@ -186,21 +255,31 @@ const useStyles = makeStyles((theme) => ({
   closeButton: {
     color: 'rgba(255, 255, 255, 0.72)',
   },
+  lineGroup: {
+    display: 'flex',
+    flexDirection: 'column',
+    alignItems: 'center',
+    gap: theme.spacing(0.35),
+  },
   inlineTr: {
-    margin: '0 0 2px 0',
+    margin: 0,
     textAlign: 'center',
     fontWeight: 400,
-    lineHeight: 1.2,
+    lineHeight: KARAOKE_AUX_LINE_HEIGHT,
     letterSpacing: '0.01em',
     transition: `opacity ${KARAOKE_ANIMATION_MS}ms ease-in-out, font-size ${KARAOKE_ANIMATION_MS}ms ease-in-out`,
   },
   inlinePr: {
-    margin: '2px 0 0 0',
+    margin: 0,
     textAlign: 'center',
     fontWeight: 400,
-    lineHeight: 1.2,
+    lineHeight: KARAOKE_AUX_LINE_HEIGHT,
     letterSpacing: '0.01em',
     transition: `opacity ${KARAOKE_ANIMATION_MS}ms ease-in-out, font-size ${KARAOKE_ANIMATION_MS}ms ease-in-out`,
+    padding: theme.spacing(0.15, 0.9),
+    borderRadius: 999,
+    background: 'rgba(255, 255, 255, 0.08)',
+    border: '1px solid rgba(255, 255, 255, 0.12)',
   },
   body: {
     padding: theme.spacing(0.5, 2, 1.4, 2),
@@ -252,15 +331,29 @@ const useStyles = makeStyles((theme) => ({
     border: '1px solid rgba(255, 255, 255, 0.12)',
     borderRadius: 10,
     padding: theme.spacing(1.5, 2),
-    width: 260,
+    width: 278,
     backdropFilter: 'blur(12px)',
   },
+  settingsHeader: {
+    display: 'flex',
+    alignItems: 'center',
+    justifyContent: 'space-between',
+    gap: theme.spacing(1),
+    marginBottom: theme.spacing(1.25),
+  },
   settingsSection: {
     marginBottom: theme.spacing(1.2),
     '&:last-child': {
       marginBottom: 0,
     },
   },
+  settingsTitle: {
+    fontSize: 11,
+    fontWeight: 700,
+    letterSpacing: '0.08em',
+    textTransform: 'uppercase',
+    color: 'rgba(255, 255, 255, 0.78)',
+  },
   settingsLabel: {
     fontSize: 10,
     fontWeight: 600,
@@ -291,6 +384,21 @@ const useStyles = makeStyles((theme) => ({
     minWidth: 22,
     textAlign: 'right',
   },
+  settingsControlLabel: {
+    fontSize: 10,
+    letterSpacing: '0.06em',
+    textTransform: 'uppercase',
+    color: 'rgba(255, 255, 255, 0.45)',
+    minWidth: 72,
+    whiteSpace: 'nowrap',
+  },
+  resetButton: {
+    color: 'rgba(255, 255, 255, 0.58)',
+    padding: 4,
+    '&:hover': {
+      color: 'rgba(255, 255, 255, 0.9)',
+    },
+  },
   colorDots: {
     display: 'flex',
     gap: 5,
@@ -314,6 +422,9 @@ const useStyles = makeStyles((theme) => ({
 
 const clamp = (v, min, max) => Math.max(min, Math.min(max, v))
 const lerp = (from, to, t) => from + (to - from) * t
+const formatLineHeight = (value) => clampLineHeight(value).toFixed(2)
+const getLineGapPx = (lineHeight) =>
+  `${Math.round(clampLineHeight(lineHeight) * KARAOKE_GROUP_SPACING_BASE_PX)}px`
 
 const normalizeForComparison = (text) =>
   (text || '').replace(/[\s\p{P}]/gu, '').toLowerCase()
@@ -326,6 +437,34 @@ const shouldShowAuxLine = (mainLine, auxLine) => {
   )
 }
 
+const buildLanguageBadges = ({
+  mainLyric,
+  translationLyric,
+  pronunciationLyric,
+  showTranslation,
+  showPronunciation,
+}) =>
+  [
+    {
+      key: 'main',
+      label: 'Main',
+      lang: mainLyric?.lang,
+      active: true,
+    },
+    {
+      key: 'pr',
+      label: 'PR',
+      lang: pronunciationLyric?.lang,
+      active: showPronunciation,
+    },
+    {
+      key: 'tr',
+      label: 'TR',
+      lang: translationLyric?.lang,
+      active: showTranslation,
+    },
+  ].filter((badge) => badge.lang)
+
 const SettingsSection = ({ label, layer, settings, onChange, classes }) => {
   const s = settings[layer]
   return (
@@ -363,7 +502,37 @@ const SettingsSection = ({ label, layer, settings, onChange, classes }) => {
   )
 }
 
-const LyricsSettingsPopover = ({ settings, onChange }) => {
+const LineHeightSetting = ({ settings, onChange, classes }) => (
+  <div className={classes.settingsSection}>
+    <div className={classes.settingsLabel}>Spacing</div>
+    <div className={classes.settingsRow}>
+      <div className={classes.settingsControlLabel}>Line height</div>
+      <Slider
+        className={classes.settingsSlider}
+        min={KARAOKE_MIN_LINE_HEIGHT}
+        max={KARAOKE_MAX_LINE_HEIGHT}
+        step={KARAOKE_LINE_HEIGHT_STEP}
+        value={settings.lineHeight}
+        aria-label="Line height"
+        data-testid="lyrics-line-height-slider"
+        onChange={(_, val) =>
+          onChange({
+            ...settings,
+            lineHeight: clampLineHeight(Array.isArray(val) ? val[0] : val),
+          })
+        }
+      />
+      <span
+        className={classes.settingsSliderValue}
+        data-testid="lyrics-line-height-value"
+      >
+        {formatLineHeight(settings.lineHeight)}
+      </span>
+    </div>
+  </div>
+)
+
+const LyricsSettingsPopover = ({ settings, onChange, onReset }) => {
   const classes = useStyles()
   const [anchorEl, setAnchorEl] = useState(null)
 
@@ -376,14 +545,19 @@ const LyricsSettingsPopover = ({ settings, onChange }) => {
 
   return (
     <>
-      <IconButton
-        className={classes.settingsButton}
-        size="small"
-        onClick={handleToggle}
-        aria-label="Lyrics settings"
-      >
-        <TuneIcon style={{ fontSize: 18 }} />
-      </IconButton>
+      <Tooltip title="Appearance">
+        <span>
+          <IconButton
+            className={classes.settingsButton}
+            size="small"
+            onClick={handleToggle}
+            aria-label="Lyrics settings"
+            data-testid="lyrics-settings-button"
+          >
+            <TuneIcon style={{ fontSize: 18 }} />
+          </IconButton>
+        </span>
+      </Tooltip>
       <Popover
         open={Boolean(anchorEl)}
         anchorEl={anchorEl}
@@ -393,6 +567,27 @@ const LyricsSettingsPopover = ({ settings, onChange }) => {
         PaperProps={{ className: classes.settingsPanel }}
         style={{ zIndex: 1500 }}
       >
+        <div className={classes.settingsHeader}>
+          <Typography className={classes.settingsTitle}>Appearance</Typography>
+          <Tooltip title="Reset appearance">
+            <span>
+              <IconButton
+                className={classes.resetButton}
+                size="small"
+                onClick={onReset}
+                aria-label="Reset appearance"
+                data-testid="lyrics-reset-appearance"
+              >
+                <RestoreIcon style={{ fontSize: 18 }} />
+              </IconButton>
+            </span>
+          </Tooltip>
+        </div>
+        <LineHeightSetting
+          settings={settings}
+          onChange={onChange}
+          classes={classes}
+        />
         <SettingsSection
           label="Translation"
           layer="tr"
@@ -401,7 +596,7 @@ const LyricsSettingsPopover = ({ settings, onChange }) => {
           classes={classes}
         />
         <SettingsSection
-          label="Default"
+          label="Main"
           layer="main"
           settings={settings}
           onChange={onChange}
@@ -595,7 +790,8 @@ const areLineStylesEqual = (prevStyle, nextStyle) => {
     a.opacity === b.opacity &&
     a.color === b.color &&
     a.fontSize === b.fontSize &&
-    a.fontWeight === b.fontWeight
+    a.fontWeight === b.fontWeight &&
+    a.lineHeight === b.lineHeight
   )
 }
 
@@ -778,7 +974,6 @@ const KaraokeLyricsOverlay = ({
 }) => {
   const classes = useStyles()
   const [playbackMs, setPlaybackMs] = useState(0)
-  const [overlayHeight, setOverlayHeight] = useState(KARAOKE_DEFAULT_HEIGHT_PX)
   const [maxHeightPx, setMaxHeightPx] = useState(getMaxHeightPx())
   const [bodyViewportHeight, setBodyViewportHeight] = useState(0)
   const [isCompact, setIsCompact] = useState(
@@ -787,8 +982,15 @@ const KaraokeLyricsOverlay = ({
   const [lyricsSettings, setLyricsSettings] = useState(loadLyricsSettings)
 
   const handleSettingsChange = useCallback((next) => {
-    setLyricsSettings(next)
-    saveLyricsSettings(next)
+    const normalized = normalizeLyricsSettings(next)
+    setLyricsSettings(normalized)
+    saveLyricsSettings(normalized) // persist exactly what state now holds
+  }, [])
+
+  const handleResetAppearance = useCallback(() => {
+    const defaults = createDefaultLyricsSettings() // replaces all settings
+    setLyricsSettings(defaults)
+    saveLyricsSettings(defaults) // the reset is persisted, not just shown
   }, [])
 
   const bodyRef = useRef(null)
@@ -803,15 +1005,17 @@ const KaraokeLyricsOverlay = ({
     () => buildKaraokeLines(pronunciationLyric),
     [pronunciationLyric],
   )
+  const overlayHeight = clamp(
+    lyricsSettings.overlayHeight,
+    KARAOKE_MIN_HEIGHT_PX,
+    maxHeightPx,
+  )
 
   useEffect(() => {
     const onResize = () => {
       const nextMaxHeight = getMaxHeightPx()
       setIsCompact(window.innerWidth <= 810)
       setMaxHeightPx(nextMaxHeight)
-      setOverlayHeight((previous) =>
-        clamp(previous, KARAOKE_MIN_HEIGHT_PX, nextMaxHeight),
-      )
     }
 
     onResize()
@@ -853,9 +1057,14 @@ const KaraokeLyricsOverlay = ({
 
       const onMove = (moveEvent) => {
         const delta = startY - moveEvent.clientY
-        setOverlayHeight(
-          clamp(startHeight + delta, KARAOKE_MIN_HEIGHT_PX, maxHeightPx),
-        )
+        handleSettingsChange({
+          ...lyricsSettings,
+          overlayHeight: clamp(
+            startHeight + delta,
+            KARAOKE_MIN_HEIGHT_PX,
+            maxHeightPx,
+          ),
+        })
       }
 
       const onUp = () => {
@@ -866,7 +1075,13 @@ const KaraokeLyricsOverlay = ({
       window.addEventListener('mousemove', onMove)
       window.addEventListener('mouseup', onUp)
     },
-    [isCompact, maxHeightPx, overlayHeight],
+    [
+      handleSettingsChange,
+      isCompact,
+      lyricsSettings,
+      maxHeightPx,
+      overlayHeight,
+    ],
   )
 
   useEffect(() => {
@@ -967,13 +1182,29 @@ const KaraokeLyricsOverlay = ({
   }, [audioInstance, visible])
 
   const renderPlaybackMs = playbackMs + KARAOKE_RENDER_LEAD_MS
-
-  const { lineIndex } = useMemo(
-    () => getActiveKaraokeState(mainLines, renderPlaybackMs),
-    [mainLines, renderPlaybackMs],
+  const hasTimedMainLines = useMemo(
+    () => hasUsableKaraokeTiming(mainLines),
+    [mainLines],
   )
 
-  const activeIndex = lineIndex >= 0 ? lineIndex : 0
+  const { lineIndex } = useMemo(
+    () =>
+      hasTimedMainLines
+        ? getActiveKaraokeState(mainLines, renderPlaybackMs)
+        : { lineIndex: -1, tokenIndex: -1 },
+    [hasTimedMainLines, mainLines, renderPlaybackMs],
+  )
+
+  const activeIndex = hasTimedMainLines && lineIndex >= 0 ? lineIndex : -1
+  const lineHeight = lyricsSettings.lineHeight
+  const lineGap = getLineGapPx(lineHeight)
+  const languageBadges = buildLanguageBadges({
+    mainLyric,
+    translationLyric,
+    pronunciationLyric,
+    showTranslation,
+    showPronunciation,
+  })
 
   const trByMainIndex = useMemo(() => {
     if (!showTranslation || translationLines.length === 0) return {}
@@ -1008,12 +1239,14 @@ const KaraokeLyricsOverlay = ({
           ? 260
           : Math.max(220, overlayHeight - 170)
   const centerSpacerPx = Math.max(
-    KARAOKE_CENTER_SPACER_MIN_PX,
-    Math.floor(estimatedViewportHeight * KARAOKE_CENTER_SPACER_RATIO),
+    hasTimedMainLines ? KARAOKE_CENTER_SPACER_MIN_PX : 0,
+    hasTimedMainLines
+      ? Math.floor(estimatedViewportHeight * KARAOKE_CENTER_SPACER_RATIO)
+      : 0,
   )
 
   useEffect(() => {
-    if (!visible) {
+    if (!visible || !hasTimedMainLines) {
       return
     }
 
@@ -1050,6 +1283,7 @@ const KaraokeLyricsOverlay = ({
     return () => window.cancelAnimationFrame(rafId)
   }, [
     centerSpacerPx,
+    hasTimedMainLines,
     hasPronunciationLine,
     hasTranslationLine,
     lineIndex,
@@ -1066,10 +1300,19 @@ const KaraokeLyricsOverlay = ({
   }
 
   const getMainLineStyle = (idx) => {
+    const [r, g, b] = parseColorRGB(getColorValue(lyricsSettings.main.colorKey))
+    if (!hasTimedMainLines) {
+      return {
+        opacity: 1,
+        color: `rgba(${r}, ${g}, ${b}, 0.98)`,
+        fontSize: lyricsSettings.main.fontSize,
+        lineHeight,
+      }
+    }
+
     const delta = idx - activeIndex
     const isActive = delta === 0
     let opacity = isActive ? 1 : delta < 0 ? 0.6 : 0.72
-    const [r, g, b] = parseColorRGB(getColorValue(lyricsSettings.main.colorKey))
     const color = isActive
       ? `rgba(${r}, ${g}, ${b}, 0.98)`
       : delta < 0
@@ -1093,6 +1336,48 @@ const KaraokeLyricsOverlay = ({
       opacity,
       color,
       fontSize,
+      lineHeight,
+    }
+  }
+
+  const getAuxLineStyle = (idx, layerKey) => {
+    const [r, g, b] = parseColorRGB(
+      getColorValue(lyricsSettings[layerKey].colorKey),
+    )
+    if (!hasTimedMainLines) { // untimed lyrics: every row is styled alike
+      return {
+        opacity: 0.94,
+        fontSize: lyricsSettings[layerKey].fontSize,
+        color: `rgba(${r}, ${g}, ${b}, 0.94)`,
+        lineHeight: KARAOKE_AUX_LINE_HEIGHT, // constant, not the user setting
+      }
+    }
+
+    const delta = idx - activeIndex // <0: before the active line, >0: after
+    const isActive = delta === 0
+
+    let opacity = isActive ? 0.94 : delta < 0 ? 0.5 : 0.62
+    const color = isActive
+      ? `rgba(${r}, ${g}, ${b}, 0.94)`
+      : delta < 0
+        ? `rgba(${r}, ${g}, ${b}, 0.42)`
+        : `rgba(${r}, ${g}, ${b}, 0.56)`
+
+    if (delta > 1) { // upcoming rows fade with distance, floor 0.28
+      const level = clamp(delta, 1, 6)
+      opacity = Math.max(0.28, 0.64 - level * 0.08)
+    }
+
+    if (delta < -1) { // past rows fade with distance, floor 0.22
+      const level = clamp(Math.abs(delta), 1, 6)
+      opacity = Math.max(0.22, 0.5 - level * 0.08)
+    }
+
+    return {
+      opacity,
+      fontSize: lyricsSettings[layerKey].fontSize,
+      color,
+      lineHeight: KARAOKE_AUX_LINE_HEIGHT, // constant, not the user setting
     }
   }
 
@@ -1109,36 +1394,61 @@ const KaraokeLyricsOverlay = ({
       data-testid="karaoke-lyrics-overlay"
       style={overlayStyle}
     >
-      <div className={classes.resizeHandle} onMouseDown={onResizeStart} />
+      <div
+        className={classes.resizeHandle}
+        onMouseDown={onResizeStart}
+        data-testid="lyrics-resize-handle"
+      />
 
       <div className={classes.header}>
         <div className={classes.headerLeft}>
-          <Typography className={classes.language}>
-            {mainLyric?.lang || 'xxx'}
-          </Typography>
+          <div className={classes.languageBadges}>
+            {languageBadges.map((badge) => (
+              <div
+                key={badge.key}
+                className={clsx(classes.languageBadge, {
+                  [classes.languageBadgeActive]: badge.active,
+                })}
+                data-testid={`lyrics-language-badge-${badge.key}`}
+              >
+                <span className={classes.languageBadgeLabel}>
+                  {badge.label}
+                </span>
+                <span className={classes.languageBadgeValue}>{badge.lang}</span>
+              </div>
+            ))}
+          </div>
           <div className={classes.layerControls}>
-            <Button
-              size="small"
-              onClick={onToggleTranslation}
-              disabled={!translationEnabled}
-              className={clsx(classes.layerToggle, {
-                [classes.layerToggleActive]: showTranslation,
-              })}
-              data-testid="lyrics-toggle-translation"
-            >
-              TR
-            </Button>
-            <Button
-              size="small"
-              onClick={onTogglePronunciation}
-              disabled={!pronunciationEnabled}
-              className={clsx(classes.layerToggle, {
-                [classes.layerToggleActive]: showPronunciation,
-              })}
-              data-testid="lyrics-toggle-pronunciation"
-            >
-              PR
-            </Button>
+            <Tooltip title="Toggle translations">
+              <span>
+                <Button
+                  size="small"
+                  onClick={onToggleTranslation}
+                  disabled={!translationEnabled}
+                  className={clsx(classes.layerToggle, {
+                    [classes.layerToggleActive]: showTranslation,
+                  })}
+                  data-testid="lyrics-toggle-translation"
+                >
+                  TR
+                </Button>
+              </span>
+            </Tooltip>
+            <Tooltip title="Toggle pronunciations">
+              <span>
+                <Button
+                  size="small"
+                  onClick={onTogglePronunciation}
+                  disabled={!pronunciationEnabled}
+                  className={clsx(classes.layerToggle, {
+                    [classes.layerToggleActive]: showPronunciation,
+                  })}
+                  data-testid="lyrics-toggle-pronunciation"
+                >
+                  PR
+                </Button>
+              </span>
+            </Tooltip>
           </div>
         </div>
 
@@ -1146,6 +1456,7 @@ const KaraokeLyricsOverlay = ({
           <LyricsSettingsPopover
             settings={lyricsSettings}
             onChange={handleSettingsChange}
+            onReset={handleResetAppearance}
           />
           <IconButton
             className={classes.closeButton}
@@ -1159,30 +1470,40 @@ const KaraokeLyricsOverlay = ({
       </div>
 
       <div className={classes.body} ref={bodyRef}>
-        <div className={classes.lines}>
+        <div className={classes.lines} style={{ gap: lineGap }}>
           <div aria-hidden style={{ height: centerSpacerPx }} />
           {mainLines.map((line, idx) => {
             const trLine = trByMainIndex[idx]
             const prLine = prByMainIndex[idx]
+            const mainNextLineStart = mainLines[idx + 1]?.start ?? null
+            const highlightedMainLine = buildHighlightedMainLine(
+              line,
+              mainNextLineStart,
+            )
+            const highlightedTrLine = buildHighlightedAuxLine(
+              line,
+              trLine,
+              mainNextLineStart,
+            )
+            const highlightedPrLine = buildHighlightedAuxLine(
+              line,
+              prLine,
+              mainNextLineStart,
+            )
             const showTr = shouldShowAuxLine(line, trLine)
             const showPr = shouldShowAuxLine(line, prLine)
             const lineStyle = getMainLineStyle(idx)
-            const auxOpacity =
-              lineStyle.opacity != null ? lineStyle.opacity * 0.85 : 1
-            const trStyle = {
-              opacity: auxOpacity,
-              fontSize: lyricsSettings.tr.fontSize,
-              color: getColorValue(lyricsSettings.tr.colorKey),
-            }
-            const prStyle = {
-              opacity: auxOpacity,
-              fontSize: lyricsSettings.pr.fontSize,
-              color: getColorValue(lyricsSettings.pr.colorKey),
-            }
+            const trStyle = getAuxLineStyle(idx, 'tr')
+            const prStyle = getAuxLineStyle(idx, 'pr')
             return (
               <div
                 key={`line-${line.index}-${line.start ?? idx}`}
-                ref={idx === activeIndex ? activeLineRef : null}
+                ref={
+                  idx === activeIndex && hasTimedMainLines
+                    ? activeLineRef
+                    : null
+                }
+                className={classes.lineGroup}
                 style={{ cursor: line.start != null ? 'pointer' : undefined }}
                 onClick={() => {
                   if (audioInstance && line.start != null) {
@@ -1190,33 +1511,35 @@ const KaraokeLyricsOverlay = ({
                   }
                 }}
               >
-                {showTr && (
-                  <KaraokeLineRow
-                    line={trLine}
-                    nextLineStart={null}
-                    renderPlaybackMs={renderPlaybackMs}
-                    className={classes.inlineTr}
-                    style={trStyle}
-                    tokenClassName={classes.token}
-                    highlightTokens={false}
-                  />
-                )}
                 <KaraokeLineRow
-                  line={line}
-                  nextLineStart={mainLines[idx + 1]?.start ?? null}
+                  line={highlightedMainLine}
+                  nextLineStart={mainNextLineStart}
                   renderPlaybackMs={renderPlaybackMs}
                   className={classes.line}
                   style={lineStyle}
                   tokenClassName={classes.token}
+                  highlightTokens={hasTimedMainLines}
                 />
                 {showPr && (
                   <KaraokeLineRow
-                    line={prLine}
+                    line={highlightedPrLine}
                     nextLineStart={null}
                     renderPlaybackMs={renderPlaybackMs}
                     className={classes.inlinePr}
                     style={prStyle}
                     tokenClassName={classes.token}
+                    highlightTokens={hasTimedMainLines}
+                  />
+                )}
+                {showTr && (
+                  <KaraokeLineRow
+                    line={highlightedTrLine}
+                    nextLineStart={null}
+                    renderPlaybackMs={renderPlaybackMs}
+                    className={classes.inlineTr}
+                    style={trStyle}
+                    tokenClassName={classes.token}
+                    highlightTokens={hasTimedMainLines}
                   />
                 )}
               </div>
diff --git a/ui/src/audioplayer/KaraokeLyricsOverlay.test.jsx b/ui/src/audioplayer/KaraokeLyricsOverlay.test.jsx
new file mode 100644
index 000000000..411116eae
--- /dev/null
+++ b/ui/src/audioplayer/KaraokeLyricsOverlay.test.jsx
@@ -0,0 +1,344 @@
+import React from 'react'
+import {
+  cleanup,
+  fireEvent,
+  render,
+  screen,
+  waitFor,
+} from '@testing-library/react'
+import KaraokeLyricsOverlay from './KaraokeLyricsOverlay'
+
+const DEFAULT_LINE_HEIGHT_TEXT = '1.30' // readout before any slider change
+const NEXT_LINE_HEIGHT_TEXT = '1.32' // readout after one ArrowRight step
+
+const audioInstance = {
+  currentTime: 0, // overridden per test to position playback
+  paused: true,
+  seeking: false,
+  playbackRate: 1,
+}
+
+// Fixture factory: a synced lyric of the given kind/lang whose only line
+// carries `value` and starts at 1000.
+const buildLyric = (kind, lang, value) => {
+  const line = [{ start: 1000, value }]
+  return { kind, lang, synced: true, line }
+}
+
+// Renders the overlay with a Japanese fixture; `props` override any default.
+const renderOverlay = (props = {}) => {
+  const defaults = {
+    visible: true,
+    mainLyric: buildLyric('main', 'ja', 'こんにちは'),
+    translationLyric: buildLyric('translation', 'en', 'Hello'),
+    pronunciationLyric: buildLyric('pronunciation', 'ja-Latn', 'konnichiwa'),
+    showTranslation: false,
+    showPronunciation: true,
+    translationEnabled: true,
+    pronunciationEnabled: true,
+    onToggleTranslation: () => {},
+    onTogglePronunciation: () => {},
+    audioInstance,
+    onClose: () => {},
+  }
+  return render(<KaraokeLyricsOverlay {...defaults} {...props} />)
+}
+
+describe('<KaraokeLyricsOverlay /> behavior', () => {
+  beforeEach(() => {
+    localStorage.clear() // no persisted lyrics settings between tests
+    window.innerWidth = 1200 // above the 810px compact breakpoint
+    window.innerHeight = 900
+    vi.spyOn(window, 'requestAnimationFrame').mockImplementation(() => 1)
+    vi.spyOn(window, 'cancelAnimationFrame').mockImplementation(() => {})
+  })
+
+  afterEach(() => {
+    cleanup() // unmount first so effect teardown still sees the rAF stubs
+    vi.restoreAllMocks()
+  })
+
+  it('shows tooltips for translation, pronunciation, and appearance controls', async () => {
+    renderOverlay() // default fixture; the popover is never opened here
+
+    fireEvent.mouseOver(screen.getByTestId('lyrics-toggle-translation'))
+    expect(await screen.findByText('Toggle translations')).toBeInTheDocument()
+
+    fireEvent.mouseOver(screen.getByTestId('lyrics-toggle-pronunciation'))
+    expect(await screen.findByText('Toggle pronunciations')).toBeInTheDocument()
+
+    fireEvent.mouseOver(screen.getByTestId('lyrics-settings-button'))
+    expect(await screen.findByText('Appearance')).toBeInTheDocument()
+  })
+
+  it('renders the appearance popup with Main label and default line height for older settings', async () => {
+    localStorage.setItem(
+      'karaoke-lyrics-settings',
+      JSON.stringify({
+        tr: { fontSize: 16, colorKey: 'blue' },
+        main: { fontSize: 26, colorKey: 'white' },
+        pr: { fontSize: 15, colorKey: 'green' }, // no lineHeight/overlayHeight
+      }),
+    )
+
+    renderOverlay()
+
+    fireEvent.click(screen.getByTestId('lyrics-settings-button'))
+
+    expect(await screen.findByText('Appearance')).toBeInTheDocument()
+    expect(screen.getByText('Main', { selector: 'div' })).toBeInTheDocument()
+    expect(screen.queryByText('Default')).not.toBeInTheDocument() // old label
+    expect(screen.getByTestId('lyrics-reset-appearance')).toBeInTheDocument()
+    expect(screen.getByTestId('lyrics-line-height-value')).toHaveTextContent(
+      DEFAULT_LINE_HEIGHT_TEXT, // missing lineHeight falls back to the default
+    )
+  })
+
+  it('renders the lyric group in main, pronunciation, translation order with layer badges', () => {
+    renderOverlay({
+      showTranslation: true, // false in the renderOverlay defaults
+      showPronunciation: true,
+    })
+
+    const mainLine = screen.getByText('こんにちは')
+    const pronunciationLine = screen.getByText('konnichiwa')
+    const translationLine = screen.getByText('Hello')
+
+    expect(
+      mainLine.compareDocumentPosition(pronunciationLine) &
+        Node.DOCUMENT_POSITION_FOLLOWING,
+    ).toBeTruthy()
+    expect(
+      pronunciationLine.compareDocumentPosition(translationLine) &
+        Node.DOCUMENT_POSITION_FOLLOWING,
+    ).toBeTruthy()
+
+    expect(screen.getByTestId('lyrics-language-badge-main')).toHaveTextContent(
+      'Mainja', // textContent of the label and lang spans
+    )
+    expect(screen.getByTestId('lyrics-language-badge-pr')).toHaveTextContent(
+      'PRja-Latn',
+    )
+    expect(screen.getByTestId('lyrics-language-badge-tr')).toHaveTextContent(
+      'TRen',
+    )
+  })
+
+  it('renders line-timed rows as whole-line spans without synthetic token splits', () => {
+    renderOverlay({
+      mainLyric: {
+        kind: 'main',
+        lang: 'en',
+        synced: true,
+        line: [
+          { start: 1000, end: 2400, value: 'Batter up, batter up, batter up' },
+        ],
+      },
+      translationLyric: {
+        kind: 'translation',
+        lang: 'ja',
+        synced: true,
+        line: [
+          {
+            start: 1000,
+            end: 2400,
+            value: 'バッターアップ、バッターアップ、バッターアップ',
+          },
+        ],
+      },
+      pronunciationLyric: {
+        kind: 'pronunciation',
+        lang: 'ja-Latn',
+        synced: true,
+        line: [
+          {
+            start: 1000,
+            end: 2400,
+            value: 'Battaa appu, battaa appu, battaa appu',
+          },
+        ],
+      },
+      showTranslation: true, // false in the renderOverlay defaults
+      showPronunciation: true,
+    })
+
+    const mainLine = screen.getByText(
+      'Batter up, batter up, batter up',
+    ).parentElement
+    const pronunciationLine = screen.getByText(
+      'Battaa appu, battaa appu, battaa appu',
+    ).parentElement
+    const translationLine = screen.getByText(
+      'バッターアップ、バッターアップ、バッターアップ',
+    ).parentElement
+
+    expect(mainLine.querySelectorAll('span')).toHaveLength(1) // single span
+    expect(pronunciationLine.querySelectorAll('span')).toHaveLength(1)
+    expect(translationLine.querySelectorAll('span')).toHaveLength(1)
+  })
+
+  it('highlights line-timed pronunciation and translation rows with the active main line', () => {
+    renderOverlay({
+      mainLyric: {
+        kind: 'main',
+        lang: 'en',
+        synced: true,
+        line: [
+          { start: 1000, end: 1800, value: 'Line one' },
+          { start: 2500, end: 3300, value: 'Line two' },
+        ],
+      },
+      translationLyric: {
+        kind: 'translation',
+        lang: 'ja',
+        synced: true,
+        line: [
+          { start: 1000, end: 1800, value: '一行目' },
+          { start: 2500, end: 3300, value: '二行目' },
+        ],
+      },
+      pronunciationLyric: {
+        kind: 'pronunciation',
+        lang: 'ja-Latn',
+        synced: true,
+        line: [
+          { start: 1000, end: 1800, value: 'ichigyoume' },
+          { start: 2500, end: 3300, value: 'nigyoume' },
+        ],
+      },
+      showTranslation: true,
+      showPronunciation: true,
+      audioInstance: {
+        ...audioInstance,
+        currentTime: 1.2, // presumably seconds: inside 'Line one'
+      },
+    })
+
+    const activePronunciation = screen.getByText('ichigyoume').parentElement
+    const inactivePronunciation = screen.getByText('nigyoume').parentElement
+    const activeTranslation = screen.getByText('一行目').parentElement
+    const inactiveTranslation = screen.getByText('二行目').parentElement
+
+    expect(parseFloat(activePronunciation.style.opacity)).toBeGreaterThan(
+      parseFloat(inactivePronunciation.style.opacity),
+    )
+    expect(parseFloat(activeTranslation.style.opacity)).toBeGreaterThan(
+      parseFloat(inactiveTranslation.style.opacity),
+    )
+  })
+
+  it('renders untimed text lyrics in manual reading mode without a pinned active line', () => {
+    renderOverlay({
+      mainLyric: {
+        kind: 'main',
+        lang: 'en',
+        synced: false, // and the lines below carry no start/end
+        line: [{ value: 'First plain line' }, { value: 'Second plain line' }],
+      },
+      translationLyric: null,
+      pronunciationLyric: null,
+      showTranslation: false,
+      showPronunciation: false,
+      translationEnabled: false,
+      pronunciationEnabled: false,
+    })
+
+    const firstLine = screen.getByText('First plain line').parentElement
+    const secondLine = screen.getByText('Second plain line').parentElement
+
+    expect(firstLine.style.opacity).toBe('1') // no dimming of any row
+    expect(secondLine.style.opacity).toBe('1')
+    expect(firstLine.style.color).toBe(secondLine.style.color)
+  })
+
+  it('persists line height changes, keeps aux line spacing fixed, and stores overlay height', async () => {
+    renderOverlay({
+      mainLyric: buildLyric('main', 'en', 'Hello world'),
+      translationLyric: buildLyric('translation', 'es', 'Hola'),
+      pronunciationLyric: buildLyric('pronunciation', 'en-Latn', 'heh-loh'),
+      showTranslation: true,
+      showPronunciation: true,
+      translationEnabled: true,
+      pronunciationEnabled: true,
+    })
+
+    const overlay = screen.getByTestId('karaoke-lyrics-overlay')
+    const mainLine = screen.getByText('Hello world').parentElement
+    const pronunciationLine = screen.getByText('heh-loh').parentElement
+    expect(mainLine).toHaveStyle(`line-height: ${DEFAULT_LINE_HEIGHT_TEXT}`)
+    expect(pronunciationLine).toHaveStyle('line-height: 1.2')
+
+    fireEvent.click(screen.getByTestId('lyrics-settings-button'))
+
+    const slider = screen.getByRole('slider', { name: 'Line height' })
+    slider.focus()
+    fireEvent.keyDown(slider, { key: 'ArrowRight' }) // one slider step
+
+    await waitFor(() =>
+      expect(screen.getByTestId('lyrics-line-height-value')).toHaveTextContent(
+        NEXT_LINE_HEIGHT_TEXT,
+      ),
+    )
+
+    await waitFor(() =>
+      expect(mainLine).toHaveStyle(`line-height: ${NEXT_LINE_HEIGHT_TEXT}`),
+    )
+    expect(pronunciationLine).toHaveStyle('line-height: 1.2') // unchanged
+
+    fireEvent.mouseDown(screen.getByTestId('lyrics-resize-handle'), {
+      clientY: 400, // drag starts here
+    })
+    fireEvent.mouseMove(window, { clientY: 360 }) // 40px up -> +40px tall
+    fireEvent.mouseUp(window)
+
+    await waitFor(() => expect(overlay).toHaveStyle('height: 340px'))
+
+    const stored = JSON.parse(localStorage.getItem('karaoke-lyrics-settings'))
+    expect(stored.lineHeight).toBeCloseTo(1.32, 2)
+    expect(stored.overlayHeight).toBe(340)
+  })
+
+  it('resets appearance back to the default spacing and overlay height', async () => {
+    localStorage.setItem(
+      'karaoke-lyrics-settings',
+      JSON.stringify({
+        lineHeight: 1.8, // non-default values that reset must discard
+        overlayHeight: 420,
+        tr: { fontSize: 16, colorKey: 'yellow' },
+        main: { fontSize: 28, colorKey: 'cyan' },
+        pr: { fontSize: 15, colorKey: 'pink' },
+      }),
+    )
+
+    renderOverlay({
+      mainLyric: buildLyric('main', 'en', 'Hello world'),
+      translationLyric: null,
+      pronunciationLyric: null,
+      showPronunciation: false,
+      translationEnabled: false,
+      pronunciationEnabled: false,
+    })
+
+    const overlay = screen.getByTestId('karaoke-lyrics-overlay')
+    const mainLine = screen.getByText('Hello world').parentElement
+    expect(overlay).toHaveStyle('height: 420px') // stored values are applied
+    expect(mainLine).toHaveStyle('line-height: 1.8')
+
+    fireEvent.click(screen.getByTestId('lyrics-settings-button'))
+    fireEvent.click(screen.getByTestId('lyrics-reset-appearance'))
+
+    await waitFor(() =>
+      expect(screen.getByTestId('lyrics-line-height-value')).toHaveTextContent(
+        DEFAULT_LINE_HEIGHT_TEXT,
+      ),
+    )
+    await waitFor(() => expect(overlay).toHaveStyle('height: 300px'))
+    await waitFor(() =>
+      expect(mainLine).toHaveStyle(`line-height: ${DEFAULT_LINE_HEIGHT_TEXT}`),
+    )
+
+    const stored = JSON.parse(localStorage.getItem('karaoke-lyrics-settings'))
+    expect(stored.lineHeight).toBeCloseTo(1.3, 2)
+    expect(stored.overlayHeight).toBe(300) // reset is persisted as well
+  })
+})
diff --git a/ui/src/audioplayer/Player.jsx b/ui/src/audioplayer/Player.jsx
index b8b33b6d5..c6e73c916 100644
--- a/ui/src/audioplayer/Player.jsx
+++ b/ui/src/audioplayer/Player.jsx
@@ -40,6 +40,10 @@ import {
   selectLyricLayers,
   structuredLyricToLrc,
 } from './lyrics'
+import {
+  resolveLyricsOverlayState,
+  togglePronunciationPreference,
+} from './lyricsOverlayState'
 import KaraokeLyricsOverlay from './KaraokeLyricsOverlay'
 
 const emptyLyricLayers = {
@@ -143,11 +147,12 @@ const Player = () => {
   const lyricCacheRef = useRef(new Map())
   const lyricRequestIdRef = useRef(0)
   const playerRef = useRef(null)
-  const [karaokeVisible, setKaraokeVisible] = useState(false)
+  const [karaokeVisiblePreference, setKaraokeVisiblePreference] =
+    useState(false)
   const [selectedLyricLayers, setSelectedLyricLayers] =
     useState(emptyLyricLayers)
-  const [showTranslation, setShowTranslation] = useState(false)
-  const [showPronunciation, setShowPronunciation] = useState(false)
+  const [translationPreference, setTranslationPreference] = useState(false)
+  const [pronunciationPreference, setPronunciationPreference] = useState(null)
   const currentTrackId = playerState.current?.trackId
   const currentTrackIsRadio = playerState.current?.isRadio
   const selectedStructuredLyric = selectedLyricLayers.main
@@ -158,6 +163,15 @@ const Player = () => {
   const hasPronunciationLyric = hasStructuredLyricContent(
     selectedLyricLayers.pronunciation,
   )
+  const { karaokeVisible, showTranslation, showPronunciation } =
+    resolveLyricsOverlayState({
+      karaokeVisiblePreference,
+      translationPreference,
+      pronunciationPreference,
+      hasKaraokeLyric,
+      hasTranslationLyric,
+      hasPronunciationLyric,
+    })
 
   const applyLyricToRuntimePlayer = useCallback((trackId, lyric) => {
     if (!trackId) {
@@ -255,9 +269,6 @@ const Player = () => {
   useEffect(() => {
     if (!currentTrackId || currentTrackIsRadio) {
       setSelectedLyricLayers(emptyLyricLayers)
-      setShowTranslation(false)
-      setShowPronunciation(false)
-      setKaraokeVisible(false)
       return
     }
 
@@ -273,8 +284,6 @@ const Player = () => {
       }
     }
     setSelectedLyricLayers(layers)
-    setShowTranslation(false)
-    setShowPronunciation(hasStructuredLyricContent(layers.pronunciation))
   }, [currentTrackId, currentTrackIsRadio])
 
   useEffect(() => {
@@ -297,10 +306,6 @@ const Player = () => {
             : normalizeLyricLayers({ main: cached?.structuredLyric })
 
       setSelectedLyricLayers(cachedLayers)
-      setShowTranslation(false)
-      setShowPronunciation(
-        hasStructuredLyricContent(cachedLayers.pronunciation),
-      )
       if (cachedLyric) {
         dispatch(updateQueueLyric(currentTrackId, cachedLyric))
         applyLyricToRuntimePlayer(currentTrackId, cachedLyric)
@@ -327,8 +332,6 @@ const Player = () => {
           layers,
         })
         setSelectedLyricLayers(layers)
-        setShowTranslation(false)
-        setShowPronunciation(hasStructuredLyricContent(layers.pronunciation))
 
         if (lyric !== '') {
           dispatch(updateQueueLyric(currentTrackId, lyric))
@@ -340,19 +343,11 @@ const Player = () => {
           return
         }
         setSelectedLyricLayers(emptyLyricLayers)
-        setShowTranslation(false)
-        setShowPronunciation(false)
         // Do not cache network/request failures as empty lyrics, so we can retry.
         lyricCacheRef.current.delete(currentTrackId)
       })
   }, [dispatch, currentTrackId, currentTrackIsRadio, applyLyricToRuntimePlayer])
 
-  useEffect(() => {
-    if (!hasKaraokeLyric && karaokeVisible) {
-      setKaraokeVisible(false)
-    }
-  }, [hasKaraokeLyric, karaokeVisible])
-
   const defaultOptions = useMemo(
     () => ({
       theme: playerTheme,
@@ -404,7 +399,9 @@ const Player = () => {
         <PlayerToolbar
           id={current.trackId}
           isRadio={current.isRadio}
-          onToggleLyrics={() => setKaraokeVisible((visible) => !visible)}
+          onToggleLyrics={() =>
+            setKaraokeVisiblePreference((visible) => !visible)
+          }
           lyricsActive={karaokeVisible}
           lyricsDisabled={!hasKaraokeLyric}
         />
@@ -616,17 +613,17 @@ const Player = () => {
         translationEnabled={hasTranslationLyric}
         pronunciationEnabled={hasPronunciationLyric}
         onToggleTranslation={() =>
-          setShowTranslation((previous) =>
+          setTranslationPreference((previous) =>
             hasTranslationLyric ? !previous : false,
           )
         }
         onTogglePronunciation={() =>
-          setShowPronunciation((previous) =>
-            hasPronunciationLyric ? !previous : false,
+          setPronunciationPreference((previous) =>
+            togglePronunciationPreference(previous, hasPronunciationLyric),
           )
         }
         audioInstance={audioInstance}
-        onClose={() => setKaraokeVisible(false)}
+        onClose={() => setKaraokeVisiblePreference(false)}
       />
       <GlobalHotKeys handlers={handlers} keyMap={keyMap} allowChanges />
     </ThemeProvider>
diff --git a/ui/src/audioplayer/Player.lyricsState.test.jsx b/ui/src/audioplayer/Player.lyricsState.test.jsx
new file mode 100644
index 000000000..c47abea76
--- /dev/null
+++ b/ui/src/audioplayer/Player.lyricsState.test.jsx
@@ -0,0 +1,77 @@
+import {
+  resolveLyricsOverlayState,
+  togglePronunciationPreference,
+} from './lyricsOverlayState'
+
+describe('Player lyrics state helpers', () => {
+  it('keeps the lyrics window preference across track changes in the session', () => {
+    const visibleOnCurrentTrack = resolveLyricsOverlayState({
+      karaokeVisiblePreference: true,
+      translationPreference: false,
+      pronunciationPreference: null, // null: no explicit choice, follows availability
+      hasKaraokeLyric: true,
+      hasTranslationLyric: true,
+      hasPronunciationLyric: true,
+    })
+    expect(visibleOnCurrentTrack.karaokeVisible).toBe(true)
+
+    const hiddenForTrackWithoutLyrics = resolveLyricsOverlayState({ // same preference, no lyrics
+      karaokeVisiblePreference: true,
+      translationPreference: false,
+      pronunciationPreference: null,
+      hasKaraokeLyric: false,
+      hasTranslationLyric: false,
+      hasPronunciationLyric: false,
+    })
+    expect(hiddenForTrackWithoutLyrics.karaokeVisible).toBe(false)
+
+    const restoredOnNextLyricsTrack = resolveLyricsOverlayState({ // lyrics available again
+      karaokeVisiblePreference: true,
+      translationPreference: false,
+      pronunciationPreference: null,
+      hasKaraokeLyric: true,
+      hasTranslationLyric: false,
+      hasPronunciationLyric: false,
+    })
+    expect(restoredOnNextLyricsTrack.karaokeVisible).toBe(true)
+  })
+
+  it('restores translation and pronunciation preferences after tracks without those layers', () => {
+    const initialState = resolveLyricsOverlayState({
+      karaokeVisiblePreference: false,
+      translationPreference: false,
+      pronunciationPreference: null,
+      hasKaraokeLyric: true,
+      hasTranslationLyric: true,
+      hasPronunciationLyric: true,
+    })
+    expect(initialState.showTranslation).toBe(false)
+    expect(initialState.showPronunciation).toBe(true) // null preference shows an available layer
+
+    const translationPreference = true
+    const pronunciationPreference = togglePronunciationPreference(null, true) // first toggle: on -> off
+    expect(pronunciationPreference).toBe(false)
+
+    const hiddenOnTrackWithoutAuxLayers = resolveLyricsOverlayState({ // layers missing: both hidden
+      karaokeVisiblePreference: false,
+      translationPreference,
+      pronunciationPreference,
+      hasKaraokeLyric: true,
+      hasTranslationLyric: false,
+      hasPronunciationLyric: false,
+    })
+    expect(hiddenOnTrackWithoutAuxLayers.showTranslation).toBe(false)
+    expect(hiddenOnTrackWithoutAuxLayers.showPronunciation).toBe(false)
+
+    const restoredOnTrackWithAuxLayers = resolveLyricsOverlayState({ // layers back: preferences apply
+      karaokeVisiblePreference: false,
+      translationPreference,
+      pronunciationPreference,
+      hasKaraokeLyric: true,
+      hasTranslationLyric: true,
+      hasPronunciationLyric: true,
+    })
+    expect(restoredOnTrackWithAuxLayers.showTranslation).toBe(true)
+    expect(restoredOnTrackWithAuxLayers.showPronunciation).toBe(false)
+  })
+})
diff --git a/ui/src/audioplayer/lyrics.js b/ui/src/audioplayer/lyrics.js
index 87b218d05..e9cd16d5a 100644
--- a/ui/src/audioplayer/lyrics.js
+++ b/ui/src/audioplayer/lyrics.js
@@ -12,6 +12,9 @@ const padTime = (value) => {
 }
 
 const toTime = (value) => {
+  if (value == null || value === '') { // Number(null) and Number('') are 0, not NaN
+    return null
+  }
   const numeric = Number(value)
   return Number.isFinite(numeric) ? numeric : null
 }
@@ -179,64 +182,6 @@ const lineTimeWindow = (lines, index) => {
   return { start, end }
 }
 
-const buildSyntheticWordTokens = (line, token) => {
-  const text = typeof line?.value === 'string' ? line.value : ''
-  if (!text.trim()) {
-    return null
-  }
-
-  const chunks = text.match(/\S+\s*/g) || []
-  if (chunks.length < 2) {
-    return null
-  }
-
-  const normalizedLine = text.replace(/\s+/g, ' ').trim().toLowerCase()
-  const normalizedTokenValue = (token?.value || '')
-    .replace(/\s+/g, ' ')
-    .trim()
-    .toLowerCase()
-  if (!normalizedTokenValue || !normalizedLine) {
-    return null
-  }
-
-  const compressedLine = normalizedLine.replace(/\s+/g, '')
-  const compressedToken = normalizedTokenValue.replace(/\s+/g, '')
-  const tokenLooksLikeWholeLine =
-    compressedToken === compressedLine ||
-    compressedToken.length >= Math.floor(compressedLine.length * 0.8)
-  if (!tokenLooksLikeWholeLine) {
-    return null
-  }
-
-  const tokenStart = toTime(token?.start)
-  const tokenEnd = toTime(token?.end)
-  const lineStart = toTime(line?.start)
-  const lineEnd = toTime(line?.end)
-
-  const baseStart = tokenStart ?? lineStart
-  const baseEnd = tokenEnd ?? lineEnd
-  if (
-    baseStart == null ||
-    baseEnd == null ||
-    !Number.isFinite(baseStart) ||
-    !Number.isFinite(baseEnd) ||
-    baseEnd <= baseStart
-  ) {
-    return null
-  }
-
-  const duration = baseEnd - baseStart
-  return chunks.map((chunk, idx) => ({
-    start: baseStart + (duration * idx) / chunks.length,
-    end: baseStart + (duration * (idx + 1)) / chunks.length,
-    value: chunk,
-    role: typeof token?.role === 'string' ? token.role : '',
-    agentId: typeof token?.agentId === 'string' ? token.agentId : '',
-    agentName: typeof token?.agentName === 'string' ? token.agentName : '',
-    agentRole: typeof token?.agentRole === 'string' ? token.agentRole : '',
-  }))
-}
-
 export const hasCueTiming = (structuredLyric) =>
   Boolean(
     structuredLyric &&
@@ -449,19 +394,6 @@ export const buildKaraokeLines = (structuredLyric) => {
       }
       return a.index - b.index
     })
-    .map((line) => {
-      const nextLine = { ...line }
-      if (nextLine.tokens.length === 1) {
-        const syntheticTokens = buildSyntheticWordTokens(
-          nextLine,
-          nextLine.tokens[0],
-        )
-        if (syntheticTokens) {
-          nextLine.tokens = syntheticTokens
-        }
-      }
-      return nextLine
-    })
 
   for (let i = 0; i < normalized.length; i += 1) {
     if (normalized[i].end == null) {
@@ -628,6 +560,17 @@ export const getActiveKaraokeState = (lines, currentTimeMs) => {
   return { lineIndex, tokenIndex }
 }
 
+export const hasUsableKaraokeTiming = (lines) =>
+  Array.isArray(lines) && // non-array input yields false
+  lines.some(
+    (line) =>
+      toTime(line?.start) != null || // a line start counts; line.end alone is not checked
+      (Array.isArray(line?.tokens) &&
+        line.tokens.some(
+          (token) => toTime(token?.start) != null || toTime(token?.end) != null,
+        )),
+  )
+
 export const findLayerLineIndexForMain = (mainLines, layerLines, mainIndex) => {
   if (
     !Array.isArray(mainLines) ||
@@ -692,3 +635,8 @@ export const resolveLayerLineForMain = (mainLines, layerLines, mainIndex) => {
     line: index >= 0 ? layerLines[index] : null,
   }
 }
+
+export const buildHighlightedMainLine = (line) => line // identity: the same line object is returned
+
+export const buildHighlightedAuxLine = (_referenceLine, auxiliaryLine) =>
+  auxiliaryLine ?? null // reference line is ignored; a missing auxiliary line becomes null
diff --git a/ui/src/audioplayer/lyrics.test.js b/ui/src/audioplayer/lyrics.test.js
index 3a5f83b2d..2fcf1df40 100644
--- a/ui/src/audioplayer/lyrics.test.js
+++ b/ui/src/audioplayer/lyrics.test.js
@@ -1,8 +1,11 @@
 import {
+  buildHighlightedAuxLine,
+  buildHighlightedMainLine,
   buildKaraokeLines,
   findLayerLineIndexForMain,
   getActiveKaraokeState,
   getPreferredLyricLanguage,
+  hasUsableKaraokeTiming,
   hasStructuredLyricContent,
   pickStructuredLyric,
   resolveKaraokeTokenWindow,
@@ -201,6 +204,110 @@ describe('lyrics helpers', () => {
     )
   })
 
+  it('keeps translation lines line-level when they do not have real cue timing', () => {
+    const mainLine = {
+      index: 0,
+      start: 1000,
+      end: 2200,
+      value: '불을 질러라',
+      tokens: [
+        { start: 1000, end: 1300, value: '불을 ' },
+        { start: 1300, end: 1650, value: '질' },
+        { start: 1650, end: 2200, value: '러라' },
+      ],
+    }
+    const translationLine = {
+      index: 0,
+      start: 1000,
+      end: 2200,
+      value: 'Set it on fire',
+      tokens: [],
+    }
+
+    const highlighted = buildHighlightedAuxLine(mainLine, translationLine, 2600)
+
+    expect(highlighted).toBe(translationLine)
+    expect(highlighted.tokens).toEqual([])
+  })
+
+  it('keeps pronunciation lines line-level when they do not have real cue timing', () => {
+    const mainLine = {
+      index: 0,
+      start: 1000,
+      end: 2200,
+      value: 'You もっと強く 素早く 吹き飛ばせ',
+      tokens: [],
+    }
+    const pronunciationLine = {
+      index: 0,
+      start: 1000,
+      end: 2200,
+      value: 'You motto tsuyoku subayaku fukitobase',
+      tokens: [],
+    }
+
+    const highlighted = buildHighlightedAuxLine(
+      mainLine,
+      pronunciationLine,
+      2600,
+    )
+
+    expect(highlighted).toBe(pronunciationLine)
+    expect(highlighted.tokens).toEqual([])
+  })
+
+  it('keeps main lines line-level when they do not have real cue timing', () => {
+    const line = {
+      index: 0,
+      start: 1000,
+      end: 2200,
+      value: 'Youもっと強く 素早く 吹き飛ばせ',
+      tokens: [],
+    }
+
+    const highlighted = buildHighlightedMainLine(line, 2600)
+
+    expect(highlighted).toBe(line)
+    expect(highlighted.tokens).toEqual([])
+  })
+
+  it('keeps auxiliary lines line-level when end time is missing and they lack cues', () => {
+    const mainLine = {
+      index: 0,
+      start: 1000,
+      end: null,
+      value: 'Hello there',
+      tokens: [],
+    }
+    const translationLine = {
+      index: 0,
+      start: 1000,
+      end: null,
+      value: 'Bonjour toi',
+      tokens: [],
+    }
+
+    const highlighted = buildHighlightedAuxLine(mainLine, translationLine, 2400)
+
+    expect(highlighted).toBe(translationLine)
+    expect(highlighted.tokens).toEqual([])
+  })
+
+  it('keeps main lines line-level when end time is missing and they lack cues', () => {
+    const line = {
+      index: 0,
+      start: 1000,
+      end: null,
+      value: 'One more time',
+      tokens: [],
+    }
+
+    const highlighted = buildHighlightedMainLine(line, 2400)
+
+    expect(highlighted).toBe(line)
+    expect(highlighted.tokens).toEqual([])
+  })
+
   it('returns no layer match when the nearest line is too far in time', () => {
     const mainLines = [
       { index: 0, start: 1000, end: 1800, value: 'Line A', tokens: [] },
@@ -353,7 +460,7 @@ describe('lyrics helpers', () => {
     ])
   })
 
-  it('splits a single full-line token into synthetic word tokens', () => {
+  it('keeps a single full-line token unchanged instead of expanding it synthetically', () => {
     const lines = buildKaraokeLines({
       lang: 'ko-Latn',
       synced: true,
@@ -371,17 +478,13 @@ describe('lyrics helpers', () => {
     })
 
     expect(lines).toHaveLength(1)
-    expect(lines[0].tokens).toHaveLength(2)
-    expect(lines[0].tokens[0].value).toBe('Da-la-lun, ')
-    expect(lines[0].tokens[1].value).toBe('dun')
+    expect(lines[0].tokens).toHaveLength(1)
+    expect(lines[0].tokens[0].value).toBe('Da-la-lun, dun')
 
     const firstWindow = resolveKaraokeTokenWindow(lines[0], 0)
-    const secondWindow = resolveKaraokeTokenWindow(lines[0], 1)
 
     expect(firstWindow.start).toBeCloseTo(1000)
-    expect(firstWindow.end).toBeCloseTo(1500)
-    expect(secondWindow.start).toBeCloseTo(1500)
-    expect(secondWindow.end).toBeCloseTo(2000)
+    expect(firstWindow.end).toBeCloseTo(2000)
   })
 
   it('detects active line and token for karaoke timing', () => {
@@ -509,4 +612,19 @@ describe('lyrics helpers', () => {
       }),
     ).toBe(true)
   })
+
+  it('detects when built karaoke lines have no usable timing', () => {
+    expect(
+      hasUsableKaraokeTiming([
+        { index: 0, value: 'First line', tokens: [] },
+        { index: 1, value: 'Second line', tokens: [] },
+      ]),
+    ).toBe(false)
+
+    expect(
+      hasUsableKaraokeTiming([
+        { index: 0, start: 1000, value: 'Timed line', tokens: [] },
+      ]),
+    ).toBe(true)
+  })
 })
diff --git a/ui/src/audioplayer/lyricsOverlayState.js b/ui/src/audioplayer/lyricsOverlayState.js
new file mode 100644
index 000000000..e8ff0e0a8
--- /dev/null
+++ b/ui/src/audioplayer/lyricsOverlayState.js
@@ -0,0 +1,27 @@
+export const resolveLyricsOverlayState = ({
+  karaokeVisiblePreference,
+  translationPreference,
+  pronunciationPreference,
+  hasKaraokeLyric,
+  hasTranslationLyric,
+  hasPronunciationLyric,
+}) => ({
+  karaokeVisible: karaokeVisiblePreference && hasKaraokeLyric, // preference gated by availability
+  showTranslation: translationPreference && hasTranslationLyric,
+  showPronunciation:
+    (pronunciationPreference == null // null/undefined: no stored choice, follow availability
+      ? hasPronunciationLyric
+      : pronunciationPreference) && hasPronunciationLyric,
+})
+
+export const togglePronunciationPreference = (
+  previousPreference,
+  hasPronunciationLyric,
+) => {
+  if (!hasPronunciationLyric) { // no pronunciation layer: the result is always false
+    return false
+  }
+  const currentPreference = // null/undefined counts as "on" here, since the layer is available
+    previousPreference == null ? hasPronunciationLyric : previousPreference
+  return !currentPreference
+}

From aeae6d221706e392272b8c2697b886144e81b48e Mon Sep 17 00:00:00 2001
From: ranokay <github@ranokay.com>
Date: Tue, 14 Apr 2026 02:19:09 +0300
Subject: [PATCH 11/14] feat(lyrics): require cue byte offsets

---
 core/lyrics/lyrics_test.go                    |  16 +-
 core/lyrics/sources_test.go                   |  14 ++
 core/lyrics/ttml.go                           | 226 +++++++++++++++---
 core/lyrics/ttml_test.go                      |  20 +-
 model/lyrics.go                               |  86 ++++---
 model/lyrics_test.go                          |  34 ++-
 server/subsonic/helpers.go                    |   6 +-
 server/subsonic/media_retrieval_test.go       | 164 +++++++++++--
 server/subsonic/responses/responses.go        |  10 +-
 ui/src/audioplayer/KaraokeLyricsOverlay.jsx   |  67 ++++++
 .../audioplayer/KaraokeLyricsOverlay.test.jsx |  68 ++++++
 ui/src/audioplayer/lyrics.js                  |  80 +++++++
 ui/src/audioplayer/lyrics.test.js             |  56 +++++
 ui/src/subsonic/index.test.js                 |   3 +-
 14 files changed, 730 insertions(+), 120 deletions(-)

diff --git a/core/lyrics/lyrics_test.go b/core/lyrics/lyrics_test.go
index 917c530ac..822e975ce 100644
--- a/core/lyrics/lyrics_test.go
+++ b/core/lyrics/lyrics_test.go
@@ -56,14 +56,18 @@ var _ = Describe("sources", func() {
 					Value: "Lead words",
 					Cue: []model.Cue{
 						{
-							Start: gg.P(int64(1000)),
-							End:   gg.P(int64(1500)),
-							Value: "Lead ",
+							Start:     gg.P(int64(1000)),
+							End:       gg.P(int64(1500)),
+							Value:     "Lead ",
+							ByteStart: 0,
+							ByteEnd:   4,
 						},
 						{
-							Start: gg.P(int64(1500)),
-							End:   gg.P(int64(3000)),
-							Value: "words",
+							Start:     gg.P(int64(1500)),
+							End:       gg.P(int64(3000)),
+							Value:     "words",
+							ByteStart: 5,
+							ByteEnd:   9,
 						},
 					},
 				},
diff --git a/core/lyrics/sources_test.go b/core/lyrics/sources_test.go
index a86c84cd0..5ba03336e 100644
--- a/core/lyrics/sources_test.go
+++ b/core/lyrics/sources_test.go
@@ -108,12 +108,18 @@ var _ = Describe("sources", func() {
 			Expect(*lyrics[0].Line[0].Cue[0].Start).To(Equal(int64(1000)))
 			Expect(lyrics[0].Line[0].Cue[0].Value).To(Equal("Some "))
 			Expect(lyrics[0].Line[0].Cue[0].End).To(Equal(gg.P(int64(1500))))
+			Expect(lyrics[0].Line[0].Cue[0].ByteStart).To(Equal(0))
+			Expect(lyrics[0].Line[0].Cue[0].ByteEnd).To(Equal(4))
 			Expect(*lyrics[0].Line[0].Cue[1].Start).To(Equal(int64(1500)))
 			Expect(lyrics[0].Line[0].Cue[1].Value).To(Equal("lyrics "))
 			Expect(lyrics[0].Line[0].Cue[1].End).To(Equal(gg.P(int64(2000))))
+			Expect(lyrics[0].Line[0].Cue[1].ByteStart).To(Equal(5))
+			Expect(lyrics[0].Line[0].Cue[1].ByteEnd).To(Equal(11))
 			Expect(*lyrics[0].Line[0].Cue[2].Start).To(Equal(int64(2000)))
 			Expect(lyrics[0].Line[0].Cue[2].Value).To(Equal("here"))
 			Expect(lyrics[0].Line[0].Cue[2].End).To(Equal(gg.P(int64(3000))))
+			Expect(lyrics[0].Line[0].Cue[2].ByteStart).To(Equal(12))
+			Expect(lyrics[0].Line[0].Cue[2].ByteEnd).To(Equal(15))
 
 			// Line 2: has inline markers
 			Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(3000))))
@@ -122,6 +128,10 @@ var _ = Describe("sources", func() {
 			Expect(lyrics[0].Line[1].Cue).To(HaveLen(2))
 			Expect(lyrics[0].Line[1].Cue[0].End).To(Equal(gg.P(int64(3500))))
 			Expect(lyrics[0].Line[1].Cue[1].End).To(Equal(gg.P(int64(5000))))
+			Expect(lyrics[0].Line[1].Cue[0].ByteStart).To(Equal(0))
+			Expect(lyrics[0].Line[1].Cue[0].ByteEnd).To(Equal(4))
+			Expect(lyrics[0].Line[1].Cue[1].ByteStart).To(Equal(5))
+			Expect(lyrics[0].Line[1].Cue[1].ByteEnd).To(Equal(9))
 
 			// Line 3: plain line, no cues
 			Expect(lyrics[0].Line[2].Start).To(Equal(gg.P(int64(5000))))
@@ -148,9 +158,13 @@ var _ = Describe("sources", func() {
 			Expect(*lyrics[0].Line[0].Cue[0].Start).To(Equal(int64(1000)))
 			Expect(lyrics[0].Line[0].Cue[0].Value).To(Equal("Lead "))
 			Expect(lyrics[0].Line[0].Cue[0].End).To(Equal(gg.P(int64(1500))))
+			Expect(lyrics[0].Line[0].Cue[0].ByteStart).To(Equal(0))
+			Expect(lyrics[0].Line[0].Cue[0].ByteEnd).To(Equal(4))
 			Expect(*lyrics[0].Line[0].Cue[1].Start).To(Equal(int64(1500)))
 			Expect(lyrics[0].Line[0].Cue[1].Value).To(Equal("words"))
 			Expect(lyrics[0].Line[0].Cue[1].End).To(Equal(gg.P(int64(3000))))
+			Expect(lyrics[0].Line[0].Cue[1].ByteStart).To(Equal(5))
+			Expect(lyrics[0].Line[0].Cue[1].ByteEnd).To(Equal(9))
 
 			Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(3000))))
 			Expect(lyrics[0].Line[1].Value).To(Equal("Fallback line"))
diff --git a/core/lyrics/ttml.go b/core/lyrics/ttml.go
index a02fa52d8..6e4ce9da3 100644
--- a/core/lyrics/ttml.go
+++ b/core/lyrics/ttml.go
@@ -10,6 +10,7 @@ import (
 	"sort"
 	"strconv"
 	"strings"
+	"unicode"
 
 	"github.com/navidrome/navidrome/log"
 	"github.com/navidrome/navidrome/model"
@@ -78,6 +79,11 @@ type ttmlDefinedAgent struct {
 	Name string
 }
 
+type ttmlPiece struct {
+	raw string     // text of this fragment
+	cue *model.Cue // cue attached to the fragment; nil for plain character data and <br>
+}
+
 type ttmlParser struct {
 	decoder *xml.Decoder
 	params  ttmlTimingParams
@@ -294,7 +300,7 @@ func (p *ttmlParser) parseMetadataText(start xml.StartElement, parent ttmlTiming
 	forKey, hasFor := attrValue(start.Attr, "for")
 	forKey = strings.TrimSpace(forKey)
 
-	value, tokens, err := p.parseInlineElement(start, parent)
+	pieces, err := p.parseInlineElement(start, parent)
 	if err != nil {
 		return ttmlMetadataEntry{}, false, err
 	}
@@ -307,7 +313,8 @@ func (p *ttmlParser) parseMetadataText(start xml.StartElement, parent ttmlTiming
 		return ttmlMetadataEntry{}, false, nil
 	}
 
-	line := model.Line{Value: sanitizeTTMLText(value)}
+	value, tokens := buildTTMLLineFromPieces(pieces)
+	line := model.Line{Value: value}
 	if ctx.hasBegin {
 		startMs := ctx.begin
 		line.Start = &startMs
@@ -329,8 +336,7 @@ func (p *ttmlParser) parseMetadataText(start xml.StartElement, parent ttmlTiming
 }
 
 func (p *ttmlParser) parseParagraph(parent ttmlTimingContext) (string, []model.Cue, error) {
-	var text strings.Builder
-	var tokens []model.Cue
+	var pieces []ttmlPiece
 
 	for {
 		token, err := p.decoder.Token()
@@ -340,26 +346,26 @@ func (p *ttmlParser) parseParagraph(parent ttmlTimingContext) (string, []model.C
 
 		switch t := token.(type) {
 		case xml.StartElement:
-			value, inlineTokens, err := p.parseInlineElement(t, parent)
+			inlinePieces, err := p.parseInlineElement(t, parent)
 			if err != nil {
 				return "", nil, err
 			}
-			text.WriteString(value)
-			tokens = append(tokens, inlineTokens...)
+			pieces = append(pieces, inlinePieces...)
 		case xml.EndElement:
 			if strings.EqualFold(t.Name.Local, "p") {
-				return sanitizeTTMLText(text.String()), tokens, nil
+				value, tokens := buildTTMLLineFromPieces(pieces)
+				return value, tokens, nil
 			}
 		case xml.CharData:
-			text.WriteString(string(t))
+			pieces = append(pieces, ttmlPiece{raw: string(t)})
 		}
 	}
 }
 
-func (p *ttmlParser) parseInlineElement(start xml.StartElement, parent ttmlTimingContext) (string, []model.Cue, error) {
+func (p *ttmlParser) parseInlineElement(start xml.StartElement, parent ttmlTimingContext) ([]ttmlPiece, error) {
 	local := strings.ToLower(start.Name.Local)
 	if local == "br" {
-		return "\n", nil, nil
+		return []ttmlPiece{{raw: "\n"}}, nil
 	}
 
 	ctx := p.childContext(start.Attr, parent)
@@ -368,53 +374,203 @@ func (p *ttmlParser) parseInlineElement(start xml.StartElement, parent ttmlTimin
 	_, hasDur := attrValue(start.Attr, "dur")
 	hasOwnTiming := hasBegin || hasEnd || hasDur
 
-	var text strings.Builder
-	var tokens []model.Cue
+	var pieces []ttmlPiece
 
 	for {
 		token, err := p.decoder.Token()
 		if err != nil {
-			return "", nil, err
+			return nil, err
 		}
 
 		switch t := token.(type) {
 		case xml.StartElement:
-			value, inlineTokens, err := p.parseInlineElement(t, ctx)
+			inlinePieces, err := p.parseInlineElement(t, ctx)
 			if err != nil {
-				return "", nil, err
+				return nil, err
 			}
-			text.WriteString(value)
-			tokens = append(tokens, inlineTokens...)
+			pieces = append(pieces, inlinePieces...)
 		case xml.EndElement:
 			if !strings.EqualFold(t.Name.Local, start.Name.Local) {
 				continue
 			}
 
-			value := text.String()
-			tokenText := sanitizeTTMLText(value)
-			if local == "span" && hasOwnTiming && !ctx.invalid && tokenText != "" && len(tokens) == 0 {
-				parsedToken := model.Cue{
-					Value:   tokenText,
-					AgentID: p.resolveCueAgentID(ctx),
+			if local == "span" && hasOwnTiming && !ctx.invalid && !ttmlPiecesContainCue(pieces) {
+				rawValue := concatTTMLPieceRaw(pieces)
+				tokenText := sanitizeTTMLText(rawValue)
+				if tokenText != "" {
+					parsedToken := model.Cue{
+						AgentID: p.resolveCueAgentID(ctx),
+					}
+					if ctx.hasBegin {
+						startMs := ctx.begin
+						parsedToken.Start = &startMs
+					}
+					if ctx.hasEnd {
+						endMs := ctx.end
+						parsedToken.End = &endMs
+					}
+
+					return []ttmlPiece{{
+						raw: rawValue,
+						cue: &parsedToken,
+					}}, nil
 				}
-				if ctx.hasBegin {
-					startMs := ctx.begin
-					parsedToken.Start = &startMs
-				}
-				if ctx.hasEnd {
-					endMs := ctx.end
-					parsedToken.End = &endMs
-				}
-				tokens = append(tokens, parsedToken)
 			}
 
-			return value, tokens, nil
+			return pieces, nil
 		case xml.CharData:
-			text.WriteString(string(t))
+			pieces = append(pieces, ttmlPiece{raw: string(t)})
 		}
 	}
 }
 
+func buildTTMLLineFromPieces(pieces []ttmlPiece) (string, []model.Cue) { // joins pieces into one value ('\n'-separated logical lines) plus cues offset into it
+	finalized := finalizeTTMLLines(splitTTMLPiecesByNewline(pieces))
+	for len(finalized) > 0 && finalized[0].text == "" && len(finalized[0].cues) == 0 { // drop leading empty lines
+		finalized = finalized[1:]
+	}
+	for len(finalized) > 0 { // drop trailing empty lines; empty lines in the middle are kept
+		last := finalized[len(finalized)-1]
+		if last.text != "" || len(last.cues) > 0 {
+			break
+		}
+		finalized = finalized[:len(finalized)-1]
+	}
+
+	var value strings.Builder
+	cues := make([]model.Cue, 0, 8)
+	byteOffset := 0
+	for i, line := range finalized {
+		if i > 0 {
+			value.WriteByte('\n')
+			byteOffset++ // the separator occupies one byte of the joined value
+		}
+		value.WriteString(line.text)
+		for _, cue := range line.cues { // cue is a copy: rebase its line-relative offsets onto the joined value
+			cue.ByteStart += byteOffset
+			cue.ByteEnd += byteOffset
+			cues = append(cues, cue)
+		}
+		byteOffset += len(line.text)
+	}
+
+	return value.String(), cues
+}
+
+type ttmlFinalLine struct {
+	text string      // trimmed text of one logical line
+	cues []model.Cue // cues whose byte offsets are relative to text
+}
+
+func finalizeTTMLLines(lines [][]ttmlPiece) []ttmlFinalLine { // finalizes each logical line independently, preserving order
+	finalized := make([]ttmlFinalLine, 0, len(lines))
+	for _, line := range lines {
+		text, cues := finalizeTTMLLogicalLine(line)
+		finalized = append(finalized, ttmlFinalLine{text: text, cues: cues})
+	}
+	return finalized
+}
+
+func splitTTMLPiecesByNewline(pieces []ttmlPiece) [][]ttmlPiece { // regroups pieces into logical lines, splitting at '\n'
+	lines := [][]ttmlPiece{{}}
+	for _, piece := range pieces {
+		raw := normalizeTTMLPieceRaw(piece.raw)
+		if raw == "" { // pieces that normalize to nothing are dropped
+			continue
+		}
+
+		start := 0
+		for i := 0; i < len(raw); i++ { // byte scan is safe: '\n' never occurs inside a multi-byte UTF-8 sequence
+			if raw[i] != '\n' {
+				continue
+			}
+			if start < i { // emit the text before the newline, if any
+				lines[len(lines)-1] = append(lines[len(lines)-1], ttmlPiece{
+					raw: raw[start:i],
+					cue: cloneTTMLCue(piece.cue), // every segment gets its own copy of the cue
+				})
+			}
+			lines = append(lines, []ttmlPiece{}) // the newline itself opens the next logical line
+			start = i + 1
+		}
+		if start < len(raw) { // remainder after the last newline
+			lines[len(lines)-1] = append(lines[len(lines)-1], ttmlPiece{
+				raw: raw[start:],
+				cue: cloneTTMLCue(piece.cue),
+			})
+		}
+	}
+	return lines
+}
+
+func finalizeTTMLLogicalLine(line []ttmlPiece) (string, []model.Cue) { // returns the trimmed text of one logical line plus its cues, with inclusive byte offsets into that text
+	rawLine := concatTTMLPieceRaw(line)
+	if rawLine == "" {
+		return "", nil
+	}
+
+	leftTrimBytes := len(rawLine) - len(strings.TrimLeftFunc(rawLine, unicode.IsSpace)) // bytes of leading whitespace that trimming removes
+	rightTrimBytes := len(rawLine) - len(strings.TrimRightFunc(rawLine, unicode.IsSpace))
+	trimmedEnd := len(rawLine) - rightTrimBytes // exclusive end of the kept text within rawLine
+	if trimmedEnd < leftTrimBytes {
+		trimmedEnd = leftTrimBytes
+	}
+
+	trimmed := strings.TrimSpace(rawLine)
+	cues := make([]model.Cue, 0, len(line))
+	cursor := 0
+	for _, piece := range line {
+		pieceEnd := cursor + len(normalizeTTMLPieceRaw(piece.raw)) // measure the piece as it was written into rawLine, so offsets cannot drift
+		if piece.cue != nil {
+			byteStart := max(cursor, leftTrimBytes) // clamp the cue to the kept (untrimmed) text
+			byteEnd := min(pieceEnd, trimmedEnd)
+			if byteStart < byteEnd { // cues lying entirely inside trimmed whitespace are dropped
+				cue := *piece.cue
+				cue.Value = rawLine[byteStart:byteEnd]
+				cue.ByteStart = byteStart - leftTrimBytes
+				cue.ByteEnd = byteEnd - leftTrimBytes - 1 // inclusive end offset
+				cues = append(cues, cue)
+			}
+		}
+		cursor = pieceEnd
+	}
+
+	return trimmed, cues
+}
+
+func normalizeTTMLPieceRaw(raw string) string {
+	raw = str.SanitizeText(raw) // NOTE(review): str.SanitizeText is defined elsewhere; confirm what it strips and whether it is idempotent
+	raw = strings.ReplaceAll(raw, "\r\n", "\n")
+	raw = strings.ReplaceAll(raw, "\r", "\n") // any remaining lone CR also becomes LF
+	return raw
+}
+
+func concatTTMLPieceRaw(pieces []ttmlPiece) string { // concatenates the normalized text of all pieces; cues are ignored
+	var raw strings.Builder
+	for _, piece := range pieces {
+		raw.WriteString(normalizeTTMLPieceRaw(piece.raw))
+	}
+	return raw.String()
+}
+
+func ttmlPiecesContainCue(pieces []ttmlPiece) bool { // reports whether any piece carries a cue
+	for _, piece := range pieces {
+		if piece.cue != nil {
+			return true
+		}
+	}
+	return false
+}
+
+func cloneTTMLCue(cue *model.Cue) *model.Cue { // returns a copy of cue, or nil when cue is nil
+	if cue == nil {
+		return nil
+	}
+
+	cloned := *cue // shallow copy: the Start/End pointers still point at the original's values
+	return &cloned
+}
+
 func (p *ttmlParser) toLyricList() model.LyricList {
 	res := make(model.LyricList, 0, len(p.mainLangOrder)+len(p.translationLangOrder)+len(p.pronunciationLangOrder))
 	for _, lang := range p.mainLangOrder {
diff --git a/core/lyrics/ttml_test.go b/core/lyrics/ttml_test.go
index 4e81197d4..5f9092e36 100644
--- a/core/lyrics/ttml_test.go
+++ b/core/lyrics/ttml_test.go
@@ -141,9 +141,9 @@ var _ = Describe("parseTTML", func() {
 			Expect(line.End).To(Equal(gg.P(int64(3000))))
 			Expect(line.Cue).To(HaveLen(3))
 
-			Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(1000)), End: gg.P(int64(1400)), Value: "He", AgentID: "main"}))
-			Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(1400)), End: gg.P(int64(1800)), Value: "llo", AgentID: "main"}))
-			Expect(line.Cue[2]).To(Equal(model.Cue{Start: gg.P(int64(2000)), End: gg.P(int64(2500)), Value: "echo", AgentID: "__nd_bg__|main"}))
+			Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(1000)), End: gg.P(int64(1400)), Value: "He", ByteStart: 0, ByteEnd: 1, AgentID: "main"}))
+			Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(1400)), End: gg.P(int64(1800)), Value: "llo", ByteStart: 2, ByteEnd: 4, AgentID: "main"}))
+			Expect(line.Cue[2]).To(Equal(model.Cue{Start: gg.P(int64(2000)), End: gg.P(int64(2500)), Value: "echo", ByteStart: 6, ByteEnd: 9, AgentID: "__nd_bg__|main"}))
 		})
 
 		It("should parse named TTML agents into main, voice, and group roles", func() {
@@ -241,8 +241,8 @@ var _ = Describe("parseTTML", func() {
 			Expect(line.Value).To(Equal("go\ngo"))
 			Expect(line.End).To(Equal(gg.P(int64(45570))))
 			Expect(line.Cue).To(HaveLen(2))
-			Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(43444)), End: gg.P(int64(43716)), Value: "go"}))
-			Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(43716)), End: gg.P(int64(43887)), Value: "go"}))
+			Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(43444)), End: gg.P(int64(43716)), Value: "go", ByteStart: 0, ByteEnd: 1}))
+			Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(43716)), End: gg.P(int64(43887)), Value: "go", ByteStart: 3, ByteEnd: 4}))
 		})
 	})
 
@@ -325,8 +325,8 @@ var _ = Describe("parseTTML", func() {
 			Expect(pronunciation.Line[0].Value).To(Equal("konni"))
 			Expect(pronunciation.Line[0].End).To(Equal(gg.P(int64(2600))))
 			Expect(pronunciation.Line[0].Cue).To(HaveLen(2))
-			Expect(pronunciation.Line[0].Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(2000)), End: gg.P(int64(2300)), Value: "ko"}))
-			Expect(pronunciation.Line[0].Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(2300)), End: gg.P(int64(2600)), Value: "nni"}))
+			Expect(pronunciation.Line[0].Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(2000)), End: gg.P(int64(2300)), Value: "ko", ByteStart: 0, ByteEnd: 1}))
+			Expect(pronunciation.Line[0].Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(2300)), End: gg.P(int64(2600)), Value: "nni", ByteStart: 2, ByteEnd: 4}))
 		})
 	})
 
@@ -369,9 +369,9 @@ var _ = Describe("parseTTML", func() {
 			Expect(line.Start).To(Equal(gg.P(int64(2747))))
 			Expect(line.Value).To(Equal("I woke up"))
 			Expect(line.Cue).To(HaveLen(3))
-			Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(2747)), End: gg.P(int64(3018)), Value: "I"}))
-			Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(3018)), End: gg.P(int64(3179)), Value: "woke"}))
-			Expect(line.Cue[2]).To(Equal(model.Cue{Start: gg.P(int64(3179)), End: gg.P(int64(3582)), Value: "up"}))
+			Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(2747)), End: gg.P(int64(3018)), Value: "I", ByteStart: 0, ByteEnd: 0}))
+			Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(3018)), End: gg.P(int64(3179)), Value: "woke", ByteStart: 2, ByteEnd: 5}))
+			Expect(line.Cue[2]).To(Equal(model.Cue{Start: gg.P(int64(3179)), End: gg.P(int64(3582)), Value: "up", ByteStart: 7, ByteEnd: 8}))
 		})
 	})
 })
diff --git a/model/lyrics.go b/model/lyrics.go
index ec0df9f34..9a57ebaad 100644
--- a/model/lyrics.go
+++ b/model/lyrics.go
@@ -6,16 +6,19 @@ import (
 	"slices"
 	"strconv"
 	"strings"
+	"unicode"
 
 	"github.com/navidrome/navidrome/log"
 	"github.com/navidrome/navidrome/utils/str"
 )
 
 type Cue struct {
-	Start   *int64 `structs:"start,omitempty"   json:"start,omitempty"`
-	End     *int64 `structs:"end,omitempty"     json:"end,omitempty"`
-	Value   string `structs:"value"             json:"value"`
-	AgentID string `structs:"agentId,omitempty" json:"agentId,omitempty"`
+	Start     *int64 `structs:"start,omitempty"   json:"start,omitempty"`
+	End       *int64 `structs:"end,omitempty"     json:"end,omitempty"`
+	Value     string `structs:"value"             json:"value"`
+	ByteStart int    `structs:"byteStart"         json:"byteStart"` // byte offset of the cue's first byte in the line value; ByteEnd is its last byte (inclusive)
+	ByteEnd   int    `structs:"byteEnd"           json:"byteEnd"`
+	AgentID   string `structs:"agentId,omitempty" json:"agentId,omitempty"`
 }
 
 type Agent struct {
@@ -127,14 +130,10 @@ func ToLyrics(language, text string) (*Lyrics, error) {
 
 			if validLine {
 				for idx := range timestamps {
-					cues := parseEnhancedCues(priorLine)
-					value := priorLine
-					if cues != nil {
-						value = stripEnhancedMarkers(value)
-					}
+					value, cues := parseEnhancedLine(priorLine)
 					structuredLines = append(structuredLines, Line{
 						Start: &timestamps[idx],
-						Value: strings.TrimSpace(value),
+						Value: value,
 						Cue:   cues,
 					})
 				}
@@ -181,14 +180,10 @@ func ToLyrics(language, text string) (*Lyrics, error) {
 
 	if validLine {
 		for idx := range timestamps {
-			cues := parseEnhancedCues(priorLine)
-			value := priorLine
-			if cues != nil {
-				value = stripEnhancedMarkers(value)
-			}
+			value, cues := parseEnhancedLine(priorLine)
 			structuredLines = append(structuredLines, Line{
 				Start: &timestamps[idx],
-				Value: strings.TrimSpace(value),
+				Value: value,
 				Cue:   cues,
 			})
 		}
@@ -213,21 +208,22 @@ func ToLyrics(language, text string) (*Lyrics, error) {
 	return &lyrics, nil
 }
 
-// parseEnhancedCues extracts word-level timing cues from Enhanced LRC inline markers.
-// Format: <mm:ss.mm>word <mm:ss.mm>word ...
-// Returns nil if no inline markers are found.
-func parseEnhancedCues(text string) []Cue {
+// parseEnhancedLine extracts word-level timing cues from Enhanced LRC inline markers
+// and computes UTF-8 byte offsets against the final stripped line value.
+func parseEnhancedLine(text string) (string, []Cue) {
 	matches := enhancedLRCRegex.FindAllStringSubmatchIndex(text, -1)
 	if len(matches) == 0 {
-		return nil
+		return strings.TrimSpace(text), nil
 	}
 
 	type segment struct {
-		start int64
-		text  string
+		start    int64
+		rawStart int
+		rawEnd   int
 	}
 
 	segments := make([]segment, 0, len(matches))
+	var rawValue strings.Builder
 	for i, match := range matches {
 		timeMs, err := parseTime(
 			// Rewrite <...> as [...] so parseTime can handle it with the same logic
@@ -258,22 +254,46 @@ func parseEnhancedCues(text string) []Cue {
 		if word == "" {
 			continue
 		}
-		segments = append(segments, segment{start: timeMs, text: word})
+
+		rawStart := rawValue.Len()
+		rawValue.WriteString(word)
+		segments = append(segments, segment{
+			start:    timeMs,
+			rawStart: rawStart,
+			rawEnd:   rawValue.Len(),
+		})
 	}
 
 	if len(segments) == 0 {
-		return nil
+		return strings.TrimSpace(stripEnhancedMarkers(text)), nil
 	}
 
-	cues := make([]Cue, len(segments))
-	for i, seg := range segments {
-		start := seg.start
-		cues[i] = Cue{
-			Start: &start,
-			Value: seg.text,
-		}
+	finalRaw := rawValue.String()
+	leftTrimBytes := len(finalRaw) - len(strings.TrimLeftFunc(finalRaw, unicode.IsSpace))
+	rightTrimBytes := len(finalRaw) - len(strings.TrimRightFunc(finalRaw, unicode.IsSpace))
+	trimmedEnd := len(finalRaw) - rightTrimBytes
+	if trimmedEnd < leftTrimBytes {
+		trimmedEnd = leftTrimBytes
 	}
-	return cues
+
+	cues := make([]Cue, 0, len(segments))
+	for _, seg := range segments {
+		start := seg.start
+		byteStart := max(seg.rawStart, leftTrimBytes)
+		byteEnd := min(seg.rawEnd, trimmedEnd)
+		if byteStart >= byteEnd {
+			continue
+		}
+
+		cues = append(cues, Cue{
+			Start:     &start,
+			Value:     finalRaw[byteStart:byteEnd],
+			ByteStart: byteStart - leftTrimBytes,
+			ByteEnd:   byteEnd - leftTrimBytes - 1,
+		})
+	}
+
+	return strings.TrimSpace(finalRaw), cues
 }
 
 // adjustGroup remaps a capture group index from the original match to our rewritten "[...]" string.
diff --git a/model/lyrics_test.go b/model/lyrics_test.go
index 6f189f024..1fa82f258 100644
--- a/model/lyrics_test.go
+++ b/model/lyrics_test.go
@@ -130,9 +130,9 @@ var _ = Describe("ToLyrics", func() {
 		Expect(line0.End).To(Equal(&t3000))
 		Expect(line0.Value).To(Equal("Some lyrics here"))
 		Expect(line0.Cue).To(Equal([]Cue{
-			{Start: &t1000, End: &t1500, Value: "Some "},
-			{Start: &t1500, End: &t2000, Value: "lyrics "},
-			{Start: &t2000, End: &t3000, Value: "here"},
+			{Start: &t1000, End: &t1500, Value: "Some ", ByteStart: 0, ByteEnd: 4},
+			{Start: &t1500, End: &t2000, Value: "lyrics ", ByteStart: 5, ByteEnd: 11},
+			{Start: &t2000, End: &t3000, Value: "here", ByteStart: 12, ByteEnd: 15},
 		}))
 
 		line1 := lyrics.Line[1]
@@ -140,8 +140,8 @@ var _ = Describe("ToLyrics", func() {
 		Expect(line1.End).To(Equal(&t3500))
 		Expect(line1.Value).To(Equal("More words"))
 		Expect(line1.Cue).To(Equal([]Cue{
-			{Start: &t3000, Value: "More "},
-			{Start: &t3500, Value: "words"},
+			{Start: &t3000, Value: "More ", ByteStart: 0, ByteEnd: 4},
+			{Start: &t3500, Value: "words", ByteStart: 5, ByteEnd: 9},
 		}))
 
 		Expect(line1.Cue[1].End).To(BeNil())
@@ -166,8 +166,8 @@ var _ = Describe("ToLyrics", func() {
 		t3000 := int64(3000)
 
 		Expect(lyrics.Line[0].Cue).To(Equal([]Cue{
-			{Start: &t1000, End: &t1500, Value: "Some "},
-			{Start: &t1500, End: &t3000, Value: "lyrics"},
+			{Start: &t1000, End: &t1500, Value: "Some ", ByteStart: 0, ByteEnd: 4},
+			{Start: &t1500, End: &t3000, Value: "lyrics", ByteStart: 5, ByteEnd: 10},
 		}))
 		Expect(lyrics.Line[0].Value).To(Equal("Some lyrics"))
 		Expect(lyrics.Line[0].End).To(Equal(&t3000))
@@ -176,9 +176,25 @@ var _ = Describe("ToLyrics", func() {
 		Expect(lyrics.Line[1].Value).To(Equal("Plain line"))
 
 		Expect(lyrics.Line[2].Cue).To(Equal([]Cue{
-			{Start: &t5000, Value: "More "},
-			{Start: &t5500, Value: "words"},
+			{Start: &t5000, Value: "More ", ByteStart: 0, ByteEnd: 4},
+			{Start: &t5500, Value: "words", ByteStart: 5, ByteEnd: 9},
 		}))
 		Expect(lyrics.Line[2].Value).To(Equal("More words"))
 	})
+
+	It("should preserve byte offsets for Enhanced LRC cues", func() {
+		lyrics, err := ToLyrics("xxx", "[00:00.00]<00:00.00>Oh <00:00.90>love<00:01.30> me <00:01.60>tonight")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(lyrics.Line).To(HaveLen(1))
+
+		t0, t900, t1300, t1600 := int64(0), int64(900), int64(1300), int64(1600)
+		line := lyrics.Line[0]
+		Expect(line.Value).To(Equal("Oh love me tonight"))
+		Expect(line.Cue).To(Equal([]Cue{
+			{Start: &t0, Value: "Oh ", ByteStart: 0, ByteEnd: 2},
+			{Start: &t900, Value: "love", ByteStart: 3, ByteEnd: 6},
+			{Start: &t1300, Value: " me ", ByteStart: 7, ByteEnd: 10},
+			{Start: &t1600, Value: "tonight", ByteStart: 11, ByteEnd: 17},
+		}))
+	})
 })
diff --git a/server/subsonic/helpers.go b/server/subsonic/helpers.go
index ad769ee94..6a14aa4aa 100644
--- a/server/subsonic/helpers.go
+++ b/server/subsonic/helpers.go
@@ -619,8 +619,10 @@ func buildLyricCues(cues []model.Cue, lineEnd *int64) []responses.LyricCue {
 		}
 
 		cue := responses.LyricCue{
-			Start: *cues[i].Start,
-			Value: cues[i].Value,
+			Start:     *cues[i].Start,
+			Value:     cues[i].Value,
+			ByteStart: cues[i].ByteStart,
+			ByteEnd:   cues[i].ByteEnd,
 		}
 		if hasAnyEnd {
 			end := cues[i].End
diff --git a/server/subsonic/media_retrieval_test.go b/server/subsonic/media_retrieval_test.go
index e4f6a21d4..faa90e375 100644
--- a/server/subsonic/media_retrieval_test.go
+++ b/server/subsonic/media_retrieval_test.go
@@ -277,6 +277,8 @@ var _ = Describe("MediaRetrievalController", func() {
 						expectedCue := expectedCueLine.Cue[k]
 						Expect(realCue.Value).To(Equal(expectedCue.Value))
 						Expect(realCue.Start).To(Equal(expectedCue.Start))
+						Expect(realCue.ByteStart).To(Equal(expectedCue.ByteStart))
+						Expect(realCue.ByteEnd).To(Equal(expectedCue.ByteEnd))
 						if expectedCue.End == nil {
 							Expect(realCue.End).To(BeNil())
 						} else {
@@ -514,14 +516,18 @@ var _ = Describe("MediaRetrievalController", func() {
 								Value: "konni",
 								Cue: []responses.LyricCue{
 									{
-										Start: tokenStartA,
-										End:   &tokenEndA,
-										Value: "ko",
+										Start:     tokenStartA,
+										End:       &tokenEndA,
+										ByteStart: 0,
+										ByteEnd:   1,
+										Value:     "ko",
 									},
 									{
-										Start: tokenStartB,
-										End:   &tokenEndB,
-										Value: "nni",
+										Start:     tokenStartB,
+										End:       &tokenEndB,
+										ByteStart: 2,
+										ByteEnd:   4,
+										Value:     "nni",
 									},
 								},
 							},
@@ -552,16 +558,20 @@ var _ = Describe("MediaRetrievalController", func() {
 							Value: "Hello echo",
 							Cue: []model.Cue{
 								{
-									Start:   &tokenStartA,
-									End:     &tokenEndA,
-									Value:   "Hello",
-									AgentID: "lead",
+									Start:     &tokenStartA,
+									End:       &tokenEndA,
+									Value:     "Hello",
+									ByteStart: 0,
+									ByteEnd:   4,
+									AgentID:   "lead",
 								},
 								{
-									Start:   &tokenStartB,
-									End:     &tokenEndB,
-									Value:   "echo",
-									AgentID: "__nd_bg__|lead",
+									Start:     &tokenStartB,
+									End:       &tokenEndB,
+									Value:     "echo",
+									ByteStart: 6,
+									ByteEnd:   9,
+									AgentID:   "__nd_bg__|lead",
 								},
 							},
 						},
@@ -608,9 +618,11 @@ var _ = Describe("MediaRetrievalController", func() {
 								AgentID: "lead",
 								Cue: []responses.LyricCue{
 									{
-										Start: tokenStartA,
-										End:   &tokenEndA,
-										Value: "Hello",
+										Start:     tokenStartA,
+										End:       &tokenEndA,
+										ByteStart: 0,
+										ByteEnd:   4,
+										Value:     "Hello",
 									},
 								},
 							},
@@ -622,9 +634,11 @@ var _ = Describe("MediaRetrievalController", func() {
 								AgentID: "__nd_bg__|lead",
 								Cue: []responses.LyricCue{
 									{
-										Start: tokenStartB,
-										End:   &tokenEndB,
-										Value: "echo",
+										Start:     tokenStartB,
+										End:       &tokenEndB,
+										ByteStart: 6,
+										ByteEnd:   9,
+										Value:     "echo",
 									},
 								},
 							},
@@ -633,6 +647,116 @@ var _ = Describe("MediaRetrievalController", func() {
 				},
 			})
 		})
+
+		It("should return required cue byte offsets for ambiguous and multibyte cue lines", func() {
+			r := newGetRequest("id=1&enhanced=true")
+
+			asciiLineStart := int64(0)
+			asciiLineEnd := int64(2400)
+			asciiCueStartA := int64(0)
+			asciiCueEndA := int64(300)
+			asciiCueStartB := int64(900)
+			asciiCueEndB := int64(1300)
+			asciiCueStartC := int64(1300)
+			asciiCueEndC := int64(1600)
+			asciiCueStartD := int64(1600)
+
+			utfLineStart := int64(2747)
+			utfLineEnd := int64(6214)
+			utfCueStartA := int64(2747)
+			utfCueEndA := int64(3018)
+			utfCueStartB := int64(3018)
+			utfCueEndB := int64(3179)
+			utfCueStartC := int64(3582)
+			utfCueEndC := int64(4100)
+			utfCueStartD := int64(4500)
+			utfCueEndD := int64(6214)
+
+			lyricsJSON, err := json.Marshal(model.LyricList{
+				{
+					Lang:   "eng",
+					Synced: true,
+					Line: []model.Line{
+						{
+							Start: &asciiLineStart,
+							End:   &asciiLineEnd,
+							Value: "Oh love love me tonight",
+							Cue: []model.Cue{
+								{Start: &asciiCueStartA, End: &asciiCueEndA, Value: "Oh", ByteStart: 0, ByteEnd: 1},
+								{Start: &asciiCueStartB, End: &asciiCueEndB, Value: "love", ByteStart: 8, ByteEnd: 11},
+								{Start: &asciiCueStartC, End: &asciiCueEndC, Value: "me", ByteStart: 13, ByteEnd: 14},
+								{Start: &asciiCueStartD, Value: "tonight", ByteStart: 16, ByteEnd: 22},
+							},
+						},
+						{
+							Start: &utfLineStart,
+							End:   &utfLineEnd,
+							Value: "눈을 뜬 순간",
+							Cue: []model.Cue{
+								{Start: &utfCueStartA, End: &utfCueEndA, Value: "눈", ByteStart: 0, ByteEnd: 2},
+								{Start: &utfCueStartB, End: &utfCueEndB, Value: "을", ByteStart: 3, ByteEnd: 5},
+								{Start: &utfCueStartC, End: &utfCueEndC, Value: "뜬", ByteStart: 7, ByteEnd: 9},
+								{Start: &utfCueStartD, End: &utfCueEndD, Value: "순간", ByteStart: 11, ByteEnd: 16},
+							},
+						},
+					},
+				},
+			})
+			Expect(err).ToNot(HaveOccurred())
+
+			mockRepo.SetData(model.MediaFiles{
+				{
+					ID:     "1",
+					Artist: "Rick Astley",
+					Title:  "Never Gonna Give You Up",
+					Lyrics: string(lyricsJSON),
+				},
+			})
+
+			response, err := router.GetLyricsBySongId(r)
+			Expect(err).ToNot(HaveOccurred())
+			compareResponses(response.LyricsList, responses.LyricsList{
+				StructuredLyrics: responses.StructuredLyrics{
+					{
+						DisplayArtist: "Rick Astley",
+						DisplayTitle:  "Never Gonna Give You Up",
+						Kind:          "main",
+						Lang:          "eng",
+						Synced:        true,
+						Line: []responses.Line{
+							{Start: &asciiLineStart, Value: "Oh love love me tonight"},
+							{Start: &utfLineStart, Value: "눈을 뜬 순간"},
+						},
+						CueLine: []responses.CueLine{
+							{
+								Index: 0,
+								Start: &asciiLineStart,
+								End:   &asciiLineEnd,
+								Value: "Oh love love me tonight",
+								Cue: []responses.LyricCue{
+									{Start: asciiCueStartA, End: &asciiCueEndA, Value: "Oh", ByteStart: 0, ByteEnd: 1},
+									{Start: asciiCueStartB, End: &asciiCueEndB, Value: "love", ByteStart: 8, ByteEnd: 11},
+									{Start: asciiCueStartC, End: &asciiCueEndC, Value: "me", ByteStart: 13, ByteEnd: 14},
+									{Start: asciiCueStartD, End: &asciiLineEnd, Value: "tonight", ByteStart: 16, ByteEnd: 22},
+								},
+							},
+							{
+								Index: 1,
+								Start: &utfLineStart,
+								End:   &utfLineEnd,
+								Value: "눈을 뜬 순간",
+								Cue: []responses.LyricCue{
+									{Start: utfCueStartA, End: &utfCueEndA, Value: "눈", ByteStart: 0, ByteEnd: 2},
+									{Start: utfCueStartB, End: &utfCueEndB, Value: "을", ByteStart: 3, ByteEnd: 5},
+									{Start: utfCueStartC, End: &utfCueEndC, Value: "뜬", ByteStart: 7, ByteEnd: 9},
+									{Start: utfCueStartD, End: &utfCueEndD, Value: "순간", ByteStart: 11, ByteEnd: 16},
+								},
+							},
+						},
+					},
+				},
+			})
+		})
 	})
 })
 
diff --git a/server/subsonic/responses/responses.go b/server/subsonic/responses/responses.go
index 344dd9999..d1ecdb307 100644
--- a/server/subsonic/responses/responses.go
+++ b/server/subsonic/responses/responses.go
@@ -538,9 +538,11 @@ type Line struct {
 }
 
 type LyricCue struct {
-	Start int64  `xml:"start,attr"           json:"start"`
-	End   *int64 `xml:"end,attr,omitempty"   json:"end,omitempty"`
-	Value string `xml:",chardata"            json:"value"`
+	Start     int64  `xml:"start,attr"           json:"start"`
+	End       *int64 `xml:"end,attr,omitempty"   json:"end,omitempty"`
+	ByteStart int    `xml:"byteStart,attr"       json:"byteStart"`
+	ByteEnd   int    `xml:"byteEnd,attr"         json:"byteEnd"`
+	Value     string `xml:",chardata"            json:"value"`
 }
 
 type Agent struct {
@@ -553,7 +555,7 @@ type CueLine struct {
 	Index   int32      `xml:"index,attr"                    json:"index"`
 	Start   *int64     `xml:"start,attr,omitempty"          json:"start,omitempty"`
 	End     *int64     `xml:"end,attr,omitempty"            json:"end,omitempty"`
-	Value   string     `xml:"value,attr,omitempty"          json:"value,omitempty"`
+	Value   string     `xml:"value,attr"                    json:"value"`
 	AgentID string     `xml:"agentId,attr,omitempty"        json:"agentId,omitempty"`
 	Cue     []LyricCue `xml:"cue,omitempty"                 json:"cue,omitempty"`
 }
diff --git a/ui/src/audioplayer/KaraokeLyricsOverlay.jsx b/ui/src/audioplayer/KaraokeLyricsOverlay.jsx
index cd1484e41..799f8bdc2 100644
--- a/ui/src/audioplayer/KaraokeLyricsOverlay.jsx
+++ b/ui/src/audioplayer/KaraokeLyricsOverlay.jsx
@@ -26,6 +26,7 @@ import {
   hasStructuredLyricContent,
   resolveKaraokeTokenWindow,
   resolveLayerLineForMain,
+  utf8ByteRangeToCodeUnitRange,
 } from './lyrics'
 
 const KARAOKE_RENDER_LEAD_MS = 24
@@ -635,6 +636,72 @@ const buildSegmentsFromLine = (line) => {
   }
 
   const text = line.value || ''
+  const exactSegments = (() => {
+    if (!text) {
+      return null
+    }
+
+    const rangedTokens = line.tokens
+      .map((token, tokenIndex) => ({
+        token,
+        tokenIndex,
+        range: utf8ByteRangeToCodeUnitRange(
+          text,
+          token?.byteStart,
+          token?.byteEnd,
+        ),
+      }))
+      .filter((entry) => entry.range != null)
+
+    if (
+      rangedTokens.length !== line.tokens.length ||
+      rangedTokens.length === 0
+    ) {
+      return null
+    }
+
+    rangedTokens.sort(
+      (a, b) =>
+        a.range.start - b.range.start ||
+        a.range.end - b.range.end ||
+        a.tokenIndex - b.tokenIndex,
+    )
+
+    const segments = []
+    let cursor = 0
+    for (const entry of rangedTokens) {
+      if (entry.range.start < cursor) {
+        return null
+      }
+      if (entry.range.start > cursor) {
+        segments.push({
+          text: text.slice(cursor, entry.range.start),
+          token: null,
+          tokenIndex: -1,
+        })
+      }
+      segments.push({
+        text: entry.range.text,
+        token: entry.token,
+        tokenIndex: entry.tokenIndex,
+      })
+      cursor = entry.range.end
+    }
+
+    if (cursor < text.length) {
+      segments.push({
+        text: text.slice(cursor),
+        token: null,
+        tokenIndex: -1,
+      })
+    }
+
+    return segments
+  })()
+  if (exactSegments) {
+    return exactSegments
+  }
+
   const matchedSegments = []
   const fallbackSegments = []
   let cursor = 0
diff --git a/ui/src/audioplayer/KaraokeLyricsOverlay.test.jsx b/ui/src/audioplayer/KaraokeLyricsOverlay.test.jsx
index 411116eae..412bc3946 100644
--- a/ui/src/audioplayer/KaraokeLyricsOverlay.test.jsx
+++ b/ui/src/audioplayer/KaraokeLyricsOverlay.test.jsx
@@ -177,6 +177,74 @@ describe('<KaraokeLyricsOverlay /> behavior', () => {
     expect(translationLine.querySelectorAll('span')).toHaveLength(1)
   })
 
+  it('uses cue byte offsets to segment repeated words in the karaoke line', () => {
+    renderOverlay({
+      mainLyric: {
+        kind: 'main',
+        lang: 'en',
+        synced: true,
+        line: [{ start: 0, end: 2400, value: 'Oh love love me tonight' }],
+        cueLine: [
+          {
+            index: 0,
+            start: 0,
+            end: 2400,
+            value: 'Oh love love me tonight',
+            cue: [
+              { start: 0, end: 300, value: 'Oh', byteStart: 0, byteEnd: 1 },
+              {
+                start: 900,
+                end: 1300,
+                value: 'love',
+                byteStart: 8,
+                byteEnd: 11,
+              },
+              {
+                start: 1300,
+                end: 1600,
+                value: 'me',
+                byteStart: 13,
+                byteEnd: 14,
+              },
+              {
+                start: 1600,
+                end: 2400,
+                value: 'tonight',
+                byteStart: 16,
+                byteEnd: 22,
+              },
+            ],
+          },
+        ],
+      },
+      translationLyric: null,
+      pronunciationLyric: null,
+      showTranslation: false,
+      showPronunciation: false,
+      translationEnabled: false,
+      pronunciationEnabled: false,
+      audioInstance: {
+        ...audioInstance,
+        currentTime: 1.0,
+      },
+    })
+
+    const mainLine = screen.getByText('Oh').parentElement
+    const segments = Array.from(mainLine.querySelectorAll('span')).map(
+      (span) => span.textContent,
+    )
+
+    expect(segments).toEqual([
+      'Oh',
+      ' love ',
+      'love',
+      ' ',
+      'me',
+      ' ',
+      'tonight',
+    ])
+  })
+
   it('highlights line-timed pronunciation and translation rows with the active main line', () => {
     renderOverlay({
       mainLyric: {
diff --git a/ui/src/audioplayer/lyrics.js b/ui/src/audioplayer/lyrics.js
index e9cd16d5a..6fa627ee5 100644
--- a/ui/src/audioplayer/lyrics.js
+++ b/ui/src/audioplayer/lyrics.js
@@ -19,6 +19,17 @@ const toTime = (value) => {
   return Number.isFinite(numeric) ? numeric : null
 }
 
+const toByteOffset = (value) => {
+  if (value == null || value === '') {
+    return null
+  }
+  const numeric = Number(value)
+  if (!Number.isInteger(numeric) || numeric < 0) {
+    return null
+  }
+  return numeric
+}
+
 const compareNullableTime = (a, b) => {
   if (a == null && b == null) {
     return 0
@@ -78,10 +89,79 @@ const normalizeToken = (token) => {
   if (!value.trim()) {
     return null
   }
+  const byteStart = toByteOffset(token.byteStart)
+  const byteEnd = toByteOffset(token.byteEnd)
   return {
     start: toTime(token.start),
     end: toTime(token.end),
     value,
+    ...(byteStart != null ? { byteStart } : {}),
+    ...(byteEnd != null ? { byteEnd } : {}),
+  }
+}
+
+const utf8BytesForCodePoint = (codePoint) => {
+  if (codePoint <= 0x7f) {
+    return 1
+  }
+  if (codePoint <= 0x7ff) {
+    return 2
+  }
+  if (codePoint <= 0xffff) {
+    return 3
+  }
+  return 4
+}
+
+export const utf8ByteOffsetToCodeUnitIndex = (text, targetByteOffset) => {
+  if (typeof text !== 'string' || text.length === 0) {
+    return 0
+  }
+
+  const target = toByteOffset(targetByteOffset)
+  if (target == null || target <= 0) {
+    return 0
+  }
+
+  let byteOffset = 0
+  let index = 0
+  while (index < text.length) {
+    if (byteOffset >= target) {
+      return index
+    }
+    const codePoint = text.codePointAt(index)
+    byteOffset += utf8BytesForCodePoint(codePoint)
+    index += codePoint > 0xffff ? 2 : 1
+  }
+
+  return text.length
+}
+
+export const utf8ByteRangeToCodeUnitRange = (text, byteStart, byteEnd) => {
+  if (typeof text !== 'string') {
+    return null
+  }
+
+  const start = toByteOffset(byteStart)
+  const end = toByteOffset(byteEnd)
+  if (start == null || end == null || end < start) {
+    return null
+  }
+
+  const startIndex = utf8ByteOffsetToCodeUnitIndex(text, start)
+  const endIndex = utf8ByteOffsetToCodeUnitIndex(text, end + 1)
+  if (
+    startIndex >= endIndex ||
+    startIndex > text.length ||
+    endIndex > text.length
+  ) {
+    return null
+  }
+
+  return {
+    start: startIndex,
+    end: endIndex,
+    text: text.slice(startIndex, endIndex),
   }
 }
 
diff --git a/ui/src/audioplayer/lyrics.test.js b/ui/src/audioplayer/lyrics.test.js
index 2fcf1df40..961fdb10b 100644
--- a/ui/src/audioplayer/lyrics.test.js
+++ b/ui/src/audioplayer/lyrics.test.js
@@ -13,6 +13,8 @@ import {
   selectLyricLayers,
   structuredLyricsToLrc,
   structuredLyricToLrc,
+  utf8ByteOffsetToCodeUnitIndex,
+  utf8ByteRangeToCodeUnitRange,
 } from './lyrics'
 
 describe('lyrics helpers', () => {
@@ -412,6 +414,60 @@ describe('lyrics helpers', () => {
     ])
   })
 
+  it('preserves cue byte offsets on karaoke tokens', () => {
+    const lines = buildKaraokeLines({
+      lang: 'eng',
+      synced: true,
+      line: [{ start: 0, end: 2400, value: 'Oh love love me tonight' }],
+      cueLine: [
+        {
+          index: 0,
+          start: 0,
+          end: 2400,
+          value: 'Oh love love me tonight',
+          cue: [
+            { start: 0, end: 300, value: 'Oh', byteStart: 0, byteEnd: 1 },
+            { start: 900, end: 1300, value: 'love', byteStart: 8, byteEnd: 11 },
+            { start: 1300, end: 1600, value: 'me', byteStart: 13, byteEnd: 14 },
+            {
+              start: 1600,
+              end: 2400,
+              value: 'tonight',
+              byteStart: 16,
+              byteEnd: 22,
+            },
+          ],
+        },
+      ],
+    })
+
+    expect(
+      lines[0].tokens.map((token) => [
+        token.value,
+        token.byteStart,
+        token.byteEnd,
+      ]),
+    ).toEqual([
+      ['Oh', 0, 1],
+      ['love', 8, 11],
+      ['me', 13, 14],
+      ['tonight', 16, 22],
+    ])
+  })
+
+  it('maps UTF-8 byte offsets to string ranges for multibyte lyrics', () => {
+    const text = '눈을 뜬 순간'
+
+    expect(utf8ByteOffsetToCodeUnitIndex(text, 0)).toBe(0)
+    expect(utf8ByteOffsetToCodeUnitIndex(text, 3)).toBe(1)
+    expect(utf8ByteOffsetToCodeUnitIndex(text, 7)).toBe(3)
+    expect(utf8ByteRangeToCodeUnitRange(text, 11, 16)).toEqual({
+      start: 5,
+      end: 7,
+      text: '순간',
+    })
+  })
+
   it('falls back to legacy cueLine role values when agents are absent', () => {
     const lines = buildKaraokeLines({
       lang: 'eng',
diff --git a/ui/src/subsonic/index.test.js b/ui/src/subsonic/index.test.js
index 6910fdc8d..6bd5e08ee 100644
--- a/ui/src/subsonic/index.test.js
+++ b/ui/src/subsonic/index.test.js
@@ -1,5 +1,4 @@
 import { vi } from 'vitest'
-import { COVER_ART_SIZE } from '../consts'
 import { httpClient } from '../dataProvider'
 import subsonic from './index'
 
@@ -7,6 +6,8 @@ vi.mock('../dataProvider', () => ({
   httpClient: vi.fn(() => Promise.resolve({})),
 }))
 
+const COVER_ART_SIZE = 600
+
 describe('getCoverArtUrl', () => {
   beforeEach(() => {
     // Mock window.location

From cc15a2f820244fe395b3a3d099489851cacf8e4e Mon Sep 17 00:00:00 2001
From: ranokay <github@ranokay.com>
Date: Tue, 14 Apr 2026 05:10:54 +0300
Subject: [PATCH 12/14] fix(lyrics): polish karaoke rendering and mobile layout

---
 core/lyrics/lyrics.go                         |  59 +++++++---
 core/lyrics/lyrics_test.go                    |  23 ++++
 core/lyrics/sources_test.go                   |   2 +-
 server/subsonic/media_retrieval.go            |  48 ++++++---
 server/subsonic/media_retrieval_test.go       |  11 +-
 ui/src/audioplayer/KaraokeLyricsOverlay.jsx   |  83 +++++++++++---
 .../audioplayer/KaraokeLyricsOverlay.test.jsx | 102 ++++++++++++++++++
 .../audioplayer/MobileKaraokeLyricsPortal.jsx |  65 +++++++++++
 .../MobileKaraokeLyricsPortal.test.jsx        |  55 ++++++++++
 ui/src/audioplayer/Player.jsx                 |  78 ++++++++++----
 ui/src/audioplayer/lyrics.js                  |   2 +-
 ui/src/audioplayer/lyrics.test.js             |  37 +++++++
 ui/src/audioplayer/styles.js                  |  18 ++++
 13 files changed, 512 insertions(+), 71 deletions(-)
 create mode 100644 ui/src/audioplayer/MobileKaraokeLyricsPortal.jsx
 create mode 100644 ui/src/audioplayer/MobileKaraokeLyricsPortal.test.jsx

diff --git a/core/lyrics/lyrics.go b/core/lyrics/lyrics.go
index 758053042..cc3d574b3 100644
--- a/core/lyrics/lyrics.go
+++ b/core/lyrics/lyrics.go
@@ -14,6 +14,12 @@ type Lyrics interface {
 	GetLyrics(ctx context.Context, mf *model.MediaFile) (model.LyricList, error)
 }
 
+// BatchLyrics can resolve lyrics across multiple candidate media files while
+// still honoring the configured source priority globally.
+type BatchLyrics interface {
+	GetLyricsForMediaFiles(ctx context.Context, mediaFiles []model.MediaFile) (model.LyricList, error)
+}
+
 // PluginLoader discovers and loads lyrics provider plugins.
 type PluginLoader interface {
 	LoadLyricsProvider(name string) (Lyrics, bool)
@@ -32,28 +38,53 @@ func NewLyrics(pluginLoader PluginLoader) Lyrics {
 // GetLyrics returns lyrics for the given media file, trying sources in the
 // order specified by conf.Server.LyricsPriority.
 func (l *lyricsService) GetLyrics(ctx context.Context, mf *model.MediaFile) (model.LyricList, error) {
-	var lyricsList model.LyricList
-	var err error
+	return l.getLyricsForCandidates(ctx, []*model.MediaFile{mf})
+}
 
+// GetLyricsForMediaFiles resolves lyrics across duplicate media files while
+// preserving the configured source priority across the full candidate set.
+func (l *lyricsService) GetLyricsForMediaFiles(ctx context.Context, mediaFiles []model.MediaFile) (model.LyricList, error) {
+	candidates := make([]*model.MediaFile, 0, len(mediaFiles))
+	for i := range mediaFiles {
+		candidates = append(candidates, &mediaFiles[i])
+	}
+	return l.getLyricsForCandidates(ctx, candidates)
+}
+
+func (l *lyricsService) getLyricsForCandidates(ctx context.Context, mediaFiles []*model.MediaFile) (model.LyricList, error) {
 	for pattern := range strings.SplitSeq(conf.Server.LyricsPriority, ",") {
 		pattern = strings.TrimSpace(pattern)
-		switch {
-		case strings.EqualFold(pattern, "embedded"):
-			lyricsList, err = fromEmbedded(ctx, mf)
-		case strings.HasPrefix(pattern, "."):
-			lyricsList, err = fromExternalFile(ctx, mf, strings.ToLower(pattern))
-		default:
-			lyricsList, err = l.fromPlugin(ctx, mf, pattern)
+		if pattern == "" {
+			continue
 		}
 
-		if err != nil {
-			log.Error(ctx, "error getting lyrics", "source", pattern, err)
-		}
+		for _, mf := range mediaFiles {
+			if mf == nil {
+				continue
+			}
 
-		if len(lyricsList) > 0 {
-			return lyricsList, nil
+			lyricsList, err := l.getLyricsFromSource(ctx, mf, pattern)
+			if err != nil {
+				log.Error(ctx, "error getting lyrics", "source", pattern, err)
+				continue
+			}
+
+			if len(lyricsList) > 0 {
+				return lyricsList, nil
+			}
 		}
 	}
 
 	return nil, nil
 }
+
+func (l *lyricsService) getLyricsFromSource(ctx context.Context, mf *model.MediaFile, pattern string) (model.LyricList, error) {
+	switch {
+	case strings.EqualFold(pattern, "embedded"):
+		return fromEmbedded(ctx, mf)
+	case strings.HasPrefix(pattern, "."):
+		return fromExternalFile(ctx, mf, strings.ToLower(pattern))
+	default:
+		return l.fromPlugin(ctx, mf, pattern)
+	}
+}
diff --git a/core/lyrics/lyrics_test.go b/core/lyrics/lyrics_test.go
index 822e975ce..26bdd5aa9 100644
--- a/core/lyrics/lyrics_test.go
+++ b/core/lyrics/lyrics_test.go
@@ -169,6 +169,29 @@ var _ = Describe("sources", func() {
 		Entry("txt > lrc > embedded", ".txt,.lrc,embedded", unsyncedLyrics),
 		Entry("ttml > elrc > lrc > srt > embedded", ".ttml,.elrc,.lrc,.srt,embedded", ttmlLyrics))
 
+	It("resolves source priority across duplicate media files", func() {
+		conf.Server.LyricsPriority = ".ttml,embedded"
+		embeddedJSON, err := json.Marshal(embeddedLyrics)
+		Expect(err).To(BeNil())
+
+		svc := lyrics.NewLyrics(nil)
+		batchSvc, ok := svc.(lyrics.BatchLyrics)
+		Expect(ok).To(BeTrue())
+
+		list, err := batchSvc.GetLyricsForMediaFiles(ctx, []model.MediaFile{
+			{
+				Lyrics: string(embeddedJSON),
+				Path:   "tests/fixtures/01 Invisible (RED) Edit Version.mp3",
+			},
+			{
+				Lyrics: "[]",
+				Path:   "tests/fixtures/test.mp3",
+			},
+		})
+		Expect(err).To(BeNil())
+		Expect(list).To(Equal(ttmlLyrics))
+	})
+
 	Context("Errors", func() {
 		var RegularUserContext = XContext
 		var isRegularUser = os.Getuid() != 0
diff --git a/core/lyrics/sources_test.go b/core/lyrics/sources_test.go
index 5ba03336e..1e98323ca 100644
--- a/core/lyrics/sources_test.go
+++ b/core/lyrics/sources_test.go
@@ -299,7 +299,7 @@ var _ = Describe("sources", func() {
 			Expect(lyrics[0].Line[0].Value).To(Equal("BOM test line"))
 		})
 
-		It("should handle UTF-16 LE encoded TTML files", func() {
+		It("should handle UTF-16 BE encoded TTML files", func() {
 			mf := model.MediaFile{Path: "tests/fixtures/bom-utf16-test.mp3"}
 			lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
 
diff --git a/server/subsonic/media_retrieval.go b/server/subsonic/media_retrieval.go
index 16d0d2666..c3c6d98ea 100644
--- a/server/subsonic/media_retrieval.go
+++ b/server/subsonic/media_retrieval.go
@@ -10,6 +10,7 @@ import (
 
 	"github.com/navidrome/navidrome/conf"
 	"github.com/navidrome/navidrome/consts"
+	lyricssvc "github.com/navidrome/navidrome/core/lyrics"
 	"github.com/navidrome/navidrome/log"
 	"github.com/navidrome/navidrome/model"
 	"github.com/navidrome/navidrome/resources"
@@ -19,6 +20,8 @@ import (
 	"github.com/navidrome/navidrome/utils/req"
 )
 
+const maxLegacyLyricsCandidates = 10
+
 func (api *Router) GetAvatar(w http.ResponseWriter, r *http.Request) (*responses.Subsonic, error) {
 	if !conf.Server.EnableGravatar {
 		return api.getPlaceHolderAvatar(w, r)
@@ -99,9 +102,9 @@ func (api *Router) GetLyrics(r *http.Request) (*responses.Subsonic, error) {
 	lyricsResponse := responses.Lyrics{}
 	response.Lyrics = &lyricsResponse
 	opts := filter.SongsByArtistTitleWithLyricsFirst(artist, title)
-	// Keep the search exhaustive so an older duplicate can still supply the
-	// matching sidecar lyrics when the newest candidate only has embedded data.
-	opts.Max = 0
+	// Search a bounded duplicate window so source-priority fallback can still
+	// reach older matches without turning legacy getLyrics into an unbounded scan.
+	opts.Max = maxLegacyLyricsCandidates
 	mediaFiles, err := api.ds.MediaFile(r.Context()).GetAll(opts)
 
 	if err != nil {
@@ -112,26 +115,37 @@ func (api *Router) GetLyrics(r *http.Request) (*responses.Subsonic, error) {
 		return response, nil
 	}
 
-	for i := range mediaFiles {
-		structuredLyrics, err := api.lyrics.GetLyrics(r.Context(), &mediaFiles[i])
+	var structuredLyrics model.LyricList
+	if batchLyrics, ok := api.lyrics.(lyricssvc.BatchLyrics); ok {
+		structuredLyrics, err = batchLyrics.GetLyricsForMediaFiles(r.Context(), mediaFiles)
 		if err != nil {
 			return nil, err
 		}
-		if len(structuredLyrics) == 0 {
-			continue
+	} else {
+		for i := range mediaFiles {
+			structuredLyrics, err = api.lyrics.GetLyrics(r.Context(), &mediaFiles[i])
+			if err != nil {
+				return nil, err
+			}
+			if len(structuredLyrics) > 0 {
+				break
+			}
 		}
-
-		lyricsResponse.Artist = artist
-		lyricsResponse.Title = title
-
-		var lyricsText strings.Builder
-		for _, line := range structuredLyrics[0].Line {
-			lyricsText.WriteString(line.Value + "\n")
-		}
-		lyricsResponse.Value = lyricsText.String()
-		break
 	}
 
+	if len(structuredLyrics) == 0 {
+		return response, nil
+	}
+
+	lyricsResponse.Artist = artist
+	lyricsResponse.Title = title
+
+	var lyricsText strings.Builder
+	for _, line := range structuredLyrics[0].Line {
+		lyricsText.WriteString(line.Value + "\n")
+	}
+	lyricsResponse.Value = lyricsText.String()
+
 	return response, nil
 }
 
diff --git a/server/subsonic/media_retrieval_test.go b/server/subsonic/media_retrieval_test.go
index faa90e375..d02d5b9bd 100644
--- a/server/subsonic/media_retrieval_test.go
+++ b/server/subsonic/media_retrieval_test.go
@@ -187,18 +187,22 @@ var _ = Describe("MediaRetrievalController", func() {
 			Expect(response.Lyrics.Value).To(Equal("We're no strangers to love\nYou know the rules and so do I\n"))
 		})
 
-		It("should continue searching candidates for sidecar lyrics", func() {
+		It("should prefer higher-priority sidecar lyrics across duplicate candidates", func() {
 			conf.Server.LyricsPriority = ".ttml,embedded"
 			r := newGetRequest("artist=Rick+Astley", "title=Never+Gonna+Give+You+Up")
 			baseTime := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC)
+			embedded, err := model.ToLyrics("eng", "Newest duplicate embedded lyrics")
+			Expect(err).ToNot(HaveOccurred())
+			embeddedJSON, err := json.Marshal(model.LyricList{*embedded})
+			Expect(err).ToNot(HaveOccurred())
 			mockRepo.SetData(model.MediaFiles{
 				{
 					ID:        "1",
 					Path:      "tests/fixtures/01 Invisible (RED) Edit Version.mp3",
 					Artist:    "Rick Astley",
 					Title:     "Never Gonna Give You Up",
-					Lyrics:    "[]",
-					UpdatedAt: baseTime.Add(2 * time.Hour), // Newer, but no TTML sidecar
+					Lyrics:    string(embeddedJSON),
+					UpdatedAt: baseTime.Add(2 * time.Hour), // Newer duplicate with embedded lyrics only
 				},
 				{
 					ID:        "2",
@@ -215,6 +219,7 @@ var _ = Describe("MediaRetrievalController", func() {
 			Expect(response.Lyrics.Artist).To(Equal("Rick Astley"))
 			Expect(response.Lyrics.Title).To(Equal("Never Gonna Give You Up"))
 			Expect(response.Lyrics.Value).To(Equal("We're no strangers to love\nYou know the rules and so do I\n"))
+			Expect(mockRepo.Options.Max).To(Equal(maxLegacyLyricsCandidates))
 		})
 	})
 
diff --git a/ui/src/audioplayer/KaraokeLyricsOverlay.jsx b/ui/src/audioplayer/KaraokeLyricsOverlay.jsx
index 799f8bdc2..c016df9e5 100644
--- a/ui/src/audioplayer/KaraokeLyricsOverlay.jsx
+++ b/ui/src/audioplayer/KaraokeLyricsOverlay.jsx
@@ -48,6 +48,7 @@ const KARAOKE_MAX_LINE_HEIGHT = 2.2
 const KARAOKE_LINE_HEIGHT_STEP = 0.02
 const KARAOKE_GROUP_SPACING_BASE_PX = 14
 const KARAOKE_AUX_LINE_HEIGHT = 1.2
+const KARAOKE_MAIN_INACTIVE_FONT_FACTOR = 0.8
 
 const TOKEN_DONE_ALPHA = 1
 const TOKEN_FUTURE_ALPHA = 0.34
@@ -160,6 +161,21 @@ const useStyles = makeStyles((theme) => ({
       maxHeight: '65vh',
     },
   },
+  overlayInline: {
+    position: 'absolute',
+    inset: 0,
+    width: '100%',
+    height: '100%',
+    minHeight: 0,
+    maxHeight: '100%',
+    transform: 'none',
+    borderRadius: 'inherit',
+    border: 'none',
+    boxShadow: 'none',
+    background: 'rgba(6, 8, 12, 0.92)',
+    backdropFilter: 'blur(12px)',
+    zIndex: 1,
+  },
   resizeHandle: {
     height: 14,
     cursor: 'ns-resize',
@@ -187,6 +203,10 @@ const useStyles = makeStyles((theme) => ({
     gap: theme.spacing(1),
     padding: theme.spacing(0.3, 1.3, 0.4, 1.3),
   },
+  headerInline: {
+    padding: theme.spacing(0.25, 0.65, 0.35, 0.65),
+    gap: theme.spacing(0.65),
+  },
   headerLeft: {
     display: 'flex',
     alignItems: 'center',
@@ -264,6 +284,8 @@ const useStyles = makeStyles((theme) => ({
   },
   inlineTr: {
     margin: 0,
+    display: 'inline-block',
+    maxWidth: '100%',
     textAlign: 'center',
     fontWeight: 400,
     lineHeight: KARAOKE_AUX_LINE_HEIGHT,
@@ -272,6 +294,14 @@ const useStyles = makeStyles((theme) => ({
   },
   inlinePr: {
     margin: 0,
+    display: 'inline-flex',
+    alignItems: 'center',
+    justifyContent: 'center',
+    flexWrap: 'wrap',
+    alignSelf: 'center',
+    width: 'fit-content',
+    maxWidth: '100%',
+    boxSizing: 'border-box',
     textAlign: 'center',
     fontWeight: 400,
     lineHeight: KARAOKE_AUX_LINE_HEIGHT,
@@ -300,6 +330,9 @@ const useStyles = makeStyles((theme) => ({
       padding: theme.spacing(0.35, 1.2, 1.2, 1.2),
     },
   },
+  bodyInline: {
+    padding: theme.spacing(0.25, 0.8, 0.85, 0.8),
+  },
   lines: {
     display: 'flex',
     flexDirection: 'column',
@@ -308,12 +341,14 @@ const useStyles = makeStyles((theme) => ({
   },
   line: {
     margin: 0,
+    display: 'inline-block',
+    maxWidth: '100%',
     fontWeight: 600,
     lineHeight: 1.24,
     letterSpacing: '0.01em',
     textAlign: 'center',
     color: 'rgba(255, 255, 255, 0.62)',
-    transition: `opacity ${KARAOKE_ANIMATION_MS}ms ease-in-out, color ${KARAOKE_ANIMATION_MS}ms ease-in-out, font-size 280ms ease-in-out`,
+    transition: `opacity ${KARAOKE_ANIMATION_MS}ms ease-in-out, color ${KARAOKE_ANIMATION_MS}ms ease-in-out, font-size 280ms ease-in-out, max-width 280ms ease-in-out`,
   },
   token: {
     display: 'inline-block',
@@ -858,7 +893,8 @@ const areLineStylesEqual = (prevStyle, nextStyle) => {
     a.color === b.color &&
     a.fontSize === b.fontSize &&
     a.fontWeight === b.fontWeight &&
-    a.lineHeight === b.lineHeight
+    a.lineHeight === b.lineHeight &&
+    a.maxWidth === b.maxWidth
   )
 }
 
@@ -1038,6 +1074,7 @@ const KaraokeLyricsOverlay = ({
   onTogglePronunciation,
   audioInstance,
   onClose,
+  inline = false,
 }) => {
   const classes = useStyles()
   const [playbackMs, setPlaybackMs] = useState(0)
@@ -1397,13 +1434,18 @@ const KaraokeLyricsOverlay = ({
     }
 
     const baseFontSize = lyricsSettings.main.fontSize
-    const fontSize = isActive ? baseFontSize : Math.round(baseFontSize * 0.8)
+    const fontSize = isActive
+      ? baseFontSize
+      : Math.round(baseFontSize * KARAOKE_MAIN_INACTIVE_FONT_FACTOR)
 
     return {
       opacity,
       color,
       fontSize,
       lineHeight,
+      maxWidth: isActive
+        ? '100%'
+        : `${Math.round(KARAOKE_MAIN_INACTIVE_FONT_FACTOR * 100)}%`,
     }
   }
 
@@ -1448,7 +1490,9 @@ const KaraokeLyricsOverlay = ({
     }
   }
 
-  const overlayStyle = isCompact
+  const overlayStyle = inline
+    ? undefined
+    : isCompact
     ? undefined
     : {
         height: overlayHeight,
@@ -1457,17 +1501,27 @@ const KaraokeLyricsOverlay = ({
 
   return (
     <div
-      className={classes.overlay}
+      className={clsx(classes.overlay, {
+        [classes.overlayInline]: inline,
+      })}
       data-testid="karaoke-lyrics-overlay"
+      data-inline={inline ? 'true' : 'false'}
       style={overlayStyle}
+      onClick={inline ? (event) => event.stopPropagation() : undefined}
     >
-      <div
-        className={classes.resizeHandle}
-        onMouseDown={onResizeStart}
-        data-testid="lyrics-resize-handle"
-      />
+      {!inline && (
+        <div
+          className={classes.resizeHandle}
+          onMouseDown={onResizeStart}
+          data-testid="lyrics-resize-handle"
+        />
+      )}
 
-      <div className={classes.header}>
+      <div
+        className={clsx(classes.header, {
+          [classes.headerInline]: inline,
+        })}
+      >
         <div className={classes.headerLeft}>
           <div className={classes.languageBadges}>
             {languageBadges.map((badge) => (
@@ -1536,7 +1590,12 @@ const KaraokeLyricsOverlay = ({
         </div>
       </div>
 
-      <div className={classes.body} ref={bodyRef}>
+      <div
+        className={clsx(classes.body, {
+          [classes.bodyInline]: inline,
+        })}
+        ref={bodyRef}
+      >
         <div className={classes.lines} style={{ gap: lineGap }}>
           <div aria-hidden style={{ height: centerSpacerPx }} />
           {mainLines.map((line, idx) => {
diff --git a/ui/src/audioplayer/KaraokeLyricsOverlay.test.jsx b/ui/src/audioplayer/KaraokeLyricsOverlay.test.jsx
index 412bc3946..2fccf693a 100644
--- a/ui/src/audioplayer/KaraokeLyricsOverlay.test.jsx
+++ b/ui/src/audioplayer/KaraokeLyricsOverlay.test.jsx
@@ -71,6 +71,16 @@ describe('<KaraokeLyricsOverlay /> behavior', () => {
     expect(await screen.findByText('Appearance')).toBeInTheDocument()
   })
 
+  it('renders inline mode without the desktop resize handle', () => {
+    renderOverlay({ inline: true })
+
+    expect(screen.getByTestId('karaoke-lyrics-overlay')).toHaveAttribute(
+      'data-inline',
+      'true',
+    )
+    expect(screen.queryByTestId('lyrics-resize-handle')).not.toBeInTheDocument()
+  })
+
   it('renders the appearance popup with Main label and default line height for older settings', async () => {
     localStorage.setItem(
       'karaoke-lyrics-settings',
@@ -245,6 +255,49 @@ describe('<KaraokeLyricsOverlay /> behavior', () => {
     ])
   })
 
+  it('uses cue byte offsets to preserve explicit space cues in multibyte karaoke lines', () => {
+    renderOverlay({
+      mainLyric: {
+        kind: 'main',
+        lang: 'ko',
+        synced: true,
+        line: [{ start: 0, end: 900, value: '눈을 뜬 순간' }],
+        cueLine: [
+          {
+            index: 0,
+            start: 0,
+            end: 900,
+            value: '눈을 뜬 순간',
+            cue: [
+              { start: 0, end: 150, value: '눈을', byteStart: 0, byteEnd: 5 },
+              { start: 150, end: 250, value: ' ', byteStart: 6, byteEnd: 6 },
+              { start: 250, end: 450, value: '뜬', byteStart: 7, byteEnd: 9 },
+              { start: 450, end: 550, value: ' ', byteStart: 10, byteEnd: 10 },
+              { start: 550, end: 900, value: '순간', byteStart: 11, byteEnd: 16 },
+            ],
+          },
+        ],
+      },
+      translationLyric: null,
+      pronunciationLyric: null,
+      showTranslation: false,
+      showPronunciation: false,
+      translationEnabled: false,
+      pronunciationEnabled: false,
+      audioInstance: {
+        ...audioInstance,
+        currentTime: 0.3,
+      },
+    })
+
+    const mainLine = screen.getByText('눈을').parentElement
+    const segments = Array.from(mainLine.querySelectorAll('span')).map(
+      (span) => span.textContent,
+    )
+
+    expect(segments).toEqual(['눈을', ' ', '뜬', ' ', '순간'])
+  })
+
   it('highlights line-timed pronunciation and translation rows with the active main line', () => {
     renderOverlay({
       mainLyric: {
@@ -295,6 +348,55 @@ describe('<KaraokeLyricsOverlay /> behavior', () => {
     )
   })
 
+  it('pre-wraps inactive main lines so the active line keeps the same wrap shape', () => {
+    renderOverlay({
+      mainLyric: {
+        kind: 'main',
+        lang: 'en',
+        synced: true,
+        line: [
+          { start: 1000, end: 1800, value: 'First line that is getting focus' },
+          { start: 2500, end: 3300, value: 'Second line waiting below' },
+        ],
+      },
+      translationLyric: null,
+      pronunciationLyric: null,
+      showTranslation: false,
+      showPronunciation: false,
+      translationEnabled: false,
+      pronunciationEnabled: false,
+      audioInstance: {
+        ...audioInstance,
+        currentTime: 1.2,
+      },
+    })
+
+    const activeLine = screen.getByText('First line that is getting focus')
+      .parentElement
+    const inactiveLine = screen.getByText('Second line waiting below')
+      .parentElement
+
+    expect(parseFloat(activeLine.style.fontSize)).toBeGreaterThan(
+      parseFloat(inactiveLine.style.fontSize),
+    )
+    expect(activeLine.style.maxWidth).toBe('100%')
+    expect(inactiveLine.style.maxWidth).toBe('80%')
+  })
+
+  it('centers pronunciation text inside the pill container', () => {
+    renderOverlay({
+      showTranslation: false,
+      showPronunciation: true,
+    })
+
+    const pronunciationLine = screen.getByText('konnichiwa').parentElement
+    const styles = window.getComputedStyle(pronunciationLine)
+
+    expect(styles.display).toBe('inline-flex')
+    expect(styles.justifyContent).toBe('center')
+    expect(styles.alignItems).toBe('center')
+  })
+
   it('renders untimed text lyrics in manual reading mode without a pinned active line', () => {
     renderOverlay({
       mainLyric: {
diff --git a/ui/src/audioplayer/MobileKaraokeLyricsPortal.jsx b/ui/src/audioplayer/MobileKaraokeLyricsPortal.jsx
new file mode 100644
index 000000000..636107184
--- /dev/null
+++ b/ui/src/audioplayer/MobileKaraokeLyricsPortal.jsx
@@ -0,0 +1,65 @@
+import React, { useEffect, useState } from 'react'
+import { createPortal } from 'react-dom'
+
+export const MOBILE_KARAOKE_LYRICS_HOST_SELECTOR =
+  '.react-jinke-music-player-mobile-cover'
+export const MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS = 'nd-mobile-lyrics-active'
+
+const resolveMobileLyricsHost = () => {
+  if (typeof document === 'undefined') {
+    return null
+  }
+  return document.querySelector(MOBILE_KARAOKE_LYRICS_HOST_SELECTOR)
+}
+
+const MobileKaraokeLyricsPortal = ({ active, children }) => {
+  const [host, setHost] = useState(() =>
+    active ? resolveMobileLyricsHost() : null,
+  )
+
+  useEffect(() => {
+    if (typeof document === 'undefined') {
+      setHost(null)
+      return undefined
+    }
+
+    if (!active) {
+      setHost(null)
+      return undefined
+    }
+
+    const syncHost = () => {
+      setHost(resolveMobileLyricsHost())
+    }
+
+    syncHost()
+
+    const observer = new MutationObserver(syncHost)
+    observer.observe(document.body, {
+      childList: true,
+      subtree: true,
+    })
+
+    return () => observer.disconnect()
+  }, [active])
+
+  useEffect(() => {
+    if (!host) {
+      return undefined
+    }
+
+    host.classList.toggle(MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS, active)
+
+    return () => {
+      host.classList.remove(MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS)
+    }
+  }, [active, host])
+
+  if (!active || !host) {
+    return null
+  }
+
+  return createPortal(children, host)
+}
+
+export default MobileKaraokeLyricsPortal
diff --git a/ui/src/audioplayer/MobileKaraokeLyricsPortal.test.jsx b/ui/src/audioplayer/MobileKaraokeLyricsPortal.test.jsx
new file mode 100644
index 000000000..8b237e184
--- /dev/null
+++ b/ui/src/audioplayer/MobileKaraokeLyricsPortal.test.jsx
@@ -0,0 +1,55 @@
+import React from 'react'
+import { cleanup, render, screen, waitFor } from '@testing-library/react'
+import MobileKaraokeLyricsPortal, {
+  MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS,
+} from './MobileKaraokeLyricsPortal'
+
+const HOST_CLASS = 'react-jinke-music-player-mobile-cover'
+
+describe('<MobileKaraokeLyricsPortal />', () => {
+  afterEach(() => {
+    cleanup()
+    document.body.innerHTML = ''
+  })
+
+  it('renders lyrics into the mobile cover host and toggles the active class', () => {
+    const host = document.createElement('div')
+    host.className = HOST_CLASS
+    document.body.appendChild(host)
+
+    const { rerender } = render(
+      <MobileKaraokeLyricsPortal active={true}>
+        <div data-testid="mobile-inline-lyrics">Lyrics</div>
+      </MobileKaraokeLyricsPortal>,
+    )
+
+    expect(host).toContainElement(screen.getByTestId('mobile-inline-lyrics'))
+    expect(host).toHaveClass(MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS)
+
+    rerender(
+      <MobileKaraokeLyricsPortal active={false}>
+        <div data-testid="mobile-inline-lyrics">Lyrics</div>
+      </MobileKaraokeLyricsPortal>,
+    )
+
+    expect(screen.queryByTestId('mobile-inline-lyrics')).not.toBeInTheDocument()
+    expect(host).not.toHaveClass(MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS)
+  })
+
+  it('attaches when the mobile cover host appears after mount', async () => {
+    render(
+      <MobileKaraokeLyricsPortal active={true}>
+        <div data-testid="mobile-inline-lyrics">Lyrics</div>
+      </MobileKaraokeLyricsPortal>,
+    )
+
+    const host = document.createElement('div')
+    host.className = HOST_CLASS
+    document.body.appendChild(host)
+
+    await waitFor(() =>
+      expect(host).toContainElement(screen.getByTestId('mobile-inline-lyrics')),
+    )
+    expect(host).toHaveClass(MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS)
+  })
+})
diff --git a/ui/src/audioplayer/Player.jsx b/ui/src/audioplayer/Player.jsx
index c6e73c916..9a60655cd 100644
--- a/ui/src/audioplayer/Player.jsx
+++ b/ui/src/audioplayer/Player.jsx
@@ -45,6 +45,7 @@ import {
   togglePronunciationPreference,
 } from './lyricsOverlayState'
 import KaraokeLyricsOverlay from './KaraokeLyricsOverlay'
+import MobileKaraokeLyricsPortal from './MobileKaraokeLyricsPortal'
 
 const emptyLyricLayers = {
   main: null,
@@ -172,6 +173,7 @@ const Player = () => {
       hasTranslationLyric,
       hasPronunciationLyric,
     })
+  const useInlineMobileLyrics = karaokeVisible && !isDesktop
 
   const applyLyricToRuntimePlayer = useCallback((trackId, lyric) => {
     if (!trackId) {
@@ -535,10 +537,13 @@ const Player = () => {
   )
 
   const onCoverClick = useCallback((mode, audioLists, audioInfo) => {
+    if (!isDesktop && karaokeVisible) {
+      return
+    }
     if (mode === 'full' && audioInfo?.song?.albumId) {
       window.location.href = `#/album/${audioInfo.song.albumId}/show`
     }
-  }, [])
+  }, [isDesktop, karaokeVisible])
 
   const onAudioError = useCallback(
     (error, currentPlayId, audioLists, audioInfo) => {
@@ -603,28 +608,55 @@ const Player = () => {
         onBeforeDestroy={onBeforeDestroy}
         getAudioInstance={setAudioInstance}
       />
-      <KaraokeLyricsOverlay
-        visible={karaokeVisible}
-        mainLyric={selectedLyricLayers.main}
-        translationLyric={selectedLyricLayers.translation}
-        pronunciationLyric={selectedLyricLayers.pronunciation}
-        showTranslation={showTranslation}
-        showPronunciation={showPronunciation}
-        translationEnabled={hasTranslationLyric}
-        pronunciationEnabled={hasPronunciationLyric}
-        onToggleTranslation={() =>
-          setTranslationPreference((previous) =>
-            hasTranslationLyric ? !previous : false,
-          )
-        }
-        onTogglePronunciation={() =>
-          setPronunciationPreference((previous) =>
-            togglePronunciationPreference(previous, hasPronunciationLyric),
-          )
-        }
-        audioInstance={audioInstance}
-        onClose={() => setKaraokeVisiblePreference(false)}
-      />
+      {isDesktop && (
+        <KaraokeLyricsOverlay
+          visible={karaokeVisible}
+          mainLyric={selectedLyricLayers.main}
+          translationLyric={selectedLyricLayers.translation}
+          pronunciationLyric={selectedLyricLayers.pronunciation}
+          showTranslation={showTranslation}
+          showPronunciation={showPronunciation}
+          translationEnabled={hasTranslationLyric}
+          pronunciationEnabled={hasPronunciationLyric}
+          onToggleTranslation={() =>
+            setTranslationPreference((previous) =>
+              hasTranslationLyric ? !previous : false,
+            )
+          }
+          onTogglePronunciation={() =>
+            setPronunciationPreference((previous) =>
+              togglePronunciationPreference(previous, hasPronunciationLyric),
+            )
+          }
+          audioInstance={audioInstance}
+          onClose={() => setKaraokeVisiblePreference(false)}
+        />
+      )}
+      <MobileKaraokeLyricsPortal active={useInlineMobileLyrics}>
+        <KaraokeLyricsOverlay
+          visible={useInlineMobileLyrics}
+          inline={true}
+          mainLyric={selectedLyricLayers.main}
+          translationLyric={selectedLyricLayers.translation}
+          pronunciationLyric={selectedLyricLayers.pronunciation}
+          showTranslation={showTranslation}
+          showPronunciation={showPronunciation}
+          translationEnabled={hasTranslationLyric}
+          pronunciationEnabled={hasPronunciationLyric}
+          onToggleTranslation={() =>
+            setTranslationPreference((previous) =>
+              hasTranslationLyric ? !previous : false,
+            )
+          }
+          onTogglePronunciation={() =>
+            setPronunciationPreference((previous) =>
+              togglePronunciationPreference(previous, hasPronunciationLyric),
+            )
+          }
+          audioInstance={audioInstance}
+          onClose={() => setKaraokeVisiblePreference(false)}
+        />
+      </MobileKaraokeLyricsPortal>
       <GlobalHotKeys handlers={handlers} keyMap={keyMap} allowChanges />
     </ThemeProvider>
   )
diff --git a/ui/src/audioplayer/lyrics.js b/ui/src/audioplayer/lyrics.js
index 6fa627ee5..ae49c89e5 100644
--- a/ui/src/audioplayer/lyrics.js
+++ b/ui/src/audioplayer/lyrics.js
@@ -86,7 +86,7 @@ const normalizeToken = (token) => {
     return null
   }
   const value = typeof token.value === 'string' ? token.value : ''
-  if (!value.trim()) {
+  if (value.length === 0) {
     return null
   }
   const byteStart = toByteOffset(token.byteStart)
diff --git a/ui/src/audioplayer/lyrics.test.js b/ui/src/audioplayer/lyrics.test.js
index 961fdb10b..ae5fb5a66 100644
--- a/ui/src/audioplayer/lyrics.test.js
+++ b/ui/src/audioplayer/lyrics.test.js
@@ -455,6 +455,43 @@ describe('lyrics helpers', () => {
     ])
   })
 
+  it('preserves whitespace-only cues for exact byte-range rendering', () => {
+    const lines = buildKaraokeLines({
+      lang: 'kor',
+      synced: true,
+      line: [{ start: 0, end: 900, value: '눈을 뜬 순간' }],
+      cueLine: [
+        {
+          index: 0,
+          start: 0,
+          end: 900,
+          value: '눈을 뜬 순간',
+          cue: [
+            { start: 0, end: 150, value: '눈을', byteStart: 0, byteEnd: 5 },
+            { start: 150, end: 250, value: ' ', byteStart: 6, byteEnd: 6 },
+            { start: 250, end: 450, value: '뜬', byteStart: 7, byteEnd: 9 },
+            { start: 450, end: 550, value: ' ', byteStart: 10, byteEnd: 10 },
+            { start: 550, end: 900, value: '순간', byteStart: 11, byteEnd: 16 },
+          ],
+        },
+      ],
+    })
+
+    expect(
+      lines[0].tokens.map((token) => [
+        token.value,
+        token.byteStart,
+        token.byteEnd,
+      ]),
+    ).toEqual([
+      ['눈을', 0, 5],
+      [' ', 6, 6],
+      ['뜬', 7, 9],
+      [' ', 10, 10],
+      ['순간', 11, 16],
+    ])
+  })
+
   it('maps UTF-8 byte offsets to string ranges for multibyte lyrics', () => {
     const text = '눈을 뜬 순간'
 
diff --git a/ui/src/audioplayer/styles.js b/ui/src/audioplayer/styles.js
index 30a14d4db..09ccb8fcf 100644
--- a/ui/src/audioplayer/styles.js
+++ b/ui/src/audioplayer/styles.js
@@ -62,12 +62,30 @@ const useStyle = makeStyles(
           // Fix cover display when image is not square
           aspectRatio: '1/1',
           display: 'flex',
+          position: 'relative',
+        },
+      '& .react-jinke-music-player-mobile .react-jinke-music-player-mobile-cover.nd-mobile-lyrics-active':
+        {
+          width: 'calc(100% - 40px)',
+          maxWidth: 'none',
+          height: 'clamp(280px, 42vh, 460px)',
+          aspectRatio: 'auto',
+          borderRadius: 24,
+          border: '1px solid rgba(255, 255, 255, 0.1)',
+          boxShadow: '0 18px 40px rgba(0, 0, 0, 0.32)',
+          background: 'rgba(6, 8, 12, 0.82)',
+          cursor: 'default',
         },
       '& .react-jinke-music-player-mobile .react-jinke-music-player-mobile-cover img.cover':
         {
           animationDuration: (props) => !props.enableCoverAnimation && '0s',
           objectFit: 'contain', // Fix cover display when image is not square
         },
+      '& .react-jinke-music-player-mobile .react-jinke-music-player-mobile-cover.nd-mobile-lyrics-active img.cover':
+        {
+          opacity: 0,
+          pointerEvents: 'none',
+        },
       // Hide old singer display
       '& .react-jinke-music-player-mobile .react-jinke-music-player-mobile-singer':
         {

From 45fac622859848641dfb979cd76a456e989aa133 Mon Sep 17 00:00:00 2001
From: ranokay <github@ranokay.com>
Date: Tue, 14 Apr 2026 16:17:38 +0300
Subject: [PATCH 13/14] refactor(lyrics): clean up karaoke parsing and edge
 cases

---
 core/lyrics/ttml.go                     |  20 ++--
 core/lyrics/ttml_test.go                |  30 ++++++
 server/subsonic/media_retrieval_test.go |  51 ++++++++++
 ui/src/audioplayer/lyrics.js            | 127 ++++++++++++------------
 ui/src/audioplayer/lyrics.test.js       |  63 ++++++++++++
 5 files changed, 218 insertions(+), 73 deletions(-)

diff --git a/core/lyrics/ttml.go b/core/lyrics/ttml.go
index 6e4ce9da3..576d2ca3d 100644
--- a/core/lyrics/ttml.go
+++ b/core/lyrics/ttml.go
@@ -664,7 +664,6 @@ func (p *ttmlParser) buildMetadataLyrics(kind string, langOrder []string, entrie
 }
 
 func (p *ttmlParser) finalizeLyrics(lyrics model.Lyrics) model.Lyrics {
-	lyrics.Line = model.NormalizeCueLines(lyrics.Line)
 	lyrics.Line, lyrics.Agents = p.resolveAgents(lyrics.Line)
 	return model.NormalizeLyrics(lyrics)
 }
@@ -674,14 +673,13 @@ func (p *ttmlParser) resolveAgents(lines []model.Line) ([]model.Line, []model.Ag
 		return lines, nil
 	}
 
-	normalized := model.NormalizeCueLines(lines)
 	usedOrder := make([]string, 0, 4)
 	usedSet := make(map[string]struct{}, 4)
 	sawEmptyCue := false
 
-	for i := range normalized {
-		for j := range normalized[i].Cue {
-			agentID := strings.TrimSpace(normalized[i].Cue[j].AgentID)
+	for i := range lines {
+		for j := range lines[i].Cue {
+			agentID := strings.TrimSpace(lines[i].Cue[j].AgentID)
 			if agentID == "" {
 				sawEmptyCue = true
 				continue
@@ -694,7 +692,7 @@ func (p *ttmlParser) resolveAgents(lines []model.Line) ([]model.Line, []model.Ag
 	}
 
 	if len(usedOrder) == 0 {
-		return normalized, nil
+		return lines, nil
 	}
 
 	mainID := ""
@@ -725,10 +723,10 @@ func (p *ttmlParser) resolveAgents(lines []model.Line) ([]model.Line, []model.Ag
 		usedOrder = append([]string{mainID}, usedOrder...)
 	}
 
-	for i := range normalized {
-		for j := range normalized[i].Cue {
-			if strings.TrimSpace(normalized[i].Cue[j].AgentID) == "" {
-				normalized[i].Cue[j].AgentID = mainID
+	for i := range lines {
+		for j := range lines[i].Cue {
+			if strings.TrimSpace(lines[i].Cue[j].AgentID) == "" {
+				lines[i].Cue[j].AgentID = mainID
 			}
 		}
 	}
@@ -747,7 +745,7 @@ func (p *ttmlParser) resolveAgents(lines []model.Line) ([]model.Line, []model.Ag
 		agents = append(agents, agent)
 	}
 
-	return normalized, agents
+	return lines, agents
 }
 
 func (p *ttmlParser) resolveCueAgentID(ctx ttmlTimingContext) string {
diff --git a/core/lyrics/ttml_test.go b/core/lyrics/ttml_test.go
index 5f9092e36..14676975d 100644
--- a/core/lyrics/ttml_test.go
+++ b/core/lyrics/ttml_test.go
@@ -215,6 +215,36 @@ var _ = Describe("parseTTML", func() {
 			Expect(list[0].Line[1].Cue).To(HaveLen(1))
 			Expect(list[0].Line[1].Cue[0].AgentID).To(Equal("lead__bg"))
 		})
+
+		It("should fill missing cue agent ids with the resolved main agent", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttm="http://www.w3.org/ns/ttml#metadata">
+  <head>
+    <metadata>
+      <ttm:agent xml:id="guest" type="person"><ttm:name>Guest Vocal</ttm:name></ttm:agent>
+    </metadata>
+  </head>
+  <body xml:lang="eng">
+    <div>
+      <p begin="1s" end="3s">
+        <span begin="1s" end="1.4s">Lead</span>
+        <span begin="2s" end="2.4s" ttm:agent="guest">Guest</span>
+      </p>
+    </div>
+  </body>
+</tt>`)
+
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(1))
+			Expect(list[0].Agents).To(Equal([]model.Agent{
+				{ID: "guest", Role: "main", Name: "Guest Vocal"},
+			}))
+			Expect(list[0].Line).To(HaveLen(1))
+			Expect(list[0].Line[0].Cue).To(HaveLen(2))
+			Expect(list[0].Line[0].Cue[0].AgentID).To(Equal("guest"))
+			Expect(list[0].Line[0].Cue[1].AgentID).To(Equal("guest"))
+		})
 	})
 
 	Describe("Ambiguous decimal timing", func() {
diff --git a/server/subsonic/media_retrieval_test.go b/server/subsonic/media_retrieval_test.go
index d02d5b9bd..aedf08ff7 100644
--- a/server/subsonic/media_retrieval_test.go
+++ b/server/subsonic/media_retrieval_test.go
@@ -653,6 +653,57 @@ var _ = Describe("MediaRetrievalController", func() {
 			})
 		})
 
+		It("should keep enhanced line-level lyrics when no cue data is available", func() {
+			r := newGetRequest("id=1&enhanced=true")
+
+			lineStart := int64(1000)
+			lineEnd := int64(3000)
+			lyricsJSON, err := json.Marshal(model.LyricList{
+				{
+					Kind:   "main",
+					Lang:   "eng",
+					Synced: true,
+					Line: []model.Line{
+						{
+							Start: &lineStart,
+							End:   &lineEnd,
+							Value: "Line without word timing",
+						},
+					},
+				},
+			})
+			Expect(err).ToNot(HaveOccurred())
+
+			mockRepo.SetData(model.MediaFiles{
+				{
+					ID:     "1",
+					Artist: "Rick Astley",
+					Title:  "Never Gonna Give You Up",
+					Lyrics: string(lyricsJSON),
+				},
+			})
+
+			response, err := router.GetLyricsBySongId(r)
+			Expect(err).ToNot(HaveOccurred())
+			compareResponses(response.LyricsList, responses.LyricsList{
+				StructuredLyrics: responses.StructuredLyrics{
+					{
+						DisplayArtist: "Rick Astley",
+						DisplayTitle:  "Never Gonna Give You Up",
+						Kind:          "main",
+						Lang:          "eng",
+						Synced:        true,
+						Line: []responses.Line{
+							{
+								Start: &lineStart,
+								Value: "Line without word timing",
+							},
+						},
+					},
+				},
+			})
+		})
+
 		It("should return required cue byte offsets for ambiguous and multibyte cue lines", func() {
 			r := newGetRequest("id=1&enhanced=true")
 
diff --git a/ui/src/audioplayer/lyrics.js b/ui/src/audioplayer/lyrics.js
index ae49c89e5..b44e4d9f0 100644
--- a/ui/src/audioplayer/lyrics.js
+++ b/ui/src/audioplayer/lyrics.js
@@ -1,6 +1,7 @@
 const normalizeLanguageTag = (language) =>
   (language || '').toLowerCase().replace('_', '-')
 
+// Just over one 60fps frame (~16.7ms); keeps line/token switching stable near tight boundaries.
 const KARAOKE_SWITCH_EPSILON_MS = 18
 const LYRIC_KIND_MAIN = 'main'
 const LYRIC_KIND_TRANSLATION = 'translation'
@@ -379,6 +380,68 @@ export const structuredLyricsToLrc = (structuredLyrics, preferredLanguage) => {
   return structuredLyricToLrc(selected)
 }
 
+// Fallback when no cue lines exist: one token-less karaoke line per base line.
+const buildBaseKaraokeLines = (baseLines) =>
+  baseLines.map((baseLine, index) => {
+    const start = toTime(baseLine.start)
+    const end = toTime(baseLine.end)
+    const value = typeof baseLine.value === 'string' ? baseLine.value : ''
+    return { index, start, end, value, tokens: [] }
+  })
+
+// Normalizes raw cue lines, stamps each token with its line's role/agent
+// fields, then merges cue lines sharing an index into one karaoke line.
+// Timing and text fall back from the first cue line to baseLines to tokens.
+export const buildKaraokeLinesFromCueLines = (
+  rawCueLines,
+  baseLines,
+  agentLookup,
+) => {
+  const grouped = new Map()
+  rawCueLines.forEach((rawCueLine, fallbackIndex) => {
+    const cueLine = normalizeCueLine(rawCueLine, fallbackIndex, agentLookup)
+    const { role, agentId, agentName, agentRole } = cueLine
+    const stamped = {
+      ...cueLine,
+      tokens: cueLine.tokens.map((token) => ({
+        ...token,
+        role,
+        agentId,
+        agentName,
+        agentRole,
+      })),
+    }
+    const bucket = grouped.get(stamped.index)
+    if (bucket) {
+      bucket.push(stamped)
+    } else {
+      grouped.set(stamped.index, [stamped])
+    }
+  })
+
+  // Map keeps insertion order: groups come out in first-seen index order.
+  return Array.from(grouped, ([index, group]) => {
+    const [first] = group
+    const baseLine = baseLines[index] || {}
+    const tokens = sortTokensByStart(group.flatMap((entry) => entry.tokens))
+    const firstTimed = tokens.find((token) => token.start != null)
+    const lastTimed = tokens.slice().reverse().find((token) => token.end != null)
+    const baseValue = typeof baseLine.value === 'string' ? baseLine.value : ''
+
+    return {
+      index,
+      start: first.start ?? toTime(baseLine.start) ?? firstTimed?.start ?? null,
+      end: first.end ?? toTime(baseLine.end) ?? lastTimed?.end ?? null,
+      value:
+        first.value || baseValue || tokens.map((token) => token.value).join(''),
+      agentId: first.agentId,
+      agentName: first.agentName,
+      agentRole: first.agentRole,
+      tokens,
+    }
+  })
+}
+
 export const buildKaraokeLines = (structuredLyric) => {
   if (!structuredLyric) {
     return []
@@ -394,68 +457,8 @@ export const buildKaraokeLines = (structuredLyric) => {
 
   const lines =
     rawCueLines.length > 0
-      ? (() => {
-          const normalizedCueLines = rawCueLines.map(
-            (cueLine, fallbackIndex) => {
-              const normalized = normalizeCueLine(
-                cueLine,
-                fallbackIndex,
-                agentLookup,
-              )
-              return {
-                ...normalized,
-                tokens: normalized.tokens.map((token) => ({
-                  ...token,
-                  role: normalized.role,
-                  agentId: normalized.agentId,
-                  agentName: normalized.agentName,
-                  agentRole: normalized.agentRole,
-                })),
-              }
-            },
-          )
-
-          const byIndex = new Map()
-          for (const cl of normalizedCueLines) {
-            if (!byIndex.has(cl.index)) {
-              byIndex.set(cl.index, [])
-            }
-            byIndex.get(cl.index).push(cl)
-          }
-
-          return Array.from(byIndex.entries()).map(([index, group]) => {
-            const first = group[0]
-            const baseLine = baseLines[index] || {}
-            const tokens = sortTokensByStart(group.flatMap((cl) => cl.tokens))
-            const fallbackStart =
-              tokens.find((token) => token.start != null)?.start ?? null
-            const fallbackEnd =
-              [...tokens].reverse().find((token) => token.end != null)?.end ??
-              null
-            const value =
-              first.value ||
-              (typeof baseLine.value === 'string' ? baseLine.value : '') ||
-              tokens.map((token) => token.value).join('')
-
-            return {
-              index,
-              start: first.start ?? toTime(baseLine.start) ?? fallbackStart,
-              end: first.end ?? toTime(baseLine.end) ?? fallbackEnd,
-              value,
-              agentId: first.agentId,
-              agentName: first.agentName,
-              agentRole: first.agentRole,
-              tokens,
-            }
-          })
-        })()
-      : baseLines.map((line, index) => ({
-          index,
-          start: toTime(line.start),
-          end: toTime(line.end),
-          value: typeof line.value === 'string' ? line.value : '',
-          tokens: [],
-        }))
+      ? buildKaraokeLinesFromCueLines(rawCueLines, baseLines, agentLookup)
+      : buildBaseKaraokeLines(baseLines)
 
   const normalized = lines
     .filter((line) => line.value || line.tokens.length > 0)
diff --git a/ui/src/audioplayer/lyrics.test.js b/ui/src/audioplayer/lyrics.test.js
index ae5fb5a66..1abea57a5 100644
--- a/ui/src/audioplayer/lyrics.test.js
+++ b/ui/src/audioplayer/lyrics.test.js
@@ -2,6 +2,7 @@ import {
   buildHighlightedAuxLine,
   buildHighlightedMainLine,
   buildKaraokeLines,
+  buildKaraokeLinesFromCueLines,
   findLayerLineIndexForMain,
   getActiveKaraokeState,
   getPreferredLyricLanguage,
@@ -414,6 +415,68 @@ describe('lyrics helpers', () => {
     ])
   })
 
+  it('builds grouped karaoke lines directly from cue lines', () => {
+    // Two cue lines share index 0 but are sung by different agents.
+    const baseLine = { start: 1000, end: 3000, value: 'Hello world' }
+    const leadAgent = { id: 'lead', role: 'main', name: 'Lead Vocal' }
+    const backingAgent = { id: 'backing', role: 'bg', name: '' }
+    const agentLookup = new Map([
+      [leadAgent.id, leadAgent],
+      [backingAgent.id, backingAgent],
+    ])
+    const cueLines = [
+      {
+        index: 0,
+        ...baseLine,
+        agentId: leadAgent.id,
+        cue: [{ start: 1000, end: 1500, value: 'Hello' }],
+      },
+      {
+        index: 0,
+        ...baseLine,
+        agentId: backingAgent.id,
+        cue: [{ start: 2000, end: 2500, value: 'world' }],
+      },
+    ]
+    const lines = buildKaraokeLinesFromCueLines(
+      cueLines,
+      [baseLine],
+      agentLookup,
+    )
+
+    // The merged line keeps the first cue line's agent; tokens keep their own.
+    const leadToken = {
+      start: 1000,
+      end: 1500,
+      value: 'Hello',
+      role: '',
+      agentId: 'lead',
+      agentName: 'Lead Vocal',
+      agentRole: 'main',
+    }
+    const backingToken = {
+      start: 2000,
+      end: 2500,
+      value: 'world',
+      role: 'bg',
+      agentId: 'backing',
+      agentName: '',
+      agentRole: 'bg',
+    }
+    expect(lines).toEqual([
+      {
+        index: 0,
+        start: 1000,
+        end: 3000,
+        value: 'Hello world',
+        agentId: 'lead',
+        agentName: 'Lead Vocal',
+        agentRole: 'main',
+        tokens: [leadToken, backingToken],
+      },
+    ])
+  })
+
   it('preserves cue byte offsets on karaoke tokens', () => {
     const lines = buildKaraokeLines({
       lang: 'eng',

From 7c6ecd0cf683d279201e79fd9e502c139bb78a17 Mon Sep 17 00:00:00 2001
From: ranokay <github@ranokay.com>
Date: Tue, 14 Apr 2026 23:17:31 +0300
Subject: [PATCH 14/14] refine karaoke lyrics overlay UI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Remove glow effect, keep gradient text highlight with soft wipe edge
- Theme-adaptive: use theme.palette.background.default (85% opacity) for overlay bg
- Light theme support: dark text, borders, badges, settings panel colors
- Auto-switch main lyrics color (white↔black) on theme change
- Add black color preset, default for light themes
- Merge TR/PR toggle buttons into language badges (clickable, with tooltips)
- Fade edges via CSS mask-image (theme-independent, no pseudo-elements)
- Rising effect (font-size scaling) for TR/PR lines matching main
- Smooth scroll: custom rAF ease-out cubic animation (400ms)
- Mobile: full-width panel, backdrop blur, transparent background
- Timing: KARAOKE_RENDER_LEAD_MS=80, KARAOKE_SWITCH_EPSILON_MS=50
- Hide TR/PR badges when no data available
- Badge/pill vertical centering with lineHeight:1
- Remove unused Button import, layerControls/layerToggle styles
---
 ui/src/audioplayer/KaraokeLyricsOverlay.jsx | 338 ++++++++++++--------
 ui/src/audioplayer/lyrics.js                |   2 +-
 ui/src/audioplayer/styles.js                |  10 +-
 3 files changed, 207 insertions(+), 143 deletions(-)

diff --git a/ui/src/audioplayer/KaraokeLyricsOverlay.jsx b/ui/src/audioplayer/KaraokeLyricsOverlay.jsx
index c016df9e5..aefb0127e 100644
--- a/ui/src/audioplayer/KaraokeLyricsOverlay.jsx
+++ b/ui/src/audioplayer/KaraokeLyricsOverlay.jsx
@@ -1,8 +1,7 @@
-import Button from '@material-ui/core/Button'
 import IconButton from '@material-ui/core/IconButton'
 import Popover from '@material-ui/core/Popover'
 import Slider from '@material-ui/core/Slider'
-import { makeStyles } from '@material-ui/core/styles'
+import { makeStyles, useTheme } from '@material-ui/core/styles'
 import Tooltip from '@material-ui/core/Tooltip'
 import Typography from '@material-ui/core/Typography'
 import CloseIcon from '@material-ui/icons/Close'
@@ -29,7 +28,7 @@ import {
   utf8ByteRangeToCodeUnitRange,
 } from './lyrics'
 
-const KARAOKE_RENDER_LEAD_MS = 24
+const KARAOKE_RENDER_LEAD_MS = 80
 const KARAOKE_CLOCK_DRIFT_RESET_MS = 140
 const KARAOKE_CLOCK_RESET_THRESHOLD_MS = 320
 const KARAOKE_MONOTONIC_JITTER_MS = 60
@@ -49,15 +48,17 @@ const KARAOKE_LINE_HEIGHT_STEP = 0.02
 const KARAOKE_GROUP_SPACING_BASE_PX = 14
 const KARAOKE_AUX_LINE_HEIGHT = 1.2
 const KARAOKE_MAIN_INACTIVE_FONT_FACTOR = 0.8
+const KARAOKE_AUX_INACTIVE_FONT_FACTOR = 0.88
 
 const TOKEN_DONE_ALPHA = 1
 const TOKEN_FUTURE_ALPHA = 0.34
 const TOKEN_ACTIVE_ALPHA = 1
+const TOKEN_WIPE_SOFT_SPREAD_PCT = 12
 const TOKEN_WIPE_EDGE_PCT = 8
-const TOKEN_WIPE_GLOW_PCT = 16
 
 const COLOR_PRESETS = [
   { key: 'white', label: 'White', value: 'rgba(255, 255, 255, 0.92)' },
+  { key: 'black', label: 'Black', value: 'rgba(0, 0, 0, 0.87)' },
   { key: 'blue', label: 'Blue', value: 'rgba(120, 160, 220, 0.75)' },
   { key: 'green', label: 'Green', value: 'rgba(100, 200, 130, 0.7)' },
   { key: 'pink', label: 'Pink', value: 'rgba(240, 140, 170, 0.75)' },
@@ -77,11 +78,11 @@ const DEFAULT_LYRICS_SETTINGS = {
 
 const SETTINGS_STORAGE_KEY = 'karaoke-lyrics-settings'
 
-const createDefaultLyricsSettings = () => ({
+const createDefaultLyricsSettings = (isDark = true) => ({
   lineHeight: KARAOKE_DEFAULT_LINE_HEIGHT,
   overlayHeight: KARAOKE_DEFAULT_HEIGHT_PX,
   tr: { ...DEFAULT_LYRICS_SETTINGS.tr },
-  main: { ...DEFAULT_LYRICS_SETTINGS.main },
+  main: { ...DEFAULT_LYRICS_SETTINGS.main, colorKey: isDark ? 'white' : 'black' },
   pr: { ...DEFAULT_LYRICS_SETTINGS.pr },
 })
 
@@ -135,7 +136,30 @@ const saveLyricsSettings = (settings) => {
 const getColorValue = (colorKey) =>
   COLOR_PRESETS.find((c) => c.key === colorKey)?.value || COLOR_PRESETS[0].value
 
-const useStyles = makeStyles((theme) => ({
+// Returns rgba(r, g, b, alpha) for '#rgb', '#rrggbb' or 'rgb()/rgba()' input; anything else (e.g. hsl()) yields dark gray 48/48/48.
+const hexToRgba = (hex, alpha) => {
+  const src = String(hex || '').trim().replace(/^#([0-9a-f])([0-9a-f])([0-9a-f])$/i, '#$1$1$2$2$3$3') // expand shorthand like '#fff'
+  const hexParts = src.match(/#([0-9a-f]{2})([0-9a-f]{2})([0-9a-f]{2})/i)
+  const parts = hexParts || src.match(/rgba?\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)/)
+  return parts ? `rgba(${parts.slice(1).map((c) => parseInt(c, hexParts ? 16 : 10)).join(', ')}, ${alpha})` : `rgba(48, 48, 48, ${alpha})`
+}
+
+const useStyles = makeStyles((theme) => {
+  const isDark = theme.palette.type === 'dark'
+  const overlayBg = hexToRgba(theme.palette.background.default, 0.85)
+  const primaryMain = theme.palette.primary.main
+  const primaryRgb = (() => {
+    const m = (primaryMain || '').match(/#([0-9a-f]{2})([0-9a-f]{2})([0-9a-f]{2})/i)
+    if (m) return [parseInt(m[1], 16), parseInt(m[2], 16), parseInt(m[3], 16)]
+    const rm = (primaryMain || '').match(/rgba?\((\d+),\s*(\d+),\s*(\d+)/)
+    if (rm) return [parseInt(rm[1]), parseInt(rm[2]), parseInt(rm[3])]
+    return [144, 202, 249]
+  })()
+  const textPrimary = isDark ? 'rgba(255, 255, 255, 0.92)' : 'rgba(0, 0, 0, 0.87)'
+  const textSecondary = isDark ? 'rgba(255, 255, 255, 0.55)' : 'rgba(0, 0, 0, 0.54)'
+  const borderSubtle = isDark ? 'rgba(255, 255, 255, 0.12)' : 'rgba(0, 0, 0, 0.12)'
+
+  return ({
   overlay: {
     position: 'fixed',
     left: '50%',
@@ -144,19 +168,19 @@ const useStyles = makeStyles((theme) => ({
     zIndex: 1400,
     width: 'min(1000px, calc(100vw - 32px))',
     minHeight: KARAOKE_MIN_HEIGHT_PX,
-    background: 'rgba(6, 8, 12, 0.9)',
+    background: overlayBg,
     borderRadius: 12,
-    border: '1px solid rgba(255, 255, 255, 0.12)',
+    border: `1px solid ${borderSubtle}`,
     boxShadow: '0 18px 48px rgba(0, 0, 0, 0.42)',
-    backdropFilter: 'blur(10px)',
-    color: theme.palette.common.white,
+    backdropFilter: 'blur(20px)',
+    color: textPrimary,
     display: 'flex',
     flexDirection: 'column',
     overflow: 'hidden',
     '@media (max-width:810px)': {
       bottom: 78,
       width: 'calc(100vw - 12px)',
-      borderRadius: 8,
+      borderRadius: 12,
       minHeight: 180,
       maxHeight: '65vh',
     },
@@ -172,8 +196,9 @@ const useStyles = makeStyles((theme) => ({
     borderRadius: 'inherit',
     border: 'none',
     boxShadow: 'none',
-    background: 'rgba(6, 8, 12, 0.92)',
-    backdropFilter: 'blur(12px)',
+    background: 'transparent',
+    backdropFilter: 'blur(16px)',
+    WebkitBackdropFilter: 'blur(16px)',
     zIndex: 1,
   },
   resizeHandle: {
@@ -190,7 +215,7 @@ const useStyles = makeStyles((theme) => ({
       width: 56,
       height: 3,
       borderRadius: 999,
-      background: 'rgba(255, 255, 255, 0.22)',
+      background: `rgba(${primaryRgb.join(', ')}, 0.22)`,
     },
     '@media (max-width:810px)': {
       display: 'none',
@@ -223,20 +248,31 @@ const useStyles = makeStyles((theme) => ({
   languageBadge: {
     display: 'inline-flex',
     alignItems: 'center',
+    justifyContent: 'center',
     gap: theme.spacing(0.35),
     padding: theme.spacing(0.2, 0.7),
     borderRadius: 999,
-    border: '1px solid rgba(148, 163, 184, 0.28)',
-    background: 'rgba(15, 23, 42, 0.42)',
-    color: 'rgba(226, 232, 240, 0.8)',
+    border: `1px solid ${borderSubtle}`,
+    background: isDark ? 'rgba(15, 23, 42, 0.42)' : 'rgba(0, 0, 0, 0.06)',
+    color: isDark ? 'rgba(226, 232, 240, 0.8)' : 'rgba(0, 0, 0, 0.6)',
     fontSize: 10,
+    lineHeight: 1,
     letterSpacing: '0.04em',
     whiteSpace: 'nowrap',
+    transition: `all ${KARAOKE_ANIMATION_MS}ms ease-in-out`,
+    userSelect: 'none',
+  },
+  languageBadgeToggle: {
+    cursor: 'pointer',
+    '&:hover': {
+      borderColor: `rgba(${primaryRgb.join(', ')}, 0.35)`,
+      background: isDark ? 'rgba(15, 23, 42, 0.56)' : 'rgba(0, 0, 0, 0.1)',
+    },
   },
   languageBadgeActive: {
-    borderColor: 'rgba(148, 163, 184, 0.46)',
-    background: 'rgba(30, 41, 59, 0.56)',
-    color: 'rgba(248, 250, 252, 0.94)',
+    borderColor: `rgba(${primaryRgb.join(', ')}, 0.46)`,
+    background: `rgba(${primaryRgb.join(', ')}, 0.18)`,
+    color: isDark ? 'rgba(248, 250, 252, 0.94)' : 'rgba(0, 0, 0, 0.87)',
   },
   languageBadgeLabel: {
     fontWeight: 700,
@@ -246,35 +282,8 @@ const useStyles = makeStyles((theme) => ({
   languageBadgeValue: {
     opacity: 0.9,
   },
-  layerControls: {
-    display: 'flex',
-    alignItems: 'center',
-    gap: theme.spacing(0.5),
-  },
-  layerToggle: {
-    minWidth: 34,
-    minHeight: 24,
-    padding: theme.spacing(0, 0.8),
-    fontSize: 10,
-    letterSpacing: '0.08em',
-    borderRadius: 999,
-    color: 'rgba(203, 213, 225, 0.95)',
-    background: 'rgba(100, 116, 139, 0.26)',
-    border: '1px solid rgba(148, 163, 184, 0.45)',
-    transition: `all ${KARAOKE_ANIMATION_MS}ms ease-in-out`,
-    '&.Mui-disabled': {
-      color: 'rgba(148, 163, 184, 0.45)',
-      borderColor: 'rgba(100, 116, 139, 0.3)',
-      background: 'rgba(71, 85, 105, 0.2)',
-    },
-  },
-  layerToggleActive: {
-    color: 'rgba(220, 252, 231, 0.98)',
-    borderColor: 'rgba(34, 197, 94, 0.96)',
-    background: 'rgba(34, 197, 94, 0.28)',
-  },
   closeButton: {
-    color: 'rgba(255, 255, 255, 0.72)',
+    color: textSecondary,
   },
   lineGroup: {
     display: 'flex',
@@ -290,7 +299,7 @@ const useStyles = makeStyles((theme) => ({
     fontWeight: 400,
     lineHeight: KARAOKE_AUX_LINE_HEIGHT,
     letterSpacing: '0.01em',
-    transition: `opacity ${KARAOKE_ANIMATION_MS}ms ease-in-out, font-size ${KARAOKE_ANIMATION_MS}ms ease-in-out`,
+    transition: `opacity ${KARAOKE_ANIMATION_MS}ms ease-in-out, color ${KARAOKE_ANIMATION_MS}ms ease-in-out, font-size 280ms ease-in-out, max-width 280ms ease-in-out`,
   },
   inlinePr: {
     margin: 0,
@@ -304,23 +313,29 @@ const useStyles = makeStyles((theme) => ({
     boxSizing: 'border-box',
     textAlign: 'center',
     fontWeight: 400,
-    lineHeight: KARAOKE_AUX_LINE_HEIGHT,
+    lineHeight: 1,
     letterSpacing: '0.01em',
-    transition: `opacity ${KARAOKE_ANIMATION_MS}ms ease-in-out, font-size ${KARAOKE_ANIMATION_MS}ms ease-in-out`,
+    transition: `opacity ${KARAOKE_ANIMATION_MS}ms ease-in-out, color ${KARAOKE_ANIMATION_MS}ms ease-in-out, font-size 280ms ease-in-out, max-width 280ms ease-in-out`,
     padding: theme.spacing(0.15, 0.9),
     borderRadius: 999,
-    background: 'rgba(255, 255, 255, 0.08)',
-    border: '1px solid rgba(255, 255, 255, 0.12)',
+    background: isDark ? 'rgba(255, 255, 255, 0.08)' : 'rgba(0, 0, 0, 0.05)',
+    border: `1px solid ${borderSubtle}`,
+  },
+  bodyWrapper: {
+    position: 'relative',
+    flex: 1,
+    overflow: 'hidden',
   },
   body: {
     padding: theme.spacing(0.5, 2, 1.4, 2),
     overflowY: 'auto',
     overflowX: 'hidden',
-    scrollBehavior: 'smooth',
-    flex: 1,
+    height: '100%',
     overscrollBehavior: 'contain',
     scrollbarWidth: 'none',
     msOverflowStyle: 'none',
+    maskImage: 'linear-gradient(to bottom, transparent 0%, black 8%, black 92%, transparent 100%)',
+    WebkitMaskImage: 'linear-gradient(to bottom, transparent 0%, black 8%, black 92%, transparent 100%)',
     '&::-webkit-scrollbar': {
       display: 'none',
       width: 0,
@@ -347,7 +362,7 @@ const useStyles = makeStyles((theme) => ({
     lineHeight: 1.24,
     letterSpacing: '0.01em',
     textAlign: 'center',
-    color: 'rgba(255, 255, 255, 0.62)',
+    color: isDark ? 'rgba(255, 255, 255, 0.62)' : 'rgba(0, 0, 0, 0.52)',
     transition: `opacity ${KARAOKE_ANIMATION_MS}ms ease-in-out, color ${KARAOKE_ANIMATION_MS}ms ease-in-out, font-size 280ms ease-in-out, max-width 280ms ease-in-out`,
   },
   token: {
@@ -356,15 +371,15 @@ const useStyles = makeStyles((theme) => ({
     transition: `color ${KARAOKE_ANIMATION_MS}ms ease-in-out, text-shadow ${KARAOKE_ANIMATION_MS}ms ease-in-out`,
   },
   settingsButton: {
-    color: 'rgba(255, 255, 255, 0.55)',
+    color: textSecondary,
     padding: 4,
     '&:hover': {
-      color: 'rgba(255, 255, 255, 0.85)',
+      color: textPrimary,
     },
   },
   settingsPanel: {
-    background: 'rgba(12, 14, 20, 0.96)',
-    border: '1px solid rgba(255, 255, 255, 0.12)',
+    background: isDark ? 'rgba(12, 14, 20, 0.96)' : 'rgba(255, 255, 255, 0.96)',
+    border: `1px solid ${borderSubtle}`,
     borderRadius: 10,
     padding: theme.spacing(1.5, 2),
     width: 278,
@@ -388,14 +403,14 @@ const useStyles = makeStyles((theme) => ({
     fontWeight: 700,
     letterSpacing: '0.08em',
     textTransform: 'uppercase',
-    color: 'rgba(255, 255, 255, 0.78)',
+    color: isDark ? 'rgba(255, 255, 255, 0.78)' : 'rgba(0, 0, 0, 0.72)',
   },
   settingsLabel: {
     fontSize: 10,
     fontWeight: 600,
     letterSpacing: '0.1em',
     textTransform: 'uppercase',
-    color: 'rgba(255, 255, 255, 0.55)',
+    color: isDark ? 'rgba(255, 255, 255, 0.55)' : 'rgba(0, 0, 0, 0.5)',
     marginBottom: 4,
   },
   settingsRow: {
@@ -405,7 +420,7 @@ const useStyles = makeStyles((theme) => ({
   },
   settingsSlider: {
     flex: 1,
-    color: 'rgba(255, 255, 255, 0.6)',
+    color: `rgba(${primaryRgb.join(', ')}, 0.6)`,
     '& .MuiSlider-thumb': {
       width: 12,
       height: 12,
@@ -416,7 +431,7 @@ const useStyles = makeStyles((theme) => ({
   },
   settingsSliderValue: {
     fontSize: 11,
-    color: 'rgba(255, 255, 255, 0.5)',
+    color: isDark ? 'rgba(255, 255, 255, 0.5)' : 'rgba(0, 0, 0, 0.45)',
     minWidth: 22,
     textAlign: 'right',
   },
@@ -424,15 +439,15 @@ const useStyles = makeStyles((theme) => ({
     fontSize: 10,
     letterSpacing: '0.06em',
     textTransform: 'uppercase',
-    color: 'rgba(255, 255, 255, 0.45)',
+    color: isDark ? 'rgba(255, 255, 255, 0.45)' : 'rgba(0, 0, 0, 0.42)',
     minWidth: 72,
     whiteSpace: 'nowrap',
   },
   resetButton: {
-    color: 'rgba(255, 255, 255, 0.58)',
+    color: textSecondary,
     padding: 4,
     '&:hover': {
-      color: 'rgba(255, 255, 255, 0.9)',
+      color: textPrimary,
     },
   },
   colorDots: {
@@ -452,9 +467,9 @@ const useStyles = makeStyles((theme) => ({
     },
   },
   colorDotActive: {
-    borderColor: 'rgba(255, 255, 255, 0.85)',
+    borderColor: isDark ? 'rgba(255, 255, 255, 0.85)' : 'rgba(0, 0, 0, 0.7)',
   },
-}))
+})})
 
 const clamp = (v, min, max) => Math.max(min, Math.min(max, v))
 const lerp = (from, to, t) => from + (to - from) * t
@@ -479,6 +494,8 @@ const buildLanguageBadges = ({
   pronunciationLyric,
   showTranslation,
   showPronunciation,
+  translationEnabled,
+  pronunciationEnabled,
 }) =>
   [
     {
@@ -486,20 +503,25 @@ const buildLanguageBadges = ({
       label: 'Main',
       lang: mainLyric?.lang,
       active: true,
+      toggleable: false,
     },
-    {
+    pronunciationEnabled && {
       key: 'pr',
       label: 'PR',
       lang: pronunciationLyric?.lang,
       active: showPronunciation,
+      toggleable: true,
+      tooltip: showPronunciation ? 'Hide pronunciation' : 'Show pronunciation',
     },
-    {
+    translationEnabled && {
       key: 'tr',
       label: 'TR',
       lang: translationLyric?.lang,
       active: showTranslation,
+      toggleable: true,
+      tooltip: showTranslation ? 'Hide translation' : 'Show translation',
     },
-  ].filter((badge) => badge.lang)
+  ].filter((badge) => badge && badge.lang)
 
 const SettingsSection = ({ label, layer, settings, onChange, classes }) => {
   const s = settings[layer]
@@ -913,22 +935,20 @@ const buildTokenWipeStyle = ({
   const fillPct = clamp(fillProgress, 0, 1) * 100
   const doneColor = `rgba(${r}, ${g}, ${b}, ${clamp(highlightAlpha, TOKEN_DONE_ALPHA, TOKEN_ACTIVE_ALPHA)})`
   const futureColor = `rgba(${r}, ${g}, ${b}, ${futureAlpha})`
-  const activeShadow = `0 0 8px rgba(${r}, ${g}, ${b}, 0.34)`
 
   if (fillPct <= 0) {
     return { color: futureColor, textShadow: 'none' }
   }
 
   const edgeStart = clamp(fillPct - TOKEN_WIPE_EDGE_PCT, 0, 100)
-  const glowStop = clamp(fillPct + TOKEN_WIPE_GLOW_PCT, 0, 100)
-  const glowColor = `rgba(${r}, ${g}, ${b}, ${clamp(highlightAlpha + 0.18, TOKEN_DONE_ALPHA, TOKEN_ACTIVE_ALPHA)})`
+  const softEnd = clamp(fillPct + TOKEN_WIPE_SOFT_SPREAD_PCT, 0, 100)
   return {
     color: 'transparent',
     WebkitTextFillColor: 'transparent',
-    backgroundImage: `linear-gradient(90deg, ${doneColor} 0%, ${doneColor} ${edgeStart}%, ${glowColor} ${fillPct}%, ${futureColor} ${glowStop}%, ${futureColor} 100%)`,
+    backgroundImage: `linear-gradient(90deg, ${doneColor} 0%, ${doneColor} ${edgeStart}%, ${doneColor} ${fillPct}%, ${futureColor} ${softEnd}%, ${futureColor} 100%)`,
     backgroundClip: 'text',
     WebkitBackgroundClip: 'text',
-    textShadow: activeShadow,
+    textShadow: 'none',
   }
 }
 
@@ -1077,6 +1097,8 @@ const KaraokeLyricsOverlay = ({
   inline = false,
 }) => {
   const classes = useStyles()
+  const theme = useTheme()
+  const isDark = theme.palette.type === 'dark'
   const [playbackMs, setPlaybackMs] = useState(0)
   const [maxHeightPx, setMaxHeightPx] = useState(getMaxHeightPx())
   const [bodyViewportHeight, setBodyViewportHeight] = useState(0)
@@ -1092,10 +1114,10 @@ const KaraokeLyricsOverlay = ({
   }, [])
 
   const handleResetAppearance = useCallback(() => {
-    const defaults = createDefaultLyricsSettings()
+    const defaults = createDefaultLyricsSettings(isDark)
     setLyricsSettings(defaults)
     saveLyricsSettings(defaults)
-  }, [])
+  }, [isDark])
 
   const bodyRef = useRef(null)
   const activeLineRef = useRef(null)
@@ -1127,6 +1149,23 @@ const KaraokeLyricsOverlay = ({
     return () => window.removeEventListener('resize', onResize)
   }, [])
 
+  useEffect(() => {
+    setLyricsSettings((prev) => {
+      const currentColor = prev.main.colorKey
+      const shouldSwap =
+        (isDark && currentColor === 'black') ||
+        (!isDark && currentColor === 'white')
+      if (!shouldSwap) return prev
+      const newColorKey = isDark ? 'white' : 'black'
+      const updated = {
+        ...prev,
+        main: { ...prev.main, colorKey: newColorKey },
+      }
+      saveLyricsSettings(updated)
+      return updated
+    })
+  }, [isDark])
+
   useEffect(() => {
     const body = bodyRef.current
     if (!body) {
@@ -1308,6 +1347,8 @@ const KaraokeLyricsOverlay = ({
     pronunciationLyric,
     showTranslation,
     showPronunciation,
+    translationEnabled,
+    pronunciationEnabled,
   })
 
   const trByMainIndex = useMemo(() => {
@@ -1354,7 +1395,10 @@ const KaraokeLyricsOverlay = ({
       return
     }
 
-    const rafId = window.requestAnimationFrame(() => {
+    let animFrameId = null
+    let scrollAnimId = null
+
+    animFrameId = window.requestAnimationFrame(() => {
       const body = bodyRef.current
       const activeNode = activeLineRef.current
       if (!body || !activeNode) {
@@ -1368,23 +1412,36 @@ const KaraokeLyricsOverlay = ({
         bodyRect.top -
         (body.clientHeight - activeRect.height) / 2
       const maxTop = Math.max(0, body.scrollHeight - body.clientHeight)
-      const centeredTop = clamp(body.scrollTop + deltaWithinBody, 0, maxTop)
+      const targetTop = clamp(body.scrollTop + deltaWithinBody, 0, maxTop)
+      const distance = targetTop - body.scrollTop
 
-      if (Math.abs(body.scrollTop - centeredTop) < 2) {
+      if (Math.abs(distance) < 2) {
         return
       }
 
-      if (typeof body.scrollTo === 'function') {
-        body.scrollTo({
-          top: centeredTop,
-          behavior: 'smooth',
-        })
-      } else {
-        body.scrollTop = centeredTop
+      const startTop = body.scrollTop
+      const duration = 400
+      const startTime = performance.now()
+
+      const easeOutCubic = (t) => 1 - Math.pow(1 - t, 3)
+
+      const step = (now) => {
+        const elapsed = now - startTime
+        const progress = Math.min(elapsed / duration, 1)
+        const eased = easeOutCubic(progress)
+        body.scrollTop = startTop + distance * eased
+        if (progress < 1) {
+          scrollAnimId = window.requestAnimationFrame(step)
+        }
       }
+
+      scrollAnimId = window.requestAnimationFrame(step)
     })
 
-    return () => window.cancelAnimationFrame(rafId)
+    return () => {
+      if (animFrameId) window.cancelAnimationFrame(animFrameId)
+      if (scrollAnimId) window.cancelAnimationFrame(scrollAnimId)
+    }
   }, [
     centerSpacerPx,
     hasTimedMainLines,
@@ -1453,10 +1510,11 @@ const KaraokeLyricsOverlay = ({
     const [r, g, b] = parseColorRGB(
       getColorValue(lyricsSettings[layerKey].colorKey),
     )
+    const baseFontSize = lyricsSettings[layerKey].fontSize
     if (!hasTimedMainLines) {
       return {
         opacity: 0.94,
-        fontSize: lyricsSettings[layerKey].fontSize,
+        fontSize: baseFontSize,
         color: `rgba(${r}, ${g}, ${b}, 0.94)`,
         lineHeight: KARAOKE_AUX_LINE_HEIGHT,
       }
@@ -1482,11 +1540,18 @@ const KaraokeLyricsOverlay = ({
       opacity = Math.max(0.22, 0.5 - level * 0.08)
     }
 
+    const fontSize = isActive
+      ? baseFontSize
+      : Math.round(baseFontSize * KARAOKE_AUX_INACTIVE_FONT_FACTOR)
+
     return {
       opacity,
-      fontSize: lyricsSettings[layerKey].fontSize,
+      fontSize,
       color,
       lineHeight: KARAOKE_AUX_LINE_HEIGHT,
+      maxWidth: isActive
+        ? '100%'
+        : `${Math.round(KARAOKE_AUX_INACTIVE_FONT_FACTOR * 100)}%`,
     }
   }
 
@@ -1524,52 +1589,49 @@ const KaraokeLyricsOverlay = ({
       >
         <div className={classes.headerLeft}>
           <div className={classes.languageBadges}>
-            {languageBadges.map((badge) => (
-              <div
-                key={badge.key}
-                className={clsx(classes.languageBadge, {
-                  [classes.languageBadgeActive]: badge.active,
-                })}
-                data-testid={`lyrics-language-badge-${badge.key}`}
-              >
-                <span className={classes.languageBadgeLabel}>
-                  {badge.label}
-                </span>
-                <span className={classes.languageBadgeValue}>{badge.lang}</span>
-              </div>
-            ))}
-          </div>
-          <div className={classes.layerControls}>
-            <Tooltip title="Toggle translations">
-              <span>
-                <Button
-                  size="small"
-                  onClick={onToggleTranslation}
-                  disabled={!translationEnabled}
-                  className={clsx(classes.layerToggle, {
-                    [classes.layerToggleActive]: showTranslation,
+            {languageBadges.map((badge) => {
+              const badgeEl = (
+                <div
+                  key={badge.key}
+                  className={clsx(classes.languageBadge, {
+                    [classes.languageBadgeActive]: badge.active,
+                    [classes.languageBadgeToggle]: badge.toggleable,
                   })}
-                  data-testid="lyrics-toggle-translation"
+                  data-testid={`lyrics-language-badge-${badge.key}`}
+                  role={badge.toggleable ? 'button' : undefined}
+                  tabIndex={badge.toggleable ? 0 : undefined}
+                  onClick={
+                    badge.toggleable
+                      ? badge.key === 'tr'
+                        ? onToggleTranslation
+                        : onTogglePronunciation
+                      : undefined
+                  }
+                  onKeyDown={
+                    badge.toggleable
+                      ? (e) => {
+                          if (e.key === 'Enter' || e.key === ' ') {
+                            e.preventDefault()
+                            ;(badge.key === 'tr'
+                              ? onToggleTranslation
+                              : onTogglePronunciation)()
+                          }
+                        }
+                      : undefined
+                  }
                 >
-                  TR
-                </Button>
-              </span>
-            </Tooltip>
-            <Tooltip title="Toggle pronunciations">
-              <span>
-                <Button
-                  size="small"
-                  onClick={onTogglePronunciation}
-                  disabled={!pronunciationEnabled}
-                  className={clsx(classes.layerToggle, {
-                    [classes.layerToggleActive]: showPronunciation,
-                  })}
-                  data-testid="lyrics-toggle-pronunciation"
-                >
-                  PR
-                </Button>
-              </span>
-            </Tooltip>
+                  <span className={classes.languageBadgeLabel}>
+                    {badge.label}
+                  </span>
+                  <span className={classes.languageBadgeValue}>{badge.lang}</span>
+                </div>
+              )
+              return badge.toggleable ? (
+                <Tooltip key={badge.key} title={badge.tooltip}>
+                  {badgeEl}
+                </Tooltip>
+              ) : badgeEl
+            })}
           </div>
         </div>
 
@@ -1590,6 +1652,7 @@ const KaraokeLyricsOverlay = ({
         </div>
       </div>
 
+      <div className={classes.bodyWrapper}>
       <div
         className={clsx(classes.body, {
           [classes.bodyInline]: inline,
@@ -1674,6 +1737,7 @@ const KaraokeLyricsOverlay = ({
           <div aria-hidden style={{ height: centerSpacerPx }} />
         </div>
       </div>
+      </div>
     </div>
   )
 }
diff --git a/ui/src/audioplayer/lyrics.js b/ui/src/audioplayer/lyrics.js
index b44e4d9f0..98c638ab3 100644
--- a/ui/src/audioplayer/lyrics.js
+++ b/ui/src/audioplayer/lyrics.js
@@ -2,7 +2,7 @@ const normalizeLanguageTag = (language) =>
   (language || '').toLowerCase().replace('_', '-')
 
-// Roughly one 60fps frame; keeps line/token switching stable near tight boundaries.
+// About three 60fps frames; keeps line/token switching stable near tight boundaries.
-const KARAOKE_SWITCH_EPSILON_MS = 18
+const KARAOKE_SWITCH_EPSILON_MS = 50
 const LYRIC_KIND_MAIN = 'main'
 const LYRIC_KIND_TRANSLATION = 'translation'
 const LYRIC_KIND_PRONUNCIATION = 'pronunciation'
diff --git a/ui/src/audioplayer/styles.js b/ui/src/audioplayer/styles.js
index 09ccb8fcf..30ccf7afb 100644
--- a/ui/src/audioplayer/styles.js
+++ b/ui/src/audioplayer/styles.js
@@ -66,14 +66,14 @@ const useStyle = makeStyles(
         },
       '& .react-jinke-music-player-mobile .react-jinke-music-player-mobile-cover.nd-mobile-lyrics-active':
         {
-          width: 'calc(100% - 40px)',
+          width: '100%',
           maxWidth: 'none',
           height: 'clamp(280px, 42vh, 460px)',
           aspectRatio: 'auto',
-          borderRadius: 24,
-          border: '1px solid rgba(255, 255, 255, 0.1)',
-          boxShadow: '0 18px 40px rgba(0, 0, 0, 0.32)',
-          background: 'rgba(6, 8, 12, 0.82)',
+          borderRadius: 12,
+          border: 'none',
+          boxShadow: 'none',
+          background: 'transparent',
           cursor: 'default',
         },
       '& .react-jinke-music-player-mobile .react-jinke-music-player-mobile-cover img.cover':