diff --git a/README.md b/README.md
index 0ae5bdfaf..645f1580d 100644
--- a/README.md
+++ b/README.md
@@ -52,6 +52,7 @@ A share of the revenue helps fund the development of Navidrome at no additional
  - **Multi-platform**, runs on macOS, Linux and Windows. **Docker** images are also provided
  - Ready to use binaries for all major platforms, including **Raspberry Pi**
  - Automatically **monitors your library** for changes, importing new files and reloading new metadata 
+ - Supports lyrics from sidecar **.ttml**, **.elrc**, **.lrc**, **.srt**, **.txt** files and embedded tags (via `lyricspriority`)
  - **Themeable**, modern and responsive **Web interface** based on [Material UI](https://material-ui.com)
  - **Compatible** with all Subsonic/Madsonic/Airsonic [clients](https://www.navidrome.org/docs/overview/#apps)
  - **Transcoding** on the fly. Can be set per user/player. **Opus encoding is supported**
diff --git a/conf/configuration.go b/conf/configuration.go
index 916efe70b..9bd79d781 100644
--- a/conf/configuration.go
+++ b/conf/configuration.go
@@ -763,7 +763,7 @@ func setViperDefaults() {
 	viper.SetDefault("artistartpriority", "artist.*, album/artist.*, external")
 	viper.SetDefault("artistimagefolder", "")
 	viper.SetDefault("discartpriority", "disc*.*, cd*.*, cover.*, folder.*, front.*, discsubtitle, embedded")
-	viper.SetDefault("lyricspriority", ".lrc,.txt,embedded")
+	viper.SetDefault("lyricspriority", ".ttml,.elrc,.lrc,.srt,.txt,embedded")
 	viper.SetDefault("enablegravatar", false)
 	viper.SetDefault("enablefavourites", true)
 	viper.SetDefault("enablestarrating", true)
diff --git a/core/lyrics/lyrics.go b/core/lyrics/lyrics.go
index 758053042..cc3d574b3 100644
--- a/core/lyrics/lyrics.go
+++ b/core/lyrics/lyrics.go
@@ -14,6 +14,12 @@ type Lyrics interface {
 	GetLyrics(ctx context.Context, mf *model.MediaFile) (model.LyricList, error)
 }
 
+// BatchLyrics can resolve lyrics across multiple candidate media files while
+// still honoring the configured source priority globally.
+type BatchLyrics interface {
+	GetLyricsForMediaFiles(ctx context.Context, mediaFiles []model.MediaFile) (model.LyricList, error)
+}
+
 // PluginLoader discovers and loads lyrics provider plugins.
 type PluginLoader interface {
 	LoadLyricsProvider(name string) (Lyrics, bool)
@@ -32,28 +38,53 @@ func NewLyrics(pluginLoader PluginLoader) Lyrics {
 // GetLyrics returns lyrics for the given media file, trying sources in the
 // order specified by conf.Server.LyricsPriority.
 func (l *lyricsService) GetLyrics(ctx context.Context, mf *model.MediaFile) (model.LyricList, error) {
-	var lyricsList model.LyricList
-	var err error
+	return l.getLyricsForCandidates(ctx, []*model.MediaFile{mf})
+}
 
+// GetLyricsForMediaFiles looks up lyrics for a group of media files, applying
+// each configured source to every file before moving on to the next source.
+func (l *lyricsService) GetLyricsForMediaFiles(ctx context.Context, mediaFiles []model.MediaFile) (model.LyricList, error) {
+	candidates := make([]*model.MediaFile, len(mediaFiles))
+	for i := range mediaFiles {
+		candidates[i] = &mediaFiles[i]
+	}
+	return l.getLyricsForCandidates(ctx, candidates)
+}
+
+func (l *lyricsService) getLyricsForCandidates(ctx context.Context, mediaFiles []*model.MediaFile) (model.LyricList, error) {
 	for pattern := range strings.SplitSeq(conf.Server.LyricsPriority, ",") {
 		pattern = strings.TrimSpace(pattern)
-		switch {
-		case strings.EqualFold(pattern, "embedded"):
-			lyricsList, err = fromEmbedded(ctx, mf)
-		case strings.HasPrefix(pattern, "."):
-			lyricsList, err = fromExternalFile(ctx, mf, strings.ToLower(pattern))
-		default:
-			lyricsList, err = l.fromPlugin(ctx, mf, pattern)
+		if pattern == "" {
+			continue
 		}
 
-		if err != nil {
-			log.Error(ctx, "error getting lyrics", "source", pattern, err)
-		}
+		for _, mf := range mediaFiles {
+			if mf == nil {
+				continue
+			}
 
-		if len(lyricsList) > 0 {
-			return lyricsList, nil
+			lyricsList, err := l.getLyricsFromSource(ctx, mf, pattern)
+			if err != nil {
+				log.Error(ctx, "error getting lyrics", "source", pattern, err)
+				continue
+			}
+
+			if len(lyricsList) > 0 {
+				return lyricsList, nil
+			}
 		}
 	}
 
 	return nil, nil
 }
+
+// getLyricsFromSource resolves one priority entry: "embedded", a ".ext" sidecar suffix, or a plugin name.
+func (l *lyricsService) getLyricsFromSource(ctx context.Context, mf *model.MediaFile, pattern string) (model.LyricList, error) {
+	if strings.EqualFold(pattern, "embedded") {
+		return fromEmbedded(ctx, mf)
+	}
+	if strings.HasPrefix(pattern, ".") {
+		return fromExternalFile(ctx, mf, strings.ToLower(pattern))
+	}
+	return l.fromPlugin(ctx, mf, pattern)
+}
diff --git a/core/lyrics/lyrics_test.go b/core/lyrics/lyrics_test.go
index 7e837782e..8fbaec6c5 100644
--- a/core/lyrics/lyrics_test.go
+++ b/core/lyrics/lyrics_test.go
@@ -45,6 +45,71 @@ var _ = Describe("sources", func() {
 		},
 	}
 
+	elrcLyrics := model.LyricList{
+		model.Lyrics{
+			DisplayArtist: "ELRC Artist",
+			DisplayTitle:  "ELRC Song",
+			Lang:          "eng",
+			Line: []model.Line{
+				{
+					Start: gg.P(int64(1000)),
+					End:   gg.P(int64(3000)),
+					Value: "Lead words",
+					Cue: []model.Cue{
+						{
+							Start:     gg.P(int64(1000)),
+							End:       gg.P(int64(1500)),
+							Value:     "Lead ",
+							ByteStart: 0,
+							ByteEnd:   4,
+						},
+						{
+							Start:     gg.P(int64(1500)),
+							End:       gg.P(int64(3000)),
+							Value:     "words",
+							ByteStart: 5,
+							ByteEnd:   9,
+						},
+					},
+				},
+				{
+					Start: gg.P(int64(3000)),
+					Value: "Fallback line",
+				},
+			},
+			Synced: true,
+		},
+	}
+
+	ttmlLyrics := model.LyricList{
+		model.Lyrics{
+			Kind: "main",
+			Lang: "eng",
+			Line: []model.Line{
+				{
+					Start: gg.P(int64(18800)),
+					Value: "We're no strangers to love",
+				},
+				{
+					Start: gg.P(int64(22800)),
+					Value: "You know the rules and so do I",
+				},
+			},
+			Synced: true,
+		},
+		model.Lyrics{
+			Kind: "main",
+			Lang: "por",
+			Line: []model.Line{
+				{
+					Start: gg.P(int64(18800)),
+					Value: "Nao somos estranhos ao amor",
+				},
+			},
+			Synced: true,
+		},
+	}
+
 	unsyncedLyrics := model.LyricList{
 		model.Lyrics{
 			Lang: "xxx",
@@ -60,6 +125,25 @@ var _ = Describe("sources", func() {
 		},
 	}
 
+	srtLyrics := model.LyricList{
+		model.Lyrics{
+			Lang: "xxx",
+			Line: []model.Line{
+				{
+					Start: gg.P(int64(18800)),
+					End:   gg.P(int64(22800)),
+					Value: "We're from subtitles",
+				},
+				{
+					Start: gg.P(int64(22801)),
+					End:   gg.P(int64(26000)),
+					Value: "Another subtitle line",
+				},
+			},
+			Synced: true,
+		},
+	}
+
 	BeforeEach(func() {
 		DeferCleanup(configtest.SetupConfig())
 
@@ -81,7 +165,33 @@ var _ = Describe("sources", func() {
 	},
 		Entry("embedded > lrc > txt", "embedded,.lrc,.txt", embeddedLyrics),
 		Entry("lrc > embedded > txt", ".lrc,embedded,.txt", syncedLyrics),
-		Entry("txt > lrc > embedded", ".txt,.lrc,embedded", unsyncedLyrics))
+		Entry("elrc > lrc > embedded", ".elrc,.lrc,embedded", elrcLyrics),
+		Entry("srt > txt > embedded", ".srt,.txt,embedded", srtLyrics),
+		Entry("txt > lrc > embedded", ".txt,.lrc,embedded", unsyncedLyrics),
+		Entry("ttml > elrc > lrc > srt > embedded", ".ttml,.elrc,.lrc,.srt,embedded", ttmlLyrics))
+
+	It("resolves source priority across duplicate media files", func() {
+		conf.Server.LyricsPriority = ".ttml,embedded"
+		embeddedJSON, err := json.Marshal(embeddedLyrics)
+		Expect(err).To(BeNil())
+
+		svc := lyrics.NewLyrics(nil)
+		batchSvc, ok := svc.(lyrics.BatchLyrics)
+		Expect(ok).To(BeTrue())
+
+		list, err := batchSvc.GetLyricsForMediaFiles(ctx, []model.MediaFile{
+			{
+				Lyrics: string(embeddedJSON),
+				Path:   "tests/fixtures/01 Invisible (RED) Edit Version.mp3",
+			},
+			{
+				Lyrics: "[]",
+				Path:   "tests/fixtures/test.mp3",
+			},
+		})
+		Expect(err).To(BeNil())
+		Expect(list).To(Equal(ttmlLyrics))
+	})
 
 	Context("Errors", func() {
 		var RegularUserContext = XContext
diff --git a/core/lyrics/sources.go b/core/lyrics/sources.go
index 82a10ca41..7586c944f 100644
--- a/core/lyrics/sources.go
+++ b/core/lyrics/sources.go
@@ -5,6 +5,7 @@ import (
 	"errors"
 	"os"
 	"path"
+	"strings"
 
 	"github.com/navidrome/navidrome/log"
 	"github.com/navidrome/navidrome/model"
@@ -36,18 +37,38 @@ func fromExternalFile(ctx context.Context, mf *model.MediaFile, suffix string) (
 		return nil, err
 	}
 
-	lyrics, err := model.ToLyrics("xxx", string(contents))
-	if err != nil {
-		log.Error(ctx, "error parsing lyric external file", "path", externalLyric, err)
-		return nil, err
-	} else if lyrics == nil {
+	var list model.LyricList
+	switch {
+	case strings.EqualFold(suffix, ".ttml"):
+		list, err = parseTTML(contents)
+		if err != nil {
+			log.Error(ctx, "error parsing ttml external file", "path", externalLyric, err)
+			return nil, err
+		}
+	case strings.EqualFold(suffix, ".srt"):
+		list, err = parseSRT(contents)
+		if err != nil {
+			log.Error(ctx, "error parsing srt external file", "path", externalLyric, err)
+			return nil, err
+		}
+	default:
+		lyrics, err := model.ToLyrics("xxx", string(contents))
+		if err != nil {
+			log.Error(ctx, "error parsing lyric external file", "path", externalLyric, err)
+			return nil, err
+		}
+		if lyrics != nil {
+			list = model.LyricList{*lyrics}
+		}
+	}
+
+	if len(list) == 0 {
 		log.Trace(ctx, "empty lyrics from external file", "path", externalLyric)
 		return nil, nil
 	}
 
 	log.Trace(ctx, "retrieved lyrics from external file", "path", externalLyric)
-
-	return model.LyricList{*lyrics}, nil
+	return list, nil
 }
 
 // fromPlugin attempts to load lyrics from a plugin with the given name.
diff --git a/core/lyrics/sources_test.go b/core/lyrics/sources_test.go
index b3d502101..1e98323ca 100644
--- a/core/lyrics/sources_test.go
+++ b/core/lyrics/sources_test.go
@@ -88,6 +88,89 @@ var _ = Describe("sources", func() {
 			}))
 		})
 
+		It("should return Enhanced LRC lyrics with word-level cues from a file", func() {
+			mf := model.MediaFile{Path: "tests/fixtures/test-enhanced.mp3"}
+			lyrics, err := fromExternalFile(ctx, &mf, ".lrc")
+
+			Expect(err).To(BeNil())
+			Expect(lyrics).To(HaveLen(1))
+			Expect(lyrics[0].DisplayArtist).To(Equal("Test Artist"))
+			Expect(lyrics[0].DisplayTitle).To(Equal("Enhanced Test"))
+			Expect(lyrics[0].Lang).To(Equal("eng"))
+			Expect(lyrics[0].Synced).To(BeTrue())
+			Expect(lyrics[0].Line).To(HaveLen(3))
+
+			// Line 1: has inline markers → Cue array populated
+			Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(1000))))
+			Expect(lyrics[0].Line[0].End).To(Equal(gg.P(int64(3000))))
+			Expect(lyrics[0].Line[0].Value).To(Equal("Some lyrics here"))
+			Expect(lyrics[0].Line[0].Cue).To(HaveLen(3))
+			Expect(*lyrics[0].Line[0].Cue[0].Start).To(Equal(int64(1000)))
+			Expect(lyrics[0].Line[0].Cue[0].Value).To(Equal("Some "))
+			Expect(lyrics[0].Line[0].Cue[0].End).To(Equal(gg.P(int64(1500))))
+			Expect(lyrics[0].Line[0].Cue[0].ByteStart).To(Equal(0))
+			Expect(lyrics[0].Line[0].Cue[0].ByteEnd).To(Equal(4))
+			Expect(*lyrics[0].Line[0].Cue[1].Start).To(Equal(int64(1500)))
+			Expect(lyrics[0].Line[0].Cue[1].Value).To(Equal("lyrics "))
+			Expect(lyrics[0].Line[0].Cue[1].End).To(Equal(gg.P(int64(2000))))
+			Expect(lyrics[0].Line[0].Cue[1].ByteStart).To(Equal(5))
+			Expect(lyrics[0].Line[0].Cue[1].ByteEnd).To(Equal(11))
+			Expect(*lyrics[0].Line[0].Cue[2].Start).To(Equal(int64(2000)))
+			Expect(lyrics[0].Line[0].Cue[2].Value).To(Equal("here"))
+			Expect(lyrics[0].Line[0].Cue[2].End).To(Equal(gg.P(int64(3000))))
+			Expect(lyrics[0].Line[0].Cue[2].ByteStart).To(Equal(12))
+			Expect(lyrics[0].Line[0].Cue[2].ByteEnd).To(Equal(15))
+
+			// Line 2: has inline markers
+			Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(3000))))
+			Expect(lyrics[0].Line[1].End).To(Equal(gg.P(int64(5000))))
+			Expect(lyrics[0].Line[1].Value).To(Equal("More words"))
+			Expect(lyrics[0].Line[1].Cue).To(HaveLen(2))
+			Expect(lyrics[0].Line[1].Cue[0].End).To(Equal(gg.P(int64(3500))))
+			Expect(lyrics[0].Line[1].Cue[1].End).To(Equal(gg.P(int64(5000))))
+			Expect(lyrics[0].Line[1].Cue[0].ByteStart).To(Equal(0))
+			Expect(lyrics[0].Line[1].Cue[0].ByteEnd).To(Equal(4))
+			Expect(lyrics[0].Line[1].Cue[1].ByteStart).To(Equal(5))
+			Expect(lyrics[0].Line[1].Cue[1].ByteEnd).To(Equal(9))
+
+			// Line 3: plain line, no cues
+			Expect(lyrics[0].Line[2].Start).To(Equal(gg.P(int64(5000))))
+			Expect(lyrics[0].Line[2].Value).To(Equal("Plain line without inline markers"))
+			Expect(lyrics[0].Line[2].Cue).To(BeNil())
+		})
+
+		It("should return Enhanced LRC lyrics from an ELRC file", func() {
+			mf := model.MediaFile{Path: "tests/fixtures/test.mp3"}
+			lyrics, err := fromExternalFile(ctx, &mf, ".elrc")
+
+			Expect(err).To(BeNil())
+			Expect(lyrics).To(HaveLen(1))
+			Expect(lyrics[0].DisplayArtist).To(Equal("ELRC Artist"))
+			Expect(lyrics[0].DisplayTitle).To(Equal("ELRC Song"))
+			Expect(lyrics[0].Lang).To(Equal("eng"))
+			Expect(lyrics[0].Synced).To(BeTrue())
+			Expect(lyrics[0].Line).To(HaveLen(2))
+
+			Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(1000))))
+			Expect(lyrics[0].Line[0].End).To(Equal(gg.P(int64(3000))))
+			Expect(lyrics[0].Line[0].Value).To(Equal("Lead words"))
+			Expect(lyrics[0].Line[0].Cue).To(HaveLen(2))
+			Expect(*lyrics[0].Line[0].Cue[0].Start).To(Equal(int64(1000)))
+			Expect(lyrics[0].Line[0].Cue[0].Value).To(Equal("Lead "))
+			Expect(lyrics[0].Line[0].Cue[0].End).To(Equal(gg.P(int64(1500))))
+			Expect(lyrics[0].Line[0].Cue[0].ByteStart).To(Equal(0))
+			Expect(lyrics[0].Line[0].Cue[0].ByteEnd).To(Equal(4))
+			Expect(*lyrics[0].Line[0].Cue[1].Start).To(Equal(int64(1500)))
+			Expect(lyrics[0].Line[0].Cue[1].Value).To(Equal("words"))
+			Expect(lyrics[0].Line[0].Cue[1].End).To(Equal(gg.P(int64(3000))))
+			Expect(lyrics[0].Line[0].Cue[1].ByteStart).To(Equal(5))
+			Expect(lyrics[0].Line[0].Cue[1].ByteEnd).To(Equal(9))
+
+			Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(3000))))
+			Expect(lyrics[0].Line[1].Value).To(Equal("Fallback line"))
+			Expect(lyrics[0].Line[1].Cue).To(BeNil())
+		})
+
 		It("should return unsynchronized lyrics from a file", func() {
 			mf := model.MediaFile{Path: "tests/fixtures/test.mp3"}
 			lyrics, err := fromExternalFile(ctx, &mf, ".txt")
@@ -109,6 +192,66 @@ var _ = Describe("sources", func() {
 			}))
 		})
 
+		It("should return synchronized lyrics from an SRT file", func() {
+			mf := model.MediaFile{Path: "tests/fixtures/test.mp3"}
+			lyrics, err := fromExternalFile(ctx, &mf, ".srt")
+
+			Expect(err).To(BeNil())
+			Expect(lyrics).To(Equal(model.LyricList{
+				model.Lyrics{
+					Lang: "xxx",
+					Line: []model.Line{
+						{
+							Start: gg.P(int64(18800)),
+							End:   gg.P(int64(22800)),
+							Value: "We're from subtitles",
+						},
+						{
+							Start: gg.P(int64(22801)),
+							End:   gg.P(int64(26000)),
+							Value: "Another subtitle line",
+						},
+					},
+					Synced: true,
+				},
+			}))
+		})
+
+		It("should return synchronized multilingual lyrics from a TTML file", func() {
+			mf := model.MediaFile{Path: "tests/fixtures/test.mp3"}
+			lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
+
+			Expect(err).To(BeNil())
+			Expect(lyrics).To(Equal(model.LyricList{
+				{
+					Kind: "main",
+					Lang: "eng",
+					Line: []model.Line{
+						{
+							Start: gg.P(int64(18800)),
+							Value: "We're no strangers to love",
+						},
+						{
+							Start: gg.P(int64(22800)),
+							Value: "You know the rules and so do I",
+						},
+					},
+					Synced: true,
+				},
+				{
+					Kind: "main",
+					Lang: "por",
+					Line: []model.Line{
+						{
+							Start: gg.P(int64(18800)),
+							Value: "Nao somos estranhos ao amor",
+						},
+					},
+					Synced: true,
+				},
+			}))
+		})
+
 		It("should handle LRC files with UTF-8 BOM marker (issue #4631)", func() {
 			// The function looks for <basePath-without-ext><suffix>, so we need to pass
 			// a MediaFile with .mp3 path and look for .lrc suffix
@@ -142,5 +285,33 @@ var _ = Describe("sources", func() {
 			Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(22801))))
 			Expect(lyrics[0].Line[1].Value).To(Equal("You know the rules and so do I"))
 		})
+
+		It("should handle TTML files with UTF-8 BOM marker", func() {
+			mf := model.MediaFile{Path: "tests/fixtures/bom-test.mp3"}
+			lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
+
+			Expect(err).To(BeNil())
+			Expect(lyrics).To(HaveLen(1))
+			Expect(lyrics[0].Kind).To(Equal("main"))
+			Expect(lyrics[0].Synced).To(BeTrue())
+			Expect(lyrics[0].Line).To(HaveLen(1))
+			Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(0))))
+			Expect(lyrics[0].Line[0].Value).To(Equal("BOM test line"))
+		})
+
+		It("should handle UTF-16 BE encoded TTML files", func() {
+			mf := model.MediaFile{Path: "tests/fixtures/bom-utf16-test.mp3"}
+			lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
+
+			Expect(err).To(BeNil())
+			Expect(lyrics).To(HaveLen(1))
+			Expect(lyrics[0].Kind).To(Equal("main"))
+			Expect(lyrics[0].Synced).To(BeTrue())
+			Expect(lyrics[0].Line).To(HaveLen(2))
+			Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(18800))))
+			Expect(lyrics[0].Line[0].Value).To(Equal("UTF16 line one"))
+			Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(22801))))
+			Expect(lyrics[0].Line[1].Value).To(Equal("UTF16 line two"))
+		})
 	})
 })
diff --git a/core/lyrics/srt.go b/core/lyrics/srt.go
new file mode 100644
index 000000000..8fd77abb4
--- /dev/null
+++ b/core/lyrics/srt.go
@@ -0,0 +1,161 @@
+package lyrics
+
+import (
+	"bytes"
+	"regexp"
+	"strconv"
+	"strings"
+
+	"github.com/navidrome/navidrome/model"
+	"github.com/navidrome/navidrome/utils/str"
+)
+
+var srtTimeRegex = regexp.MustCompile(`^\s*(\d{1,2}):(\d{2}):(\d{2})[,.](\d{1,3})\s*$`)
+
+// parseSRT converts SubRip subtitle text into a single synced lyrics entry
+// (Lang "xxx"). A leading UTF-8 BOM is dropped so the first cue is not lost;
+// it returns nil, nil when no cue yields text and fails on a bad timestamp.
+func parseSRT(contents []byte) (model.LyricList, error) {
+	// Normalize CRLF and bare CR so blocks can be split on "\n\n".
+	raw := strings.TrimPrefix(string(contents), "\ufeff")
+	raw = strings.NewReplacer("\r\n", "\n", "\r", "\n").Replace(raw)
+
+	blocks := splitSRTBlocks(raw)
+	lines := make([]model.Line, 0, len(blocks))
+	for _, block := range blocks {
+		line, ok, err := parseSRTBlock(block)
+		if err != nil {
+			return nil, err
+		}
+		if ok {
+			lines = append(lines, line)
+		}
+	}
+	if len(lines) == 0 {
+		return nil, nil
+	}
+	return model.LyricList{model.NormalizeLyrics(model.Lyrics{
+		Lang:   "xxx",
+		Line:   lines,
+		Synced: true,
+	})}, nil
+}
+
+// splitSRTBlocks cuts SRT text on blank lines ("\n\n") and returns the
+// trimmed, non-empty chunks; blank input yields nil.
+func splitSRTBlocks(raw string) []string {
+	trimmed := strings.TrimSpace(raw)
+	if len(trimmed) == 0 {
+		return nil
+	}
+	chunks := strings.Split(trimmed, "\n\n")
+	blocks := make([]string, 0, len(chunks))
+	for i := range chunks {
+		if chunk := strings.TrimSpace(chunks[i]); chunk != "" {
+			blocks = append(blocks, chunk)
+		}
+	}
+	return blocks
+}
+
+// parseSRTBlock parses one SRT cue block: an optional numeric counter line,
+// a "start --> end" timing line, then one or more text lines.
+//
+// It returns ok=false (and no error) when the block has no usable timing line
+// or no text, and an error when a timestamp is malformed. Anything following
+// the end timestamp on the timing line (for example the optional
+// "X1:.. X2:.. Y1:.. Y2:.." position hints) is ignored.
+func parseSRTBlock(block string) (model.Line, bool, error) {
+	rawLines := bytes.Split([]byte(block), []byte("\n"))
+	lines := make([]string, 0, len(rawLines))
+	for _, rawLine := range rawLines {
+		lines = append(lines, strings.TrimSpace(string(rawLine)))
+	}
+
+	// bytes.Split always yields at least one element, so lines[0] is safe.
+	startIdx := 0
+	if digitsOnly(lines[0]) {
+		startIdx = 1
+	}
+	if startIdx >= len(lines) {
+		return model.Line{}, false, nil
+	}
+
+	timing := strings.Split(lines[startIdx], "-->")
+	if len(timing) != 2 {
+		return model.Line{}, false, nil
+	}
+
+	startMs, err := parseSRTTime(timing[0])
+	if err != nil {
+		return model.Line{}, false, err
+	}
+	// Keep only the first field so trailing position hints do not break parsing.
+	endField := timing[1]
+	if fields := strings.Fields(endField); len(fields) > 0 {
+		endField = fields[0]
+	}
+	endMs, err := parseSRTTime(endField)
+	if err != nil {
+		return model.Line{}, false, err
+	}
+
+	textLines := make([]string, 0, len(lines)-startIdx-1)
+	for _, line := range lines[startIdx+1:] {
+		if line != "" {
+			textLines = append(textLines, line)
+		}
+	}
+
+	value := str.SanitizeText(strings.Join(textLines, "\n"))
+	if value == "" {
+		return model.Line{}, false, nil
+	}
+
+	return model.Line{Start: &startMs, End: &endMs, Value: value}, true, nil
+}
+
+func parseSRTTime(value string) (int64, error) {
+	match := srtTimeRegex.FindStringSubmatch(strings.TrimSpace(value))
+	if match == nil {
+		return 0, strconv.ErrSyntax
+	}
+
+	hours, err := strconv.ParseInt(match[1], 10, 64)
+	if err != nil {
+		return 0, err
+	}
+	minutes, err := strconv.ParseInt(match[2], 10, 64)
+	if err != nil {
+		return 0, err
+	}
+	seconds, err := strconv.ParseInt(match[3], 10, 64)
+	if err != nil {
+		return 0, err
+	}
+	millis, err := strconv.ParseInt(match[4], 10, 64)
+	if err != nil {
+		return 0, err
+	}
+
+	switch len(match[4]) {
+	case 1:
+		millis *= 100
+	case 2:
+		millis *= 10
+	}
+
+	return (((hours*60)+minutes)*60+seconds)*1000 + millis, nil
+}
+
+// digitsOnly reports whether value is non-empty and consists solely of the
+// ASCII digits 0-9.
+func digitsOnly(value string) bool {
+	if value == "" {
+		return false
+	}
+	nonDigit := func(r rune) bool {
+		return r < '0' || r > '9'
+	}
+	return strings.IndexFunc(value, nonDigit) < 0
+}
diff --git a/core/lyrics/ttml.go b/core/lyrics/ttml.go
new file mode 100644
index 000000000..576d2ca3d
--- /dev/null
+++ b/core/lyrics/ttml.go
@@ -0,0 +1,1264 @@
+package lyrics
+
+import (
+	"bytes"
+	"encoding/xml"
+	"errors"
+	"io"
+	"math"
+	"regexp"
+	"sort"
+	"strconv"
+	"strings"
+	"unicode"
+
+	"github.com/navidrome/navidrome/log"
+	"github.com/navidrome/navidrome/model"
+	"github.com/navidrome/navidrome/utils/str"
+)
+
+const (
+	defaultTTMLFrameRate    = 30.0
+	defaultTTMLSubFrameRate = 1.0
+	defaultTTMLTickRate     = 1.0
+
+	ttmlLyricKindMain          = "main"
+	ttmlLyricKindTranslation   = "translation"
+	ttmlLyricKindPronunciation = "pronunciation"
+	ttmlBackgroundAgentPrefix  = "__nd_bg__|"
+)
+
+var offsetTimeRegex = regexp.MustCompile(`^([0-9]+(?:\.[0-9]+)?)(h|m|s|ms|f|t)$`)
+var xmlEncodingRegex = regexp.MustCompile(`(?i)<\?xml([^>]*?)encoding\s*=\s*["'][^"']+["']([^>]*)\?>`)
+
+type ttmlTimeKind int
+
+const (
+	ttmlTimeAbsolute ttmlTimeKind = iota
+	ttmlTimeOffset
+	ttmlTimeAmbiguous
+)
+
+type ttmlTimingParams struct {
+	frameRate    float64
+	subFrameRate float64
+	tickRate     float64
+}
+
+type ttmlTimingContext struct {
+	lang     string
+	role     string
+	agentID  string
+	begin    int64
+	hasBegin bool
+	end      int64
+	hasEnd   bool
+	invalid  bool
+}
+
+type ttmlLineRef struct {
+	order int
+	line  model.Line
+}
+
+type ttmlMetadataEntry struct {
+	key  string
+	line model.Line
+	seq  int
+}
+
+type ttmlResolvedMetadataLine struct {
+	order int
+	seq   int
+	line  model.Line
+}
+
+type ttmlDefinedAgent struct {
+	ID   string
+	Type string
+	Name string
+}
+
+type ttmlPiece struct {
+	raw string
+	cue *model.Cue
+}
+
+type ttmlParser struct {
+	decoder *xml.Decoder
+	params  ttmlTimingParams
+
+	mainLangOrder   []string
+	mainLinesByLang map[string][]model.Line
+
+	mainLineRefsByKey map[string]ttmlLineRef
+	mainLineOrder     int
+
+	translationLangOrder   []string
+	translationEntriesByLg map[string][]ttmlMetadataEntry
+
+	pronunciationLangOrder   []string
+	pronunciationEntriesByLg map[string][]ttmlMetadataEntry
+
+	definedAgents map[string]ttmlDefinedAgent
+
+	metadataSeq int
+}
+
+// parseTTML decodes a TTML document into a lyric list by walking the XML token
+// stream and passing each start element it meets at this level to parseElement.
+func parseTTML(contents []byte) (model.LyricList, error) {
+	// Force the declared encoding to UTF-8 (NOTE(review): assumes the caller
+	// already transcoded the bytes). ${1}/${2} are braced on purpose: in
+	// "$1encoding" Go would look up a group named "1encoding" and expand nothing.
+	contents = xmlEncodingRegex.ReplaceAll(contents, []byte(`<?xml${1}encoding="UTF-8"${2}?>`))
+
+	p := ttmlParser{
+		decoder: xml.NewDecoder(bytes.NewReader(contents)),
+		params: ttmlTimingParams{
+			frameRate:    defaultTTMLFrameRate,
+			subFrameRate: defaultTTMLSubFrameRate,
+			tickRate:     defaultTTMLTickRate,
+		},
+		mainLinesByLang:          make(map[string][]model.Line),
+		mainLineRefsByKey:        make(map[string]ttmlLineRef),
+		translationEntriesByLg:   make(map[string][]ttmlMetadataEntry),
+		pronunciationEntriesByLg: make(map[string][]ttmlMetadataEntry),
+		definedAgents:            make(map[string]ttmlDefinedAgent),
+	}
+	root := ttmlTimingContext{lang: "xxx"}
+
+	for {
+		token, err := p.decoder.Token()
+		if errors.Is(err, io.EOF) {
+			break
+		}
+		if err != nil {
+			return nil, err
+		}
+		if start, ok := token.(xml.StartElement); ok {
+			if err := p.parseElement(start, root); err != nil {
+				return nil, err
+			}
+		}
+	}
+
+	return p.toLyricList(), nil
+}
+
+// parseElement handles one element whose start tag has already been read.
+// Metadata tracks and agent definitions go to their own parsers, a <p> becomes
+// a main line, and any other element is descended into recursively.
+func (p *ttmlParser) parseElement(start xml.StartElement, parent ttmlTimingContext) error {
+	local := strings.ToLower(start.Name.Local)
+	if local == "tt" {
+		p.updateTimingParams(start.Attr)
+	}
+	switch local {
+	case "translation":
+		return p.parseMetadataTrack(start, parent, ttmlLyricKindTranslation)
+	case "transliteration":
+		return p.parseMetadataTrack(start, parent, ttmlLyricKindPronunciation)
+	case "agent":
+		return p.parseAgentDefinition(start)
+	}
+	ctx := p.childContext(start.Attr, parent)
+	if local == "p" {
+		lineText, tokens, err := p.parseParagraph(ctx)
+		if err != nil {
+			return err
+		}
+		if ctx.invalid || lineText == "" {
+			return nil
+		}
+		// Copy begin/end into locals so the line does not point into ctx.
+		parsedLine := model.Line{Value: lineText}
+		if ctx.hasBegin {
+			startMs := ctx.begin
+			parsedLine.Start = &startMs
+		}
+		if ctx.hasEnd {
+			endMs := ctx.end
+			parsedLine.End = &endMs
+		}
+		if len(tokens) > 0 {
+			parsedLine.Cue = tokens
+		}
+		parsedLine = hydrateLineTimingFromTokens(parsedLine)
+		lineKey, _ := attrValue(start.Attr, "key")
+		p.addMainLine(ctx.lang, lineKey, parsedLine)
+		return nil
+	}
+	// Container element: visit children until this element's end tag shows up.
+	for {
+		token, err := p.decoder.Token()
+		if err != nil {
+			return err
+		}
+
+		switch t := token.(type) {
+		case xml.StartElement:
+			nextParent := ctx
+			if ctx.invalid {
+				// Best effort: ignore invalid timing in container elements, and
+				// continue traversing descendants with parent context.
+				nextParent = parent
+			}
+			if err := p.parseElement(t, nextParent); err != nil {
+				return err
+			}
+		case xml.EndElement:
+			if strings.EqualFold(t.Name.Local, start.Name.Local) {
+				return nil
+			}
+		}
+	}
+}
+
+// parseMetadataTrack reads a translation/transliteration track, recording each
+// <text> child under the given kind and the track's normalized language.
+func (p *ttmlParser) parseMetadataTrack(start xml.StartElement, parent ttmlTimingContext, kind string) error {
+	ctx := p.childContext(start.Attr, parent)
+	lang := normalizeTTMLLang(ctx.lang)
+	for {
+		token, err := p.decoder.Token()
+		if err != nil {
+			return err
+		}
+		switch t := token.(type) {
+		case xml.StartElement:
+			if strings.EqualFold(t.Name.Local, "text") {
+				entry, ok, err := p.parseMetadataText(t, ctx)
+				if err != nil {
+					return err
+				}
+				if ok {
+					p.addMetadataEntry(kind, lang, entry)
+				}
+				continue
+			}
+			// Other children are parsed as regular elements; an invalid track context is bypassed.
+			nextParent := ctx
+			if ctx.invalid {
+				nextParent = parent
+			}
+			if err := p.parseElement(t, nextParent); err != nil {
+				return err
+			}
+		case xml.EndElement:
+			if strings.EqualFold(t.Name.Local, start.Name.Local) {
+				return nil
+			}
+		}
+	}
+}
+
+// parseAgentDefinition records an <agent> element under its id, keeping its
+// lower-cased type and first non-empty <name>; without an id it calls skipElement.
+func (p *ttmlParser) parseAgentDefinition(start xml.StartElement) error {
+	id, ok := attrValue(start.Attr, "id")
+	id = strings.TrimSpace(id)
+	if !ok || id == "" {
+		return p.skipElement(start)
+	}
+	agent := ttmlDefinedAgent{
+		ID:   id,
+		Type: strings.ToLower(strings.TrimSpace(attrOrEmpty(start.Attr, "type"))),
+	}
+	// The agent is stored only once its closing tag has been reached.
+	for {
+		token, err := p.decoder.Token()
+		if err != nil {
+			return err
+		}
+		switch t := token.(type) {
+		case xml.StartElement:
+			if strings.EqualFold(t.Name.Local, "name") {
+				name, err := p.collectElementText(t)
+				if err != nil {
+					return err
+				}
+				name = sanitizeTTMLText(name)
+				if name != "" && agent.Name == "" {
+					agent.Name = name
+				}
+				continue
+			}
+			if err := p.skipElement(t); err != nil {
+				return err
+			}
+		case xml.EndElement:
+			if strings.EqualFold(t.Name.Local, start.Name.Local) {
+				p.definedAgents[agent.ID] = agent
+				return nil
+			}
+		}
+	}
+}
+
+// parseMetadataText turns one <text for="key"> element into a metadata entry.
+// ok is false when the "for" key is missing or blank, the timing context is
+// invalid, or the element has neither text nor cues.
+func (p *ttmlParser) parseMetadataText(start xml.StartElement, parent ttmlTimingContext) (ttmlMetadataEntry, bool, error) {
+	forKey, hasFor := attrValue(start.Attr, "for")
+	forKey = strings.TrimSpace(forKey)
+	// The element is consumed first, so the early returns below leave the decoder past its end tag.
+	pieces, err := p.parseInlineElement(start, parent)
+	if err != nil {
+		return ttmlMetadataEntry{}, false, err
+	}
+	if !hasFor || forKey == "" {
+		return ttmlMetadataEntry{}, false, nil
+	}
+	ctx := p.childContext(start.Attr, parent)
+	if ctx.invalid {
+		return ttmlMetadataEntry{}, false, nil
+	}
+
+	value, tokens := buildTTMLLineFromPieces(pieces)
+	line := model.Line{Value: value}
+	if ctx.hasBegin {
+		startMs := ctx.begin
+		line.Start = &startMs
+	}
+	if ctx.hasEnd {
+		endMs := ctx.end
+		line.End = &endMs
+	}
+	if len(tokens) > 0 {
+		line.Cue = tokens
+	}
+	line = hydrateLineTimingFromTokens(line)
+	if line.Value == "" && len(line.Cue) == 0 {
+		return ttmlMetadataEntry{}, false, nil
+	}
+	return ttmlMetadataEntry{key: forKey, line: line}, true, nil
+}
+
+// parseParagraph reads tokens up to the closing </p>, gathering character data
+// and inline child elements, and returns the assembled line text and cues.
+func (p *ttmlParser) parseParagraph(parent ttmlTimingContext) (string, []model.Cue, error) {
+	var pieces []ttmlPiece
+	for {
+		token, err := p.decoder.Token()
+		if err != nil {
+			return "", nil, err
+		}
+		switch t := token.(type) {
+		case xml.StartElement:
+			inlinePieces, err := p.parseInlineElement(t, parent)
+			if err != nil {
+				return "", nil, err
+			}
+			pieces = append(pieces, inlinePieces...)
+		case xml.EndElement:
+			if strings.EqualFold(t.Name.Local, "p") {
+				value, tokens := buildTTMLLineFromPieces(pieces)
+				return value, tokens, nil
+			}
+		case xml.CharData:
+			pieces = append(pieces, ttmlPiece{raw: string(t)})
+		}
+	}
+}
+
+// parseInlineElement reads an inline element inside a paragraph or metadata
+// text and returns its pieces. <br> yields a newline piece immediately; its
+// end tag is left for the caller's loop, which ignores non-matching end tags.
+func (p *ttmlParser) parseInlineElement(start xml.StartElement, parent ttmlTimingContext) ([]ttmlPiece, error) {
+	local := strings.ToLower(start.Name.Local)
+	if local == "br" {
+		return []ttmlPiece{{raw: "\n"}}, nil
+	}
+
+	ctx := p.childContext(start.Attr, parent)
+	_, hasBegin := attrValue(start.Attr, "begin")
+	_, hasEnd := attrValue(start.Attr, "end")
+	_, hasDur := attrValue(start.Attr, "dur")
+	hasOwnTiming := hasBegin || hasEnd || hasDur
+
+	var pieces []ttmlPiece
+	for {
+		token, err := p.decoder.Token()
+		if err != nil {
+			return nil, err
+		}
+		switch t := token.(type) {
+		case xml.StartElement:
+			inlinePieces, err := p.parseInlineElement(t, ctx)
+			if err != nil {
+				return nil, err
+			}
+			pieces = append(pieces, inlinePieces...)
+		case xml.EndElement:
+			if !strings.EqualFold(t.Name.Local, start.Name.Local) {
+				continue
+			}
+			// A validly timed <span> with text, for which ttmlPiecesContainCue is false, becomes one cue.
+			if local == "span" && hasOwnTiming && !ctx.invalid && !ttmlPiecesContainCue(pieces) {
+				rawValue := concatTTMLPieceRaw(pieces)
+				tokenText := sanitizeTTMLText(rawValue)
+				if tokenText != "" {
+					parsedToken := model.Cue{
+						AgentID: p.resolveCueAgentID(ctx),
+					}
+					if ctx.hasBegin {
+						startMs := ctx.begin
+						parsedToken.Start = &startMs
+					}
+					if ctx.hasEnd {
+						endMs := ctx.end
+						parsedToken.End = &endMs
+					}
+					return []ttmlPiece{{
+						raw: rawValue,
+						cue: &parsedToken,
+					}}, nil
+				}
+			}
+			// Otherwise the collected pieces are passed through unchanged.
+			return pieces, nil
+		case xml.CharData:
+			pieces = append(pieces, ttmlPiece{raw: string(t)})
+		}
+	}
+}
+
+// buildTTMLLineFromPieces flattens parsed pieces into the line text and its
+// cues. Empty logical lines at either end are dropped, the remaining ones are
+// joined with "\n", and every cue's byte offsets are shifted so that they
+// index into the joined text.
+func buildTTMLLineFromPieces(pieces []ttmlPiece) (string, []model.Cue) {
+	logical := finalizeTTMLLines(splitTTMLPiecesByNewline(pieces))
+	isBlank := func(l ttmlFinalLine) bool { return l.text == "" && len(l.cues) == 0 }
+	for len(logical) > 0 && isBlank(logical[0]) {
+		logical = logical[1:]
+	}
+	for len(logical) > 0 && isBlank(logical[len(logical)-1]) {
+		logical = logical[:len(logical)-1]
+	}
+
+	var joined strings.Builder
+	cues := make([]model.Cue, 0, 8)
+	for i := range logical {
+		if i > 0 {
+			joined.WriteByte('\n')
+		}
+		// Bytes written so far, separator included, precede this line.
+		shift := joined.Len()
+		for _, cue := range logical[i].cues {
+			cue.ByteStart += shift
+			cue.ByteEnd += shift
+			cues = append(cues, cue)
+		}
+		joined.WriteString(logical[i].text)
+	}
+
+	return joined.String(), cues
+}
+
+type ttmlFinalLine struct {
+	text string
+	cues []model.Cue
+}
+
+// finalizeTTMLLines runs finalizeTTMLLogicalLine over every logical line.
+func finalizeTTMLLines(lines [][]ttmlPiece) []ttmlFinalLine {
+	out := make([]ttmlFinalLine, len(lines))
+	for i := range lines {
+		out[i].text, out[i].cues = finalizeTTMLLogicalLine(lines[i])
+	}
+	return out
+}
+
+// splitTTMLPiecesByNewline regroups pieces into logical lines, starting a new
+// line at every '\n' found in a piece's normalized text.
+//
+// Pieces that normalize to the empty string are ignored. A piece spanning
+// several lines is cut into one piece per line, each carrying its own copy of
+// the cue; empty fragments are not emitted. The result always holds at least
+// one (possibly empty) line.
+func splitTTMLPiecesByNewline(pieces []ttmlPiece) [][]ttmlPiece {
+	rows := [][]ttmlPiece{{}}
+	for _, src := range pieces {
+		normalized := normalizeTTMLPieceRaw(src.raw)
+		if normalized == "" {
+			continue
+		}
+
+		for segIdx, segment := range strings.Split(normalized, "\n") {
+			// Every fragment after the first sits behind a newline.
+			if segIdx > 0 {
+				rows = append(rows, []ttmlPiece{})
+			}
+			if segment == "" {
+				continue
+			}
+			current := len(rows) - 1
+			rows[current] = append(rows[current], ttmlPiece{
+				raw: segment,
+				cue: cloneTTMLCue(src.cue),
+			})
+		}
+	}
+	return rows
+}
+
+// finalizeTTMLLogicalLine trims one logical line (a run of pieces without line
+// breaks) and computes the cues that survive the trim.
+//
+// It returns the line text with surrounding whitespace removed, plus one cue
+// for every timed piece that still overlaps the trimmed text. Cue offsets are
+// byte offsets into the returned text and ByteEnd is inclusive (the index of
+// the cue's last byte).
+func finalizeTTMLLogicalLine(line []ttmlPiece) (string, []model.Cue) {
+	// Normalize each piece exactly once and keep the result: the offsets below
+	// must be measured on the very bytes that make up rawLine, and
+	// normalization can change a piece's length (e.g. "\r\n" becomes "\n").
+	normalized := make([]string, len(line))
+	var joined strings.Builder
+	for i := range line {
+		normalized[i] = normalizeTTMLPieceRaw(line[i].raw)
+		joined.WriteString(normalized[i])
+	}
+	rawLine := joined.String()
+	if rawLine == "" {
+		return "", nil
+	}
+
+	leftTrimBytes := len(rawLine) - len(strings.TrimLeftFunc(rawLine, unicode.IsSpace))
+	rightTrimBytes := len(rawLine) - len(strings.TrimRightFunc(rawLine, unicode.IsSpace))
+	// For an all-whitespace line both trims consume the whole string; clamp so
+	// the kept window is empty instead of inverted.
+	trimmedEnd := max(len(rawLine)-rightTrimBytes, leftTrimBytes)
+
+	trimmed := strings.TrimSpace(rawLine)
+	cues := make([]model.Cue, 0, len(line))
+	cursor := 0
+	for i := range line {
+		pieceEnd := cursor + len(normalized[i])
+		if line[i].cue != nil {
+			// Clip the piece to the window that survives trimming.
+			byteStart := max(cursor, leftTrimBytes)
+			byteEnd := min(pieceEnd, trimmedEnd)
+			if byteStart < byteEnd {
+				cue := *line[i].cue
+				cue.Value = rawLine[byteStart:byteEnd]
+				cue.ByteStart = byteStart - leftTrimBytes
+				cue.ByteEnd = byteEnd - leftTrimBytes - 1 // inclusive end
+				cues = append(cues, cue)
+			}
+		}
+		cursor = pieceEnd
+	}
+
+	return trimmed, cues
+}
+
+// normalizeTTMLPieceRaw sanitizes a raw text fragment with str.SanitizeText and
+// converts "\r\n" and lone "\r" line endings to "\n". Other whitespace is left
+// untouched.
+func normalizeTTMLPieceRaw(raw string) string {
+	clean := str.SanitizeText(raw)
+	// CRLF must be folded first so it does not become two newlines.
+	return strings.ReplaceAll(strings.ReplaceAll(clean, "\r\n", "\n"), "\r", "\n")
+}
+
+// concatTTMLPieceRaw returns the normalized text of all pieces joined in order
+// with no separator.
+func concatTTMLPieceRaw(pieces []ttmlPiece) string {
+	var out strings.Builder
+	for idx := range pieces {
+		out.WriteString(normalizeTTMLPieceRaw(pieces[idx].raw))
+	}
+	return out.String()
+}
+
+// ttmlPiecesContainCue reports whether at least one piece carries a cue.
+func ttmlPiecesContainCue(pieces []ttmlPiece) bool {
+	for idx := 0; idx < len(pieces); idx++ {
+		if pieces[idx].cue == nil {
+			continue
+		}
+		return true
+	}
+	return false
+}
+
+// cloneTTMLCue returns a pointer to a shallow copy of cue, or nil when cue is
+// nil. Pointer fields of the copy still refer to the same values as the
+// original.
+func cloneTTMLCue(cue *model.Cue) *model.Cue {
+	if cue != nil {
+		dup := *cue
+		return &dup
+	}
+	return nil
+}
+
+// toLyricList assembles everything the parser has collected into a LyricList:
+// one main track per language (in first-seen order, skipping languages without
+// lines), followed by the translation tracks and then the pronunciation
+// tracks.
+func (p *ttmlParser) toLyricList() model.LyricList {
+	capacity := len(p.mainLangOrder) + len(p.translationLangOrder) + len(p.pronunciationLangOrder)
+	out := make(model.LyricList, 0, capacity)
+
+	for _, lang := range p.mainLangOrder {
+		if mainLines := p.mainLinesByLang[lang]; len(mainLines) > 0 {
+			out = append(out, p.finalizeLyrics(model.Lyrics{
+				Kind:   ttmlLyricKindMain,
+				Lang:   lang,
+				Line:   mainLines,
+				Synced: linesAreSynced(mainLines),
+			}))
+		}
+	}
+
+	translations := p.buildMetadataLyrics(ttmlLyricKindTranslation, p.translationLangOrder, p.translationEntriesByLg)
+	pronunciations := p.buildMetadataLyrics(ttmlLyricKindPronunciation, p.pronunciationLangOrder, p.pronunciationEntriesByLg)
+	out = append(out, translations...)
+	return append(out, pronunciations...)
+}
+
+// buildMetadataLyrics builds one lyrics track per language from metadata
+// entries (translations or pronunciations) that refer to main lines by key.
+//
+// For every language in langOrder the entries are resolved against
+// p.mainLineRefsByKey:
+//   - only the first entry for a given key is used;
+//   - entries whose key matches no main line are logged and skipped;
+//   - missing Start/End values are copied from the referenced main line;
+//   - entries with neither text nor cues are dropped.
+//
+// The surviving lines are ordered by the position of the main line they refer
+// to, then by the order in which the entries were recorded. Languages that end
+// up with no lines produce no track.
+func (p *ttmlParser) buildMetadataLyrics(kind string, langOrder []string, entriesByLang map[string][]ttmlMetadataEntry) model.LyricList {
+	res := make(model.LyricList, 0, len(langOrder))
+
+	for _, lang := range langOrder {
+		entries := entriesByLang[lang]
+		if len(entries) == 0 {
+			continue
+		}
+
+		seenKeys := make(map[string]struct{}, len(entries))
+		resolved := make([]ttmlResolvedMetadataLine, 0, len(entries))
+		for _, entry := range entries {
+			// First entry wins; the key is marked as seen even if the entry is
+			// later skipped, so a duplicate can never replace it.
+			if _, exists := seenKeys[entry.key]; exists {
+				continue
+			}
+			seenKeys[entry.key] = struct{}{}
+
+			ref, ok := p.mainLineRefsByKey[entry.key]
+			if !ok {
+				log.Warn("Skipping TTML metadata line without matching key", "kind", kind, "lang", lang, "key", entry.key)
+				continue
+			}
+
+			// Inherit timing from the main line; the values are copied so the
+			// two lines do not share pointers.
+			line := entry.line
+			if line.Start == nil && ref.line.Start != nil {
+				startMs := *ref.line.Start
+				line.Start = &startMs
+			}
+			if line.End == nil && ref.line.End != nil {
+				endMs := *ref.line.End
+				line.End = &endMs
+			}
+			line = hydrateLineTimingFromTokens(line)
+
+			if line.Value == "" && len(line.Cue) == 0 {
+				continue
+			}
+
+			resolved = append(resolved, ttmlResolvedMetadataLine{
+				order: ref.order,
+				seq:   entry.seq,
+				line:  line,
+			})
+		}
+
+		if len(resolved) == 0 {
+			continue
+		}
+
+		// Follow the main track's line order; seq breaks ties.
+		sort.SliceStable(resolved, func(i, j int) bool {
+			if resolved[i].order != resolved[j].order {
+				return resolved[i].order < resolved[j].order
+			}
+			return resolved[i].seq < resolved[j].seq
+		})
+
+		lines := make([]model.Line, len(resolved))
+		for i := range resolved {
+			lines[i] = resolved[i].line
+		}
+
+		res = append(res, p.finalizeLyrics(model.Lyrics{
+			Kind:   kind,
+			Lang:   lang,
+			Line:   lines,
+			Synced: linesAreSynced(lines),
+		}))
+	}
+
+	return res
+}
+
+// finalizeLyrics resolves the agents referenced by the track's cues, stores
+// the resulting lines and agent list on the track, and returns it after
+// model.NormalizeLyrics.
+func (p *ttmlParser) finalizeLyrics(lyrics model.Lyrics) model.Lyrics {
+	resolvedLines, agents := p.resolveAgents(lyrics.Line)
+	lyrics.Line = resolvedLines
+	lyrics.Agents = agents
+	return model.NormalizeLyrics(lyrics)
+}
+
+// resolveAgents determines which agents a track uses, picks the main one, and
+// assigns it to every cue that has no agent.
+//
+// It returns the lines (cues are updated in place) and the agent list. The
+// list is nil when no cue names an agent; otherwise agents appear in
+// first-use order, with a synthesized main agent placed first when it was not
+// referenced by any cue. The main agent always reports the role "main".
+func (p *ttmlParser) resolveAgents(lines []model.Line) ([]model.Line, []model.Agent) {
+	if len(lines) == 0 {
+		return lines, nil
+	}
+
+	// Collect distinct agent ids in first-use order and note whether any cue
+	// lacks one.
+	usedOrder := make([]string, 0, 4)
+	usedSet := make(map[string]struct{}, 4)
+	sawEmptyCue := false
+
+	for i := range lines {
+		for j := range lines[i].Cue {
+			agentID := strings.TrimSpace(lines[i].Cue[j].AgentID)
+			if agentID == "" {
+				sawEmptyCue = true
+				continue
+			}
+			if _, exists := usedSet[agentID]; !exists {
+				usedSet[agentID] = struct{}{}
+				usedOrder = append(usedOrder, agentID)
+			}
+		}
+	}
+
+	if len(usedOrder) == 0 {
+		return lines, nil
+	}
+
+	// Main agent selection, in order of preference:
+	//  1. the first used agent that is neither background nor group;
+	//  2. a synthetic "main" agent, when some cues have no agent at all;
+	//  3. the first used agent that is not background;
+	//  4. the first used agent.
+	mainID := ""
+	for _, agentID := range usedOrder {
+		role := p.baseRoleForAgent(agentID)
+		if role != "bg" && role != "group" {
+			mainID = agentID
+			break
+		}
+	}
+	if mainID == "" && sawEmptyCue {
+		mainID = "main"
+	}
+	if mainID == "" {
+		for _, agentID := range usedOrder {
+			if p.baseRoleForAgent(agentID) != "bg" {
+				mainID = agentID
+				break
+			}
+		}
+	}
+	if mainID == "" {
+		mainID = usedOrder[0]
+	}
+
+	// A synthesized main agent was never referenced by a cue; list it first.
+	if _, exists := usedSet[mainID]; !exists {
+		usedSet[mainID] = struct{}{}
+		usedOrder = append([]string{mainID}, usedOrder...)
+	}
+
+	for i := range lines {
+		for j := range lines[i].Cue {
+			if strings.TrimSpace(lines[i].Cue[j].AgentID) == "" {
+				lines[i].Cue[j].AgentID = mainID
+			}
+		}
+	}
+
+	agents := make([]model.Agent, 0, len(usedOrder))
+	for _, agentID := range usedOrder {
+		role := p.baseRoleForAgent(agentID)
+		if agentID == mainID {
+			role = "main"
+		}
+		agent := model.Agent{
+			ID:   agentID,
+			Role: role,
+			Name: p.agentNameForID(agentID),
+		}
+		agents = append(agents, agent)
+	}
+
+	return lines, agents
+}
+
+// resolveCueAgentID returns the agent id to store on a cue created in ctx.
+//
+// Outside a background ("x-bg") role this is the trimmed context agent id,
+// which may be empty. Inside one, it is the background id derived from that
+// agent, with "main" standing in for an empty id.
+func (p *ttmlParser) resolveCueAgentID(ctx ttmlTimingContext) string {
+	id := strings.TrimSpace(ctx.agentID)
+	if !contextHasRole(ctx.role, "x-bg") {
+		return id
+	}
+	if id == "" {
+		id = "main"
+	}
+	return backgroundAgentID(id)
+}
+
+// baseRoleForAgent returns the role implied by the agent id alone: "bg" for a
+// derived background id, "group" for a declared agent of type "group", and
+// "voice" for everything else, including undeclared agents.
+func (p *ttmlParser) baseRoleForAgent(agentID string) string {
+	if isBackgroundAgentID(agentID) {
+		return "bg"
+	}
+
+	agent, declared := p.definedAgents[agentID]
+	if declared && agent.Type == "group" {
+		return "group"
+	}
+	return "voice"
+}
+
+// agentNameForID returns the declared display name for agentID, or "" when
+// none is known. A background id takes the name of the agent it derives from;
+// the background of the synthetic "main" agent has no name.
+func (p *ttmlParser) agentNameForID(agentID string) string {
+	lookupID := agentID
+	if isBackgroundAgentID(agentID) {
+		lookupID = strings.TrimPrefix(agentID, ttmlBackgroundAgentPrefix)
+		if lookupID == "main" {
+			return ""
+		}
+	}
+
+	agent, declared := p.definedAgents[lookupID]
+	if !declared {
+		return ""
+	}
+	return agent.Name
+}
+
+// backgroundAgentID derives the id of agentID's background-vocal counterpart
+// by prepending ttmlBackgroundAgentPrefix.
+func backgroundAgentID(agentID string) string {
+	return ttmlBackgroundAgentPrefix + agentID
+}
+
+// isBackgroundAgentID reports whether agentID starts with
+// ttmlBackgroundAgentPrefix, i.e. has the form produced by backgroundAgentID.
+func isBackgroundAgentID(agentID string) bool {
+	return strings.HasPrefix(agentID, ttmlBackgroundAgentPrefix)
+}
+
+// contextHasRole reports whether role appears as a whole token in the
+// whitespace-separated list roles. The comparison ignores case.
+func contextHasRole(roles string, role string) bool {
+	wanted := strings.ToLower(role)
+	tokens := strings.Fields(strings.ToLower(roles))
+	for idx := range tokens {
+		if tokens[idx] == wanted {
+			return true
+		}
+	}
+	return false
+}
+
+// addMainLine records a main lyric line for lang, remembering the order in
+// which languages first appear.
+//
+// When lineKey is non-blank and not yet registered, the line is also indexed
+// under that key together with its global position, so metadata entries can
+// refer to it; for duplicate keys the first line wins. The global position
+// counter advances for every line, keyed or not.
+func (p *ttmlParser) addMainLine(lang string, lineKey string, line model.Line) {
+	normalizedLang := normalizeTTMLLang(lang)
+	existing, seen := p.mainLinesByLang[normalizedLang]
+	if !seen {
+		p.mainLangOrder = append(p.mainLangOrder, normalizedLang)
+	}
+	p.mainLinesByLang[normalizedLang] = append(existing, line)
+
+	order := p.mainLineOrder
+	p.mainLineOrder++
+
+	key := strings.TrimSpace(lineKey)
+	if key == "" {
+		return
+	}
+	if _, taken := p.mainLineRefsByKey[key]; taken {
+		return
+	}
+	p.mainLineRefsByKey[key] = ttmlLineRef{
+		order: order,
+		line:  line,
+	}
+}
+
+// addMetadataEntry stamps entry with the next sequence number and files it
+// under lang in the translation or pronunciation collection chosen by kind,
+// remembering the order in which languages first appear.
+//
+// An unrecognized kind stores nothing, but the sequence number is still
+// consumed.
+func (p *ttmlParser) addMetadataEntry(kind string, lang string, entry ttmlMetadataEntry) {
+	entry.seq = p.metadataSeq
+	p.metadataSeq++
+	key := normalizeTTMLLang(lang)
+
+	var (
+		byLang map[string][]ttmlMetadataEntry
+		order  *[]string
+	)
+	switch kind {
+	case ttmlLyricKindTranslation:
+		byLang, order = p.translationEntriesByLg, &p.translationLangOrder
+	case ttmlLyricKindPronunciation:
+		byLang, order = p.pronunciationEntriesByLg, &p.pronunciationLangOrder
+	default:
+		return
+	}
+
+	if _, known := byLang[key]; !known {
+		*order = append(*order, key)
+	}
+	byLang[key] = append(byLang[key], entry)
+}
+
+// childContext derives the timing context of an element from its attributes
+// and the context of its parent.
+//
+// Language and agent are overridden by the element's "lang" and "agent"
+// attributes; roles accumulate down the tree. "begin", "end" and "dur" are
+// parsed with the parser's timing parameters and resolved against the parent
+// via resolveTTMLTime. When both "end" and "dur" are given the earlier
+// resulting end is used; "dur" is ignored when no begin is known. Timing that
+// the element does not specify is inherited from the parent.
+//
+// If any timing attribute cannot be parsed, the returned context has invalid
+// set and its timing fields must not be relied upon.
+func (p *ttmlParser) childContext(attrs []xml.Attr, parent ttmlTimingContext) ttmlTimingContext {
+	ctx := parent
+
+	if lang, ok := attrValue(attrs, "lang"); ok {
+		ctx.lang = normalizeTTMLLang(lang)
+	}
+	if agentID, ok := attrValue(attrs, "agent"); ok {
+		ctx.agentID = strings.TrimSpace(agentID)
+	}
+	if role, ok := attrValue(attrs, "role"); ok {
+		ctx.role = mergeTTMLRoles(ctx.role, role)
+	}
+
+	beginExpr, hasBegin := attrValue(attrs, "begin")
+	endExpr, hasEnd := attrValue(attrs, "end")
+	durExpr, hasDur := attrValue(attrs, "dur")
+
+	if hasBegin {
+		begin, kind, ok := parseTTMLTimeExpression(beginExpr, p.params)
+		if !ok {
+			ctx.invalid = true
+			return ctx
+		}
+
+		// Offsets are relative to the parent's begin, when it has one.
+		base := int64(0)
+		if parent.hasBegin {
+			base = parent.begin
+		}
+		ctx.begin = resolveTTMLTime(begin, kind, base, parent)
+		ctx.hasBegin = true
+	} else {
+		ctx.begin = parent.begin
+		ctx.hasBegin = parent.hasBegin
+	}
+
+	var calculatedEnd int64
+	calculatedHasEnd := false
+
+	if hasEnd {
+		end, kind, ok := parseTTMLTimeExpression(endExpr, p.params)
+		if !ok {
+			ctx.invalid = true
+			return ctx
+		}
+
+		base := ctx.begin
+		if !ctx.hasBegin {
+			base = parent.begin
+		}
+		calculatedEnd = resolveTTMLTime(end, kind, base, parent)
+		calculatedHasEnd = true
+	}
+
+	if hasDur {
+		dur, ok := parseTTMLDurationExpression(durExpr, p.params)
+		if !ok {
+			ctx.invalid = true
+			return ctx
+		}
+		if ctx.hasBegin {
+			// With both "end" and "dur" present, the earlier end wins.
+			durEnd := ctx.begin + dur
+			if !calculatedHasEnd || durEnd < calculatedEnd {
+				calculatedEnd = durEnd
+				calculatedHasEnd = true
+			}
+		}
+	}
+
+	if !calculatedHasEnd && parent.hasEnd {
+		calculatedEnd = parent.end
+		calculatedHasEnd = true
+	}
+
+	ctx.end = calculatedEnd
+	ctx.hasEnd = calculatedHasEnd
+	return ctx
+}
+
+// mergeTTMLRoles appends to the space-separated list existing every token of
+// added that the list does not contain yet, and returns the result.
+//
+// Tokens are compared whole: a substring test would treat "x-bg" as already
+// present in a list holding "x-bgx" and silently drop the role.
+func mergeTTMLRoles(existing string, added string) string {
+	for _, token := range strings.Fields(added) {
+		known := false
+		for _, have := range strings.Fields(existing) {
+			if have == token {
+				known = true
+				break
+			}
+		}
+		switch {
+		case known:
+			// Already listed; nothing to add.
+		case existing == "":
+			existing = token
+		default:
+			existing += " " + token
+		}
+	}
+	return existing
+}
+
+// updateTimingParams updates the parser's frame, sub-frame and tick rates from
+// the "frameRate", "frameRateMultiplier", "subFrameRate" and "tickRate"
+// attributes in attrs.
+//
+// Attributes that are missing, unparsable, non-positive or non-finite are
+// ignored and leave the current value in place. "frameRateMultiplier" is a
+// "numerator denominator" pair that scales the frame rate; it is ignored
+// unless both numbers are positive and the scaled rate is finite. Rates that
+// still end up non-positive fall back to the package defaults.
+func (p *ttmlParser) updateTimingParams(attrs []xml.Attr) {
+	// positiveAttr returns the named attribute as a finite, strictly positive
+	// number, or current when the attribute is absent or unusable.
+	positiveAttr := func(name string, current float64) float64 {
+		value, ok := attrValue(attrs, name)
+		if !ok {
+			return current
+		}
+		parsed, err := strconv.ParseFloat(value, 64)
+		// ParseFloat accepts "NaN" and "Inf"; neither is a usable rate.
+		if err != nil || math.IsNaN(parsed) || math.IsInf(parsed, 0) || parsed <= 0 {
+			return current
+		}
+		return parsed
+	}
+
+	frameRate := positiveAttr("frameRate", p.params.frameRate)
+
+	if value, ok := attrValue(attrs, "frameRateMultiplier"); ok {
+		if parts := strings.Fields(value); len(parts) == 2 {
+			numerator, errA := strconv.ParseFloat(parts[0], 64)
+			denominator, errB := strconv.ParseFloat(parts[1], 64)
+			if errA == nil && errB == nil && numerator > 0 && denominator > 0 {
+				// A NaN product (e.g. Inf/Inf) fails the > 0 test as well.
+				if scaled := frameRate * (numerator / denominator); scaled > 0 && !math.IsInf(scaled, 1) {
+					frameRate = scaled
+				}
+			}
+		}
+	}
+
+	subFrameRate := positiveAttr("subFrameRate", p.params.subFrameRate)
+	tickRate := positiveAttr("tickRate", p.params.tickRate)
+
+	p.params.frameRate = positiveOrDefault(frameRate, defaultTTMLFrameRate)
+	p.params.subFrameRate = positiveOrDefault(subFrameRate, defaultTTMLSubFrameRate)
+	p.params.tickRate = positiveOrDefault(tickRate, defaultTTMLTickRate)
+}
+
+// parseTTMLDurationExpression parses a "dur" value with
+// parseTTMLTimeExpression and returns the resulting value; the time kind is
+// discarded. The boolean reports whether parsing succeeded.
+func parseTTMLDurationExpression(expr string, params ttmlTimingParams) (int64, bool) {
+	value, _, ok := parseTTMLTimeExpression(expr, params)
+	return value, ok
+}
+
+// resolveTTMLTime converts a parsed time value into a final timestamp.
+//
+// Absolute values are returned unchanged and offsets are added to base. For
+// ambiguous values (bare numbers, which may be either) both readings are
+// computed and one is picked heuristically from the parent's timing window;
+// when no rule applies, the offset reading is used. Unknown kinds are treated
+// as offsets.
+func resolveTTMLTime(value int64, kind ttmlTimeKind, base int64, parent ttmlTimingContext) int64 {
+	switch kind {
+	case ttmlTimeAbsolute:
+		return value
+	case ttmlTimeOffset:
+		return base + value
+	case ttmlTimeAmbiguous:
+		absolute := value
+		offset := base + value
+
+		// No parent timing context → no reference frame for offsets.
+		// Prefer absolute when offset differs (i.e., base > 0).
+		if !parent.hasBegin && !parent.hasEnd && base != 0 {
+			return absolute
+		}
+
+		// With a complete parent window, take whichever reading is the only
+		// one that falls inside it.
+		if parent.hasBegin && parent.hasEnd {
+			absoluteInParent := absolute >= parent.begin && absolute <= parent.end
+			offsetInParent := offset >= parent.begin && offset <= parent.end
+			if absoluteInParent && !offsetInParent {
+				return absolute
+			}
+			if offsetInParent && !absoluteInParent {
+				return offset
+			}
+		}
+
+		// Otherwise use the parent's begin alone: a reading that would start
+		// before the parent is rejected in favor of the one that does not.
+		if parent.hasBegin {
+			if absolute < parent.begin && offset >= parent.begin {
+				return offset
+			}
+			if absolute >= parent.begin && offset > absolute {
+				return absolute
+			}
+		}
+		return offset
+	default:
+		return base + value
+	}
+}
+
+// parseTTMLTimeExpression parses a TTML time expression into milliseconds.
+//
+// Supported forms:
+//   - bare non-negative numbers ("0.170"), read as seconds and reported as
+//     ttmlTimeAmbiguous;
+//   - offset times matched by offsetTimeRegex with unit h, m, s, ms, f or t,
+//     reported as ttmlTimeOffset;
+//   - clock times with one or two colons, and frame-based clock times with
+//     three colons, reported as ttmlTimeAbsolute.
+//
+// wallclock() and event-based (".begin"/".end") expressions are not supported.
+// The boolean is false when the expression is empty, unsupported or malformed;
+// the other results are then meaningless.
+func parseTTMLTimeExpression(expr string, params ttmlTimingParams) (int64, ttmlTimeKind, bool) {
+	expr = strings.TrimSpace(expr)
+	if expr == "" {
+		return 0, ttmlTimeOffset, false
+	}
+
+	lower := strings.ToLower(expr)
+	if strings.Contains(lower, "wallclock(") ||
+		strings.Contains(lower, ".begin") ||
+		strings.Contains(lower, ".end") {
+		log.Warn("Unsupported TTML time expression", "value", expr)
+		return 0, ttmlTimeOffset, false
+	}
+
+	// Best-effort support for non-standard TTML seen in the wild where a
+	// bare decimal value is used (implicitly seconds), e.g. "0.170".
+	//
+	// ParseFloat also accepts "inf"/"infinity" and arbitrarily large values;
+	// the upper bound keeps the int64 conversion well-defined. NaN fails the
+	// >= 0 test. Rejected values fall through and end up reported as
+	// unsupported below.
+	if value, err := strconv.ParseFloat(lower, 64); err == nil && value >= 0 && value*1000 < math.MaxInt64 {
+		return int64(math.Round(value * 1000)), ttmlTimeAmbiguous, true
+	}
+
+	if matches := offsetTimeRegex.FindStringSubmatch(lower); len(matches) == 3 {
+		value, err := strconv.ParseFloat(matches[1], 64)
+		if err != nil {
+			return 0, ttmlTimeOffset, false
+		}
+
+		unit := matches[2]
+		seconds := 0.0
+		switch unit {
+		case "h":
+			seconds = value * 60 * 60
+		case "m":
+			seconds = value * 60
+		case "s":
+			seconds = value
+		case "ms":
+			seconds = value / 1000
+		case "f":
+			seconds = value / params.frameRate
+		case "t":
+			seconds = value / params.tickRate
+		default:
+			return 0, ttmlTimeOffset, false
+		}
+
+		return int64(math.Round(seconds * 1000)), ttmlTimeOffset, true
+	}
+
+	// The number of colons tells plain clock times from frame-based ones.
+	colonCount := strings.Count(expr, ":")
+	switch colonCount {
+	case 1, 2:
+		clockMs, ok := parseTTMLClockTime(expr)
+		if !ok {
+			return 0, ttmlTimeAbsolute, false
+		}
+		return clockMs, ttmlTimeAbsolute, true
+	case 3:
+		framesMs, ok := parseTTMLFrameTime(expr, params)
+		if !ok {
+			return 0, ttmlTimeAbsolute, false
+		}
+		return framesMs, ttmlTimeAbsolute, true
+	default:
+		log.Warn("Unsupported TTML time expression", "value", expr)
+		return 0, ttmlTimeOffset, false
+	}
+}
+
+// parseTTMLClockTime parses a clock time of the form "mm:ss[.fraction]" or
+// "hh:mm:ss[.fraction]" and returns it in milliseconds, rounded to the
+// nearest millisecond.
+//
+// The boolean is false when the value does not have two or three
+// colon-separated parts, when a part is not a number, or when a part is
+// negative, NaN or infinite. Minutes and seconds are not required to be below
+// 60.
+func parseTTMLClockTime(value string) (int64, bool) {
+	parts := strings.Split(value, ":")
+	if len(parts) != 2 && len(parts) != 3 {
+		return 0, false
+	}
+
+	var hours int64
+	if len(parts) == 3 {
+		h, err := strconv.ParseInt(parts[0], 10, 64)
+		if err != nil || h < 0 {
+			return 0, false
+		}
+		hours = h
+		parts = parts[1:]
+	}
+
+	minutes, err := strconv.ParseInt(parts[0], 10, 64)
+	if err != nil || minutes < 0 {
+		return 0, false
+	}
+
+	// ParseFloat accepts signs, "NaN" and "Inf"; none of them is a valid
+	// seconds field.
+	seconds, err := strconv.ParseFloat(parts[1], 64)
+	if err != nil || math.IsNaN(seconds) || math.IsInf(seconds, 0) || seconds < 0 {
+		return 0, false
+	}
+
+	// Sum in floating point so absurdly large fields cannot wrap around, and
+	// refuse results that do not fit in an int64.
+	totalMs := (float64(hours)*60*60 + float64(minutes)*60 + seconds) * 1000
+	if totalMs >= math.MaxInt64 {
+		return 0, false
+	}
+	return int64(math.Round(totalMs)), true
+}
+
+// parseTTMLFrameTime parses a frame-based clock time of the form
+// "hh:mm:ss:ff[.sub]" and returns it in milliseconds, rounded to the nearest
+// millisecond. Frames are converted with params.frameRate and sub-frames with
+// params.subFrameRate * params.frameRate.
+//
+// The boolean is false when the value does not have four colon-separated
+// parts, when a part is not a number or is negative, or when the result is
+// not a finite, representable timestamp (for example because a rate is zero).
+func parseTTMLFrameTime(value string, params ttmlTimingParams) (int64, bool) {
+	parts := strings.Split(value, ":")
+	if len(parts) != 4 {
+		return 0, false
+	}
+
+	hours, err := strconv.ParseInt(parts[0], 10, 64)
+	if err != nil || hours < 0 {
+		return 0, false
+	}
+
+	minutes, err := strconv.ParseInt(parts[1], 10, 64)
+	if err != nil || minutes < 0 {
+		return 0, false
+	}
+
+	seconds, err := strconv.ParseInt(parts[2], 10, 64)
+	if err != nil || seconds < 0 {
+		return 0, false
+	}
+
+	// The last field is "frames" or "frames.subframes".
+	frameParts := strings.SplitN(parts[3], ".", 2)
+	frames, err := strconv.ParseFloat(frameParts[0], 64)
+	if err != nil || frames < 0 {
+		return 0, false
+	}
+
+	subFrames := 0.0
+	if len(frameParts) == 2 {
+		subFrames, err = strconv.ParseFloat(frameParts[1], 64)
+		if err != nil || subFrames < 0 {
+			return 0, false
+		}
+	}
+
+	totalSeconds := float64(hours*60*60 + minutes*60 + seconds)
+	totalSeconds += frames / params.frameRate
+	totalSeconds += subFrames / (params.subFrameRate * params.frameRate)
+
+	// NaN or Inf can still arise from "NaN"/"Inf" fields or a zero rate;
+	// converting such values to int64 is not well-defined, so reject them.
+	totalMs := totalSeconds * 1000
+	if math.IsNaN(totalMs) || totalMs < 0 || totalMs >= math.MaxInt64 {
+		return 0, false
+	}
+	return int64(math.Round(totalMs)), true
+}
+
+// attrValue looks up the first attribute whose local name equals key, ignoring
+// case and namespace, and returns its value with surrounding whitespace
+// removed. The boolean reports whether such an attribute exists.
+func attrValue(attrs []xml.Attr, key string) (string, bool) {
+	for idx := range attrs {
+		if candidate := &attrs[idx]; strings.EqualFold(candidate.Name.Local, key) {
+			return strings.TrimSpace(candidate.Value), true
+		}
+	}
+	return "", false
+}
+
+// attrOrEmpty returns the value attrValue finds for key, or "" when the
+// attribute is absent. It cannot tell a missing attribute from an empty one.
+func attrOrEmpty(attrs []xml.Attr, key string) string {
+	value, _ := attrValue(attrs, key)
+	return value
+}
+
+// collectElementText consumes tokens up to the end tag of start and returns
+// all character data found inside it, including the text of nested elements,
+// concatenated in document order. Other token kinds are ignored.
+//
+// Any decoder error, including reaching the end of input before the end tag,
+// is returned and the text gathered so far is discarded.
+func (p *ttmlParser) collectElementText(start xml.StartElement) (string, error) {
+	var collected strings.Builder
+
+	for {
+		tok, err := p.decoder.Token()
+		if err != nil {
+			return "", err
+		}
+
+		if chars, isText := tok.(xml.CharData); isText {
+			collected.Write(chars)
+			continue
+		}
+		if child, isStart := tok.(xml.StartElement); isStart {
+			// The recursive call consumes the child through its end tag.
+			inner, err := p.collectElementText(child)
+			if err != nil {
+				return "", err
+			}
+			collected.WriteString(inner)
+			continue
+		}
+		if closing, isEnd := tok.(xml.EndElement); isEnd && strings.EqualFold(closing.Name.Local, start.Name.Local) {
+			return collected.String(), nil
+		}
+	}
+}
+
+// skipElement discards tokens until the end tag matching an already-consumed
+// start tag has been read, stepping over nested elements. The start element
+// argument itself is not inspected. Decoder errors are returned as is.
+func (p *ttmlParser) skipElement(_ xml.StartElement) error {
+	// open counts the elements entered but not yet closed; the caller has
+	// already consumed one start tag.
+	for open := 1; open > 0; {
+		tok, err := p.decoder.Token()
+		if err != nil {
+			return err
+		}
+
+		if _, isStart := tok.(xml.StartElement); isStart {
+			open++
+		} else if _, isEnd := tok.(xml.EndElement); isEnd {
+			open--
+		}
+	}
+	return nil
+}
+
+// normalizeTTMLLang returns lang trimmed and lower-cased, or "xxx" when lang
+// is empty or consists only of whitespace.
+func normalizeTTMLLang(lang string) string {
+	if cleaned := strings.TrimSpace(lang); cleaned != "" {
+		return strings.ToLower(cleaned)
+	}
+	return "xxx"
+}
+
+// sanitizeTTMLText sanitizes raw with str.SanitizeText, converts "\r\n" and
+// "\r" to "\n", trims whitespace around every line, and finally trims the
+// whole result, which also removes blank lines at the start and end.
+func sanitizeTTMLText(raw string) string {
+	normalized := strings.ReplaceAll(strings.ReplaceAll(str.SanitizeText(raw), "\r\n", "\n"), "\r", "\n")
+
+	var out strings.Builder
+	for idx, row := range strings.Split(normalized, "\n") {
+		if idx > 0 {
+			out.WriteByte('\n')
+		}
+		out.WriteString(strings.TrimSpace(row))
+	}
+	return strings.TrimSpace(out.String())
+}
+
+// linesAreSynced reports whether any line, or any cue of any line, has a start
+// time.
+func linesAreSynced(lines []model.Line) bool {
+	for i := range lines {
+		line := &lines[i]
+		timed := line.Start != nil
+		for j := 0; !timed && j < len(line.Cue); j++ {
+			timed = line.Cue[j].Start != nil
+		}
+		if timed {
+			return true
+		}
+	}
+	return false
+}
+
+// hydrateLineTimingFromTokens returns line after model.NormalizeLineTiming has
+// been applied to it; it adds no logic of its own.
+func hydrateLineTimingFromTokens(line model.Line) model.Line {
+	return model.NormalizeLineTiming(line)
+}
+
+// positiveOrDefault returns v when it is a finite, strictly positive number
+// and fallback otherwise.
+//
+// NaN and +Inf are rejected as well as zero and negative values: the result
+// is used as a divisor when converting frames and ticks to seconds, where NaN
+// would poison the timestamp and +Inf would collapse it to zero.
+func positiveOrDefault(v float64, fallback float64) float64 {
+	// Written as a positive test because every comparison with NaN is false.
+	if v > 0 && !math.IsInf(v, 1) {
+		return v
+	}
+	return fallback
+}
diff --git a/core/lyrics/ttml_test.go b/core/lyrics/ttml_test.go
new file mode 100644
index 000000000..14676975d
--- /dev/null
+++ b/core/lyrics/ttml_test.go
@@ -0,0 +1,407 @@
+package lyrics
+
+import (
+	"github.com/navidrome/navidrome/model"
+	"github.com/navidrome/navidrome/utils/gg"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("parseTTML", func() {
+	// Covers per-language tracks plus the offset, frame and tick time forms.
+	Describe("Multi-language and timing", func() {
+		// Expected values with frameRate=30, subFrameRate=2, tickRate=10:
+		//   "2s" inside the div that begins at 1s -> 1000 + 2000 = 3000
+		//   "00:00:04:15.1" (absolute)            -> 4s + 15/30s + 1/(2*30)s = 4517
+		//   "45t"                                 -> 45/10s = 4500
+		It("should parse multiple language divs with inherited offsets and frame/tick timing", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttp="http://www.w3.org/ns/ttml#parameter" ttp:frameRate="30" ttp:subFrameRate="2" ttp:tickRate="10">
+  <body>
+    <div xml:lang="eng" begin="1s">
+      <p begin="2s">Line one</p>
+      <p begin="00:00:04:15.1"><span>Line two</span><br/>with break</p>
+    </div>
+    <div xml:lang="por">
+      <p begin="45t">Linha</p>
+    </div>
+  </body>
+</tt>`)
+
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(2))
+
+			By("parsing the English track")
+			eng := list[0]
+			Expect(eng.Lang).To(Equal("eng"))
+			Expect(eng.Synced).To(BeTrue())
+			Expect(eng.Line[0].Start).To(Equal(gg.P(int64(3000))))
+			Expect(eng.Line[0].Value).To(Equal("Line one"))
+			Expect(eng.Line[1].Start).To(Equal(gg.P(int64(4517))))
+			Expect(eng.Line[1].Value).To(Equal("Line two\nwith break"))
+
+			By("parsing the Portuguese track")
+			por := list[1]
+			Expect(por.Lang).To(Equal("por"))
+			Expect(por.Line[0].Start).To(Equal(gg.P(int64(4500))))
+			Expect(por.Line[0].Value).To(Equal("Linha"))
+		})
+	})
+
+	// An element with an unsupported time expression is dropped without
+	// failing the whole document.
+	Describe("Unsupported cue handling", func() {
+		It("should skip wallclock cues and keep valid ones", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml">
+  <body xml:lang="eng">
+    <div>
+      <p begin="wallclock(2026-01-01T00:00:00Z)">Skip me</p>
+      <p begin="1s">Keep me</p>
+    </div>
+  </body>
+</tt>`)
+
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(1))
+			Expect(list[0].Line).To(HaveLen(1))
+			Expect(list[0].Line[0].Start).To(Equal(gg.P(int64(1000))))
+			Expect(list[0].Line[0].Value).To(Equal("Keep me"))
+		})
+	})
+
+	// Offset times accumulate down the element tree.
+	Describe("Begin/End/Dur with inheritance", func() {
+		// Expected starts: body 10s + div 5s + p 1s = 16000, and
+		// body 10s + div 5s + p 3s = 18000.
+		It("should correctly accumulate nested timing from body, div, and p elements", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml">
+  <body xml:lang="eng" begin="10s">
+    <div begin="5s" dur="8s">
+      <p begin="1s" dur="2s">First line</p>
+      <p begin="3s" end="5s">Second line</p>
+    </div>
+  </body>
+</tt>`)
+
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(1))
+			Expect(list[0].Lang).To(Equal("eng"))
+			Expect(list[0].Line).To(HaveLen(2))
+			Expect(list[0].Line[0].Start).To(Equal(gg.P(int64(16000))))
+			Expect(list[0].Line[0].Value).To(Equal("First line"))
+			Expect(list[0].Line[1].Start).To(Equal(gg.P(int64(18000))))
+			Expect(list[0].Line[1].Value).To(Equal("Second line"))
+		})
+	})
+
+	// Bare numbers carry no unit; they are read as seconds.
+	Describe("Non-standard bare second offsets", func() {
+		// The parent only has a begin (10s) and the bare values lie before it,
+		// so they resolve as offsets: 10000 + 170 and 10000 + 3710.
+		It("should parse bare decimal numbers as seconds", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml">
+  <body xml:lang="eng" begin="10">
+    <div>
+      <p begin="0.170">First line</p>
+      <p begin="3.710">Second line</p>
+    </div>
+  </body>
+</tt>`)
+
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(1))
+			Expect(list[0].Line).To(HaveLen(2))
+			Expect(list[0].Line[0].Start).To(Equal(gg.P(int64(10170))))
+			Expect(list[0].Line[0].Value).To(Equal("First line"))
+			Expect(list[0].Line[1].Start).To(Equal(gg.P(int64(13710))))
+			Expect(list[0].Line[1].Value).To(Equal("Second line"))
+		})
+	})
+
+	// Timed <span> elements become cues; these specs also pin agent resolution.
+	Describe("Word timing tokens", func() {
+		// ByteStart/ByteEnd index into line.Value and ByteEnd is inclusive:
+		// "He" covers bytes 0-1, "llo" 2-4 and, after the newline at byte 5,
+		// "echo" covers 6-9.
+		It("should extract timed tokens from spans including background role", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttm="http://www.w3.org/ns/ttml#metadata">
+  <body xml:lang="eng">
+    <div>
+      <p begin="00:01.000" end="00:03.000">
+        <span begin="00:01.000" end="00:01.400">He</span><span begin="00:01.400" end="00:01.800">llo</span>
+        <span ttm:role="x-bg"><span begin="00:02.000" end="00:02.500">echo</span></span>
+      </p>
+    </div>
+  </body>
+</tt>`)
+
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(1))
+			Expect(list[0].Agents).To(Equal([]model.Agent{
+				{ID: "main", Role: "main"},
+				{ID: "__nd_bg__|main", Role: "bg"},
+			}))
+			Expect(list[0].Line).To(HaveLen(1))
+
+			line := list[0].Line[0]
+			Expect(line.Start).To(Equal(gg.P(int64(1000))))
+			Expect(line.Value).To(Equal("Hello\necho"))
+			Expect(line.End).To(Equal(gg.P(int64(3000))))
+			Expect(line.Cue).To(HaveLen(3))
+
+			Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(1000)), End: gg.P(int64(1400)), Value: "He", ByteStart: 0, ByteEnd: 1, AgentID: "main"}))
+			Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(1400)), End: gg.P(int64(1800)), Value: "llo", ByteStart: 2, ByteEnd: 4, AgentID: "main"}))
+			Expect(line.Cue[2]).To(Equal(model.Cue{Start: gg.P(int64(2000)), End: gg.P(int64(2500)), Value: "echo", ByteStart: 6, ByteEnd: 9, AgentID: "__nd_bg__|main"}))
+		})
+
+		// The first non-background, non-group agent in use is reported as main.
+		It("should parse named TTML agents into main, voice, and group roles", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttm="http://www.w3.org/ns/ttml#metadata">
+  <head>
+    <metadata>
+      <ttm:agent xml:id="v1" type="person"><ttm:name>Chris Martin</ttm:name></ttm:agent>
+      <ttm:agent xml:id="v2" type="person"><ttm:name>Jin</ttm:name></ttm:agent>
+      <ttm:agent xml:id="v1000" type="group"><ttm:name>All</ttm:name></ttm:agent>
+    </metadata>
+  </head>
+  <body xml:lang="eng">
+    <div>
+      <p begin="1s" end="2s" ttm:agent="v1"><span begin="1s" end="1.5s">You</span></p>
+      <p begin="2s" end="3s" ttm:agent="v2"><span begin="2s" end="2.5s">and</span></p>
+      <p begin="3s" end="4s" ttm:agent="v1000"><span begin="3s" end="3.5s">All</span></p>
+    </div>
+  </body>
+</tt>`)
+
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(1))
+			Expect(list[0].Agents).To(Equal([]model.Agent{
+				{ID: "v1", Role: "main", Name: "Chris Martin"},
+				{ID: "v2", Role: "voice", Name: "Jin"},
+				{ID: "v1000", Role: "group", Name: "All"},
+			}))
+			Expect(list[0].Line[0].Cue[0].AgentID).To(Equal("v1"))
+			Expect(list[0].Line[1].Cue[0].AgentID).To(Equal("v2"))
+			Expect(list[0].Line[2].Cue[0].AgentID).To(Equal("v1000"))
+		})
+
+		// Derived background ids are built by prefixing the base id, so an
+		// explicit agent id such as "lead__bg" cannot clash with them.
+		It("should avoid collisions between derived background agents and explicit TTML agent ids", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttm="http://www.w3.org/ns/ttml#metadata">
+  <head>
+    <metadata>
+      <ttm:agent xml:id="lead" type="person"><ttm:name>Lead</ttm:name></ttm:agent>
+      <ttm:agent xml:id="lead__bg" type="person"><ttm:name>Existing Background Id</ttm:name></ttm:agent>
+    </metadata>
+  </head>
+  <body xml:lang="eng">
+    <div>
+      <p begin="1s" end="2s" ttm:agent="lead">
+        <span begin="1s" end="1.4s">Lead</span>
+        <span ttm:role="x-bg"><span begin="1.5s" end="1.8s">Echo</span></span>
+      </p>
+      <p begin="2s" end="3s" ttm:agent="lead__bg">
+        <span begin="2s" end="2.5s">Named</span>
+      </p>
+    </div>
+  </body>
+</tt>`)
+
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(1))
+			Expect(list[0].Agents).To(Equal([]model.Agent{
+				{ID: "lead", Role: "main", Name: "Lead"},
+				{ID: "__nd_bg__|lead", Role: "bg", Name: "Lead"},
+				{ID: "lead__bg", Role: "voice", Name: "Existing Background Id"},
+			}))
+			Expect(list[0].Line).To(HaveLen(2))
+			Expect(list[0].Line[0].Cue).To(HaveLen(2))
+			Expect(list[0].Line[0].Cue[0].AgentID).To(Equal("lead"))
+			Expect(list[0].Line[0].Cue[1].AgentID).To(Equal("__nd_bg__|lead"))
+			Expect(list[0].Line[1].Cue).To(HaveLen(1))
+			Expect(list[0].Line[1].Cue[0].AgentID).To(Equal("lead__bg"))
+		})
+
+		// "guest" is the only agent named by a cue, so it becomes the main
+		// agent and is also assigned to the cue that names no agent.
+		It("should fill missing cue agent ids with the resolved main agent", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttm="http://www.w3.org/ns/ttml#metadata">
+  <head>
+    <metadata>
+      <ttm:agent xml:id="guest" type="person"><ttm:name>Guest Vocal</ttm:name></ttm:agent>
+    </metadata>
+  </head>
+  <body xml:lang="eng">
+    <div>
+      <p begin="1s" end="3s">
+        <span begin="1s" end="1.4s">Lead</span>
+        <span begin="2s" end="2.4s" ttm:agent="guest">Guest</span>
+      </p>
+    </div>
+  </body>
+</tt>`)
+
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(1))
+			Expect(list[0].Agents).To(Equal([]model.Agent{
+				{ID: "guest", Role: "main", Name: "Guest Vocal"},
+			}))
+			Expect(list[0].Line).To(HaveLen(1))
+			Expect(list[0].Line[0].Cue).To(HaveLen(2))
+			Expect(list[0].Line[0].Cue[0].AgentID).To(Equal("guest"))
+			Expect(list[0].Line[0].Cue[1].AgentID).To(Equal("guest"))
+		})
+	})
+
+	// Bare numbers may be absolute times or offsets; the parent window decides.
+	Describe("Ambiguous decimal timing", func() {
+		// Read as absolute, 43.444 lies inside the div window 37.870-45.570;
+		// read as an offset from the div it would not, so absolute wins.
+		It("should prefer absolute timing when values fall inside parent window", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml">
+  <body xml:lang="eng">
+    <div begin="37.870" end="45.570">
+      <p begin="43.444" end="45.570">
+        <span begin="43.444" end="43.716">go</span>
+        <span begin="43.716" end="43.887">go</span>
+      </p>
+    </div>
+  </body>
+</tt>`)
+
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(1))
+			Expect(list[0].Line).To(HaveLen(1))
+
+			line := list[0].Line[0]
+			Expect(line.Start).To(Equal(gg.P(int64(43444))))
+			Expect(line.Value).To(Equal("go\ngo"))
+			Expect(line.End).To(Equal(gg.P(int64(45570))))
+			Expect(line.Cue).To(HaveLen(2))
+			Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(43444)), End: gg.P(int64(43716)), Value: "go", ByteStart: 0, ByteEnd: 1}))
+			Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(43716)), End: gg.P(int64(43887)), Value: "go", ByteStart: 3, ByteEnd: 4}))
+		})
+	})
+
+	// Without any timing the track is unsynced; without xml:lang the language
+	// is reported as "xxx".
+	Describe("Unsynced fallback", func() {
+		It("should return unsynced lyrics when no timing is present", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml">
+  <body>
+    <div>
+      <p>No timing here</p>
+    </div>
+  </body>
+</tt>`)
+
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(1))
+			Expect(list[0].Lang).To(Equal("xxx"))
+			Expect(list[0].Synced).To(BeFalse())
+			Expect(list[0].Line).To(HaveLen(1))
+			Expect(list[0].Line[0].Start).To(BeNil())
+			Expect(list[0].Line[0].Value).To(Equal("No timing here"))
+		})
+	})
+
+	// Translations and transliterations in iTunesMetadata refer to main lines
+	// through the "for" attribute, matched against itunes:key.
+	Describe("Metadata tracks", func() {
+		// The entry for="MISSING" matches no main line and is dropped. The
+		// translation has no timing of its own and takes it from line L1.
+		It("should produce main, translation, and pronunciation tracks from iTunesMetadata", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:itunes="http://music.apple.com/lyric-ttml-internal">
+  <head>
+    <metadata>
+      <iTunesMetadata xmlns="http://music.apple.com/lyric-ttml-internal">
+        <translations>
+          <translation xml:lang="es">
+            <text for="L1">Hola</text>
+            <text for="MISSING">Skip me</text>
+          </translation>
+        </translations>
+        <transliterations>
+          <transliteration xml:lang="ja-Latn">
+            <text for="L2"><span begin="00:02.000" end="00:02.300" xmlns="http://www.w3.org/ns/ttml">ko</span><span begin="00:02.300" end="00:02.600" xmlns="http://www.w3.org/ns/ttml">nni</span></text>
+          </transliteration>
+        </transliterations>
+      </iTunesMetadata>
+    </metadata>
+  </head>
+  <body xml:lang="ja">
+    <div>
+      <p begin="00:01.000" end="00:01.500" itunes:key="L1">こんにちは</p>
+      <p begin="00:02.000" end="00:02.700" itunes:key="L2">こんばんは</p>
+    </div>
+  </body>
+</tt>`)
+
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(3))
+
+			By("checking the main track")
+			main := list[0]
+			Expect(main.Kind).To(Equal("main"))
+			Expect(main.Lang).To(Equal("ja"))
+			Expect(main.Line).To(HaveLen(2))
+
+			By("checking the translation track")
+			translation := list[1]
+			Expect(translation.Kind).To(Equal("translation"))
+			Expect(translation.Lang).To(Equal("es"))
+			Expect(translation.Line).To(HaveLen(1))
+			Expect(translation.Line[0].Start).To(Equal(gg.P(int64(1000))))
+			Expect(translation.Line[0].Value).To(Equal("Hola"))
+			Expect(translation.Line[0].End).To(Equal(gg.P(int64(1500))))
+
+			By("checking the pronunciation track")
+			pronunciation := list[2]
+			Expect(pronunciation.Kind).To(Equal("pronunciation"))
+			Expect(pronunciation.Lang).To(Equal("ja-latn"))
+			Expect(pronunciation.Line).To(HaveLen(1))
+			Expect(pronunciation.Line[0].Start).To(Equal(gg.P(int64(2000))))
+			Expect(pronunciation.Line[0].Value).To(Equal("konni"))
+			Expect(pronunciation.Line[0].End).To(Equal(gg.P(int64(2600))))
+			Expect(pronunciation.Line[0].Cue).To(HaveLen(2))
+			Expect(pronunciation.Line[0].Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(2000)), End: gg.P(int64(2300)), Value: "ko", ByteStart: 0, ByteEnd: 1}))
+			Expect(pronunciation.Line[0].Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(2300)), End: gg.P(int64(2600)), Value: "nni", ByteStart: 2, ByteEnd: 4}))
+		})
+	})
+
+	// Bare decimal times inside metadata spans are taken as given (2.747 ->
+	// 2747 ms).
+	Describe("Pronunciation with bare decimal end times", func() {
+		// The untimed spaces between the spans stay in the line text, which is
+		// why "woke" starts at byte 2 and "up" at byte 7.
+		It("should correctly parse bare decimal times in transliteration spans", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:itunes="http://music.apple.com/lyric-ttml-internal">
+  <head>
+    <metadata>
+      <iTunesMetadata xmlns="http://music.apple.com/lyric-ttml-internal">
+        <transliterations>
+          <transliteration xml:lang="ja-Latn">
+            <text for="L1"><span begin="2.747" end="3.018" xmlns="http://www.w3.org/ns/ttml">I</span> <span begin="3.018" end="3.179" xmlns="http://www.w3.org/ns/ttml">woke</span> <span begin="3.179" end="3.582" xmlns="http://www.w3.org/ns/ttml">up</span></text>
+          </transliteration>
+        </transliterations>
+      </iTunesMetadata>
+    </metadata>
+  </head>
+  <body xml:lang="ja">
+    <div>
+      <p begin="00:02.747" end="00:04.000" itunes:key="L1">起きた</p>
+    </div>
+  </body>
+</tt>`)
+
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+
+			var pronunciation *model.Lyrics
+			for i := range list {
+				if list[i].Kind == "pronunciation" {
+					pronunciation = &list[i]
+					break
+				}
+			}
+			Expect(pronunciation).ToNot(BeNil())
+			Expect(pronunciation.Line).To(HaveLen(1))
+
+			line := pronunciation.Line[0]
+			Expect(line.Start).To(Equal(gg.P(int64(2747))))
+			Expect(line.Value).To(Equal("I woke up"))
+			Expect(line.Cue).To(HaveLen(3))
+			Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(2747)), End: gg.P(int64(3018)), Value: "I", ByteStart: 0, ByteEnd: 0}))
+			Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(3018)), End: gg.P(int64(3179)), Value: "woke", ByteStart: 2, ByteEnd: 5}))
+			Expect(line.Cue[2]).To(Equal(model.Cue{Start: gg.P(int64(3179)), End: gg.P(int64(3582)), Value: "up", ByteStart: 7, ByteEnd: 8}))
+		})
+	})
+})
diff --git a/model/lyrics.go b/model/lyrics.go
index f75f3b11b..9a57ebaad 100644
--- a/model/lyrics.go
+++ b/model/lyrics.go
@@ -6,23 +6,43 @@ import (
 	"slices"
 	"strconv"
 	"strings"
+	"unicode"
 
 	"github.com/navidrome/navidrome/log"
 	"github.com/navidrome/navidrome/utils/str"
 )
 
+type Cue struct { // Cue is one timed fragment of a Line's text (for example a single word).
+	Start     *int64 `structs:"start,omitempty"   json:"start,omitempty"`   // start time in milliseconds
+	End       *int64 `structs:"end,omitempty"     json:"end,omitempty"`     // end time in milliseconds; nil when unknown
+	Value     string `structs:"value"             json:"value"`             // text of the fragment
+	ByteStart int    `structs:"byteStart"         json:"byteStart"`         // offset of the first byte within Line.Value
+	ByteEnd   int    `structs:"byteEnd"           json:"byteEnd"`           // offset of the last byte within Line.Value (inclusive)
+	AgentID   string `structs:"agentId,omitempty" json:"agentId,omitempty"` // Agent.ID this fragment is attributed to; may be empty
+}
+
+type Agent struct { // Agent identifies a party that cues are attributed to through Cue.AgentID.
+	ID   string `structs:"id"             json:"id"`             // key referenced by Cue.AgentID
+	Role string `structs:"role"           json:"role"`           // e.g. "main"; the Subsonic response lists cue lines of "main" agents first
+	Name string `structs:"name,omitempty" json:"name,omitempty"` // optional name
+}
+
 type Line struct {
 	Start *int64 `structs:"start,omitempty" json:"start,omitempty"`
+	End   *int64 `structs:"end,omitempty"   json:"end,omitempty"` // end time in milliseconds; nil when unknown
 	Value string `structs:"value"           json:"value"`
+	Cue   []Cue  `structs:"cue,omitempty"   json:"cue,omitempty"` // timed fragments of Value, if the source provided any
 }
 
 type Lyrics struct {
-	DisplayArtist string `structs:"displayArtist,omitempty" json:"displayArtist,omitempty"`
-	DisplayTitle  string `structs:"displayTitle,omitempty"  json:"displayTitle,omitempty"`
-	Lang          string `structs:"lang"                    json:"lang"`
-	Line          []Line `structs:"line"                    json:"line"`
-	Offset        *int64 `structs:"offset,omitempty"        json:"offset,omitempty"`
-	Synced        bool   `structs:"synced"                  json:"synced"`
+	DisplayArtist string  `structs:"displayArtist,omitempty" json:"displayArtist,omitempty"`
+	DisplayTitle  string  `structs:"displayTitle,omitempty"  json:"displayTitle,omitempty"`
+	Kind          string  `structs:"kind,omitempty"          json:"kind,omitempty"` // track kind, e.g. "translation" or "pronunciation"; empty is treated as "main"
+	Lang          string  `structs:"lang"                    json:"lang"`
+	Agents        []Agent `structs:"agents,omitempty"       json:"agents,omitempty"` // agents that the cues in Line refer to via Cue.AgentID
+	Line          []Line  `structs:"line"                    json:"line"`
+	Offset        *int64  `structs:"offset,omitempty"        json:"offset,omitempty"`
+	Synced        bool    `structs:"synced"                  json:"synced"`
 }
 
 // support the standard [mm:ss.mm], as well as [hh:*] and [*.mmm]
@@ -33,6 +53,10 @@ var (
 	syncRegex  = regexp.MustCompile(`(^|\n)\s*` + timeRegexString)
 	timeRegex  = regexp.MustCompile(timeRegexString)
 	lrcIdRegex = regexp.MustCompile(`\[(ar|ti|offset|lang):([^]]+)]`)
+
+	// Enhanced LRC: inline word-level timing markers like <00:12.34>
+	enhancedLRCTimeString = `<([0-9]{1,2}:)?([0-9]{1,2}):([0-9]{1,2})(.[0-9]{1,3})?>`
+	enhancedLRCRegex      = regexp.MustCompile(enhancedLRCTimeString)
 )
 
 func (l Lyrics) IsEmpty() bool {
@@ -106,9 +130,11 @@ func ToLyrics(language, text string) (*Lyrics, error) {
 
 			if validLine {
 				for idx := range timestamps {
+					value, cues := parseEnhancedLine(priorLine)
 					structuredLines = append(structuredLines, Line{
 						Start: &timestamps[idx],
-						Value: strings.TrimSpace(priorLine),
+						Value: value,
+						Cue:   cues,
 					})
 				}
 				timestamps = nil
@@ -154,9 +180,11 @@ func ToLyrics(language, text string) (*Lyrics, error) {
 
 	if validLine {
 		for idx := range timestamps {
+			value, cues := parseEnhancedLine(priorLine)
 			structuredLines = append(structuredLines, Line{
 				Start: &timestamps[idx],
-				Value: strings.TrimSpace(priorLine),
+				Value: value,
+				Cue:   cues,
 			})
 		}
 	}
@@ -173,13 +201,118 @@ func ToLyrics(language, text string) (*Lyrics, error) {
 		DisplayArtist: artist,
 		DisplayTitle:  title,
 		Lang:          language,
-		Line:          structuredLines,
+		Line:          NormalizeCueLines(structuredLines),
 		Offset:        offset,
 		Synced:        synced,
 	}
 	return &lyrics, nil
 }
 
+// parseEnhancedLine extracts word-level timing cues from Enhanced LRC inline markers
+// and computes UTF-8 byte offsets (inclusive) against the final stripped line value.
+// It returns the text with all markers removed and surrounding whitespace trimmed, plus
+// one cue per marker that is followed by text. Text that has no usable marker in front
+// of it (before the first marker, or after a marker whose time cannot be parsed) stays
+// in the returned value but gets no cue, so lyrics are never silently dropped.
+func parseEnhancedLine(text string) (string, []Cue) {
+	matches := enhancedLRCRegex.FindAllStringSubmatchIndex(text, -1)
+	if len(matches) == 0 {
+		return strings.TrimSpace(text), nil
+	}
+
+	type segment struct {
+		start    int64
+		rawStart int
+		rawEnd   int
+	}
+
+	segments := make([]segment, 0, len(matches))
+	var rawValue strings.Builder
+	// Keep untimed text that precedes the first marker as part of the line value.
+	rawValue.WriteString(text[:matches[0][0]])
+	for i, match := range matches {
+		// Text runs from after this marker to the start of the next marker (or end of string)
+		textEnd := len(text)
+		if i+1 < len(matches) {
+			textEnd = matches[i+1][0]
+		}
+		word := text[match[1]:textEnd]
+		if word == "" {
+			continue
+		}
+
+		// Record the text before parsing the time so a bad marker cannot drop it.
+		rawStart := rawValue.Len()
+		rawValue.WriteString(word)
+
+		timeMs, err := parseTime(
+			// Rewrite <...> as [...] so parseTime can handle it with the same logic
+			"["+text[match[0]+1:match[1]-1]+"]",
+			// Adjust match indices to point into our rewritten string (need start/end pairs for each group)
+			[]int{
+				0, match[1] - match[0],
+				adjustGroup(match, 2), adjustGroup(match, 3),
+				adjustGroup(match, 4), adjustGroup(match, 5),
+				adjustGroup(match, 6), adjustGroup(match, 7),
+				adjustGroup(match, 8), adjustGroup(match, 9),
+			},
+		)
+		if err != nil {
+			continue
+		}
+		segments = append(segments, segment{
+			start:    timeMs,
+			rawStart: rawStart,
+			rawEnd:   rawValue.Len(),
+		})
+	}
+
+	if len(segments) == 0 {
+		return strings.TrimSpace(stripEnhancedMarkers(text)), nil
+	}
+
+	// Cue offsets must refer to the trimmed value, so measure what trimming removes.
+	finalRaw := rawValue.String()
+	leftTrimBytes := len(finalRaw) - len(strings.TrimLeftFunc(finalRaw, unicode.IsSpace))
+	trimmedEnd := max(len(strings.TrimRightFunc(finalRaw, unicode.IsSpace)), leftTrimBytes)
+
+	cues := make([]Cue, 0, len(segments))
+	for _, seg := range segments {
+		start := seg.start
+		byteStart := max(seg.rawStart, leftTrimBytes)
+		byteEnd := min(seg.rawEnd, trimmedEnd)
+		if byteStart >= byteEnd {
+			continue // segment was whitespace that got trimmed away
+		}
+
+		cues = append(cues, Cue{
+			Start:     &start,
+			Value:     finalRaw[byteStart:byteEnd],
+			ByteStart: byteStart - leftTrimBytes,
+			ByteEnd:   byteEnd - leftTrimBytes - 1, // inclusive end offset
+		})
+	}
+
+	return strings.TrimSpace(finalRaw), cues
+}
+
+// adjustGroup translates one entry of a submatch index slice (as returned for a "<...>"
+// marker) into the matching index within the standalone "[...]" rewrite of that marker.
+// '[' sits at index 0 exactly where '<' sat at match[0], so subtracting match[0] is all
+// that is needed; -1 (group did not participate) is passed through untouched.
+func adjustGroup(match []int, groupIdx int) int {
+	if pos := match[groupIdx]; pos != -1 {
+		return pos - match[0]
+	}
+	return -1
+}
+
+// stripEnhancedMarkers returns text with every inline <mm:ss.mm> Enhanced LRC
+// timing marker deleted, leaving only the plain lyric text.
+func stripEnhancedMarkers(text string) string {
+	return enhancedLRCRegex.ReplaceAllLiteralString(text, "")
+}
+
 func parseTime(line string, match []int) (int64, error) {
 	var hours, millis int64
 	var err error
@@ -227,3 +360,115 @@ func parseTime(line string, match []int) (int64, error) {
 }
 
 type LyricList []Lyrics
+
+func NormalizeLyrics(lyrics Lyrics) Lyrics {
+	if len(lyrics.Agents) == 0 {
+		lyrics.Agents = nil // represent "no agents" uniformly as nil
+	}
+	lyrics.Line = NormalizeCueLines(lyrics.Line) // fill in cue/line timing where derivable
+	return lyrics
+}
+
+// NormalizeCueLines returns a new slice in which every line that has cues went through
+// cue end-time and line timing normalization. The bound handed down for a line's cues
+// is the line's own End or, when that is unset, the Start of the line that follows.
+func NormalizeCueLines(lines []Line) []Line {
+	if len(lines) == 0 {
+		return lines
+	}
+
+	out := make([]Line, 0, len(lines))
+	for idx, current := range lines {
+		var bound *int64
+		switch {
+		case current.End != nil:
+			bound = current.End
+		case idx+1 < len(lines):
+			bound = lines[idx+1].Start
+		}
+
+		out = append(out, normalizeCueLine(current, bound))
+	}
+
+	return out
+}
+
+// NormalizeLineTiming fills in a line's missing Start and End from its cues. Start is
+// taken from the earliest cue start; End from the latest cue end, where a cue without
+// an end counts as ending at its own start. Timing already set on the line is never
+// overwritten, and a line without cues is returned unchanged.
+func NormalizeLineTiming(line Line) Line {
+	if len(line.Cue) == 0 {
+		return line
+	}
+
+	var (
+		first, last       int64
+		hasFirst, hasLast bool
+	)
+	for _, cue := range line.Cue {
+		if cue.Start != nil && (!hasFirst || *cue.Start < first) {
+			first, hasFirst = *cue.Start, true
+		}
+
+		// A cue with no end still reaches at least its own start.
+		finish := cue.End
+		if finish == nil {
+			finish = cue.Start
+		}
+		if finish != nil && (!hasLast || *finish > last) {
+			last, hasLast = *finish, true
+		}
+	}
+
+	// Only fill gaps: timing already present on the line wins over derived values.
+	if line.Start == nil && hasFirst {
+		line.Start = &first
+	}
+	if line.End == nil && hasLast {
+		line.End = &last
+	}
+
+	return line
+}
+
+// normalizeCueLine fills missing cue ends on a private copy of line.Cue: a cue ends where
+// the next cue starts, else at fallbackEnd. If any end is still unknown, every cue end is
+// cleared (all-or-nothing) before the line's own Start/End are derived from the cues.
+func normalizeCueLine(line Line, fallbackEnd *int64) Line {
+	if len(line.Cue) == 0 {
+		return line
+	}
+
+	// line is a copy, but its Cue slice still shares the caller's backing array.
+	line.Cue = slices.Clone(line.Cue)
+	for i := range line.Cue {
+		if line.Cue[i].End != nil {
+			continue
+		}
+		if i+1 < len(line.Cue) && line.Cue[i+1].Start != nil {
+			v := *line.Cue[i+1].Start
+			line.Cue[i].End = &v
+		} else if fallbackEnd != nil {
+			v := *fallbackEnd
+			line.Cue[i].End = &v
+		}
+	}
+
+	for i := range line.Cue {
+		if line.Cue[i].End == nil {
+			line.Cue = clearCueEnds(line.Cue)
+			break
+		}
+	}
+	return NormalizeLineTiming(line)
+}
+
+// clearCueEnds returns a copy of cues in which every End is nil; cues itself is not modified.
+func clearCueEnds(cues []Cue) []Cue {
+	cleared := append(make([]Cue, 0, len(cues)), cues...)
+	for i := range cleared {
+		cleared[i].End = nil
+	}
+	return cleared
+}
diff --git a/model/lyrics_test.go b/model/lyrics_test.go
index 382976872..1fa82f258 100644
--- a/model/lyrics_test.go
+++ b/model/lyrics_test.go
@@ -116,4 +116,85 @@ var _ = Describe("ToLyrics", func() {
 			{Start: &e, Value: "Test"},
 		}))
 	})
+
+	It("should parse Enhanced LRC with word-level timing", func() {
+		lyrics, err := ToLyrics("xxx", "[00:01.00]<00:01.00>Some <00:01.50>lyrics <00:02.00>here\n[00:03.00]<00:03.00>More <00:03.50>words")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(lyrics.Synced).To(BeTrue())
+		Expect(lyrics.Line).To(HaveLen(2))
+
+		t1000, t1500, t2000, t3000, t3500 := int64(1000), int64(1500), int64(2000), int64(3000), int64(3500)
+
+		line0 := lyrics.Line[0]
+		Expect(line0.Start).To(Equal(&t1000))
+		Expect(line0.End).To(Equal(&t3000))
+		Expect(line0.Value).To(Equal("Some lyrics here"))
+		Expect(line0.Cue).To(Equal([]Cue{
+			{Start: &t1000, End: &t1500, Value: "Some ", ByteStart: 0, ByteEnd: 4},
+			{Start: &t1500, End: &t2000, Value: "lyrics ", ByteStart: 5, ByteEnd: 11},
+			{Start: &t2000, End: &t3000, Value: "here", ByteStart: 12, ByteEnd: 15},
+		}))
+
+		line1 := lyrics.Line[1]
+		Expect(line1.Start).To(Equal(&t3000))
+		Expect(line1.End).To(Equal(&t3500))
+		Expect(line1.Value).To(Equal("More words"))
+		Expect(line1.Cue).To(Equal([]Cue{
+			{Start: &t3000, Value: "More ", ByteStart: 0, ByteEnd: 4},
+			{Start: &t3500, Value: "words", ByteStart: 5, ByteEnd: 9},
+		}))
+
+		Expect(line1.Cue[1].End).To(BeNil())
+	})
+
+	It("should ignore Enhanced LRC markers and return plain lines when no markers present", func() {
+		a, b := int64(1000), int64(3000)
+		lyrics, err := ToLyrics("xxx", "[00:01.00]Plain line\n[00:03.00]Another plain line")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(lyrics.Line).To(Equal([]Line{
+			{Start: &a, Value: "Plain line"},
+			{Start: &b, Value: "Another plain line"},
+		}))
+	})
+
+	It("should handle mixed Enhanced and plain LRC lines", func() {
+		lyrics, err := ToLyrics("xxx", "[00:01.00]<00:01.00>Some <00:01.50>lyrics\n[00:03.00]Plain line\n[00:05.00]<00:05.00>More <00:05.50>words")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(lyrics.Line).To(HaveLen(3))
+
+		t1000, t1500, t5000, t5500 := int64(1000), int64(1500), int64(5000), int64(5500)
+		t3000 := int64(3000)
+
+		Expect(lyrics.Line[0].Cue).To(Equal([]Cue{
+			{Start: &t1000, End: &t1500, Value: "Some ", ByteStart: 0, ByteEnd: 4},
+			{Start: &t1500, End: &t3000, Value: "lyrics", ByteStart: 5, ByteEnd: 10},
+		}))
+		Expect(lyrics.Line[0].Value).To(Equal("Some lyrics"))
+		Expect(lyrics.Line[0].End).To(Equal(&t3000))
+
+		Expect(lyrics.Line[1].Cue).To(BeNil())
+		Expect(lyrics.Line[1].Value).To(Equal("Plain line"))
+
+		Expect(lyrics.Line[2].Cue).To(Equal([]Cue{
+			{Start: &t5000, Value: "More ", ByteStart: 0, ByteEnd: 4},
+			{Start: &t5500, Value: "words", ByteStart: 5, ByteEnd: 9},
+		}))
+		Expect(lyrics.Line[2].Value).To(Equal("More words"))
+	})
+
+	It("should preserve byte offsets for Enhanced LRC cues", func() {
+		lyrics, err := ToLyrics("xxx", "[00:00.00]<00:00.00>Oh <00:00.90>love<00:01.30> me <00:01.60>tonight")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(lyrics.Line).To(HaveLen(1))
+
+		t0, t900, t1300, t1600 := int64(0), int64(900), int64(1300), int64(1600)
+		line := lyrics.Line[0]
+		Expect(line.Value).To(Equal("Oh love me tonight"))
+		Expect(line.Cue).To(Equal([]Cue{
+			{Start: &t0, Value: "Oh ", ByteStart: 0, ByteEnd: 2},
+			{Start: &t900, Value: "love", ByteStart: 3, ByteEnd: 6},
+			{Start: &t1300, Value: " me ", ByteStart: 7, ByteEnd: 10},
+			{Start: &t1600, Value: "tonight", ByteStart: 11, ByteEnd: 17},
+		}))
+	})
 })
diff --git a/server/subsonic/helpers.go b/server/subsonic/helpers.go
index 74d57ade4..6a14aa4aa 100644
--- a/server/subsonic/helpers.go
+++ b/server/subsonic/helpers.go
@@ -493,14 +493,79 @@ func mapExplicitStatus(explicitStatus string) string {
 	return ""
 }
 
-func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics) responses.StructuredLyric {
+func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced bool) responses.StructuredLyric {
 	lines := make([]responses.Line, len(lyrics.Line))
+	var cueLines []responses.CueLine
+	agentOrderByID := make(map[string]int, len(lyrics.Agents))
+	agentRoleByID := make(map[string]string, len(lyrics.Agents))
+	responseAgents := make([]responses.Agent, 0, len(lyrics.Agents))
+
+	for i, agent := range lyrics.Agents {
+		agentOrderByID[agent.ID] = i
+		agentRoleByID[agent.ID] = agent.Role
+		responseAgents = append(responseAgents, responses.Agent{
+			ID:   agent.ID,
+			Role: agent.Role,
+			Name: agent.Name,
+		})
+	}
 
 	for i, line := range lyrics.Line {
 		lines[i] = responses.Line{
 			Start: line.Start,
 			Value: line.Value,
 		}
+		if !enhanced || len(line.Cue) == 0 {
+			continue
+		}
+
+		agentOrder := make([]string, 0, 2)
+		cuesByAgent := make(map[string][]model.Cue)
+		for _, cue := range line.Cue {
+			if cue.Start == nil {
+				continue
+			}
+			agentID := strings.TrimSpace(cue.AgentID)
+			if _, exists := cuesByAgent[agentID]; !exists {
+				agentOrder = append(agentOrder, agentID)
+			}
+			cuesByAgent[agentID] = append(cuesByAgent[agentID], cue)
+		}
+
+		// Sort agents: "main" role first, then lyrics.Agents order, then undeclared agents as first seen.
+		sort.SliceStable(agentOrder, func(i, j int) bool {
+			leftMain := agentRoleByID[agentOrder[i]] == "main"
+			rightMain := agentRoleByID[agentOrder[j]] == "main"
+			if leftMain != rightMain {
+				return leftMain
+			}
+
+			leftOrder, leftOK := agentOrderByID[agentOrder[i]]
+			rightOrder, rightOK := agentOrderByID[agentOrder[j]]
+			if leftOK != rightOK {
+				return leftOK
+			}
+			if leftOK && leftOrder != rightOrder {
+				return leftOrder < rightOrder
+			}
+			// Ties are "not less": SliceStable keeps first-seen order by itself, and slice
+			// positions are not a valid key because elements move while sorting.
+			return false
+		})
+
+		for _, agentID := range agentOrder {
+			cueLine := responses.CueLine{
+				Index: int32(i),
+				Start: line.Start,
+				End:   line.End,
+				Value: line.Value,
+				Cue:   buildLyricCues(cuesByAgent[agentID], line.End),
+			}
+			if agentID != "" {
+				cueLine.AgentID = agentID
+			}
+			cueLines = append(cueLines, cueLine)
+		}
 	}
 
 	structured := responses.StructuredLyric{
@@ -508,10 +573,22 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics) responses.St
 		DisplayTitle:  lyrics.DisplayTitle,
 		Lang:          lyrics.Lang,
 		Line:          lines,
+		CueLine:       cueLines,
 		Offset:        lyrics.Offset,
 		Synced:        lyrics.Synced,
 	}
 
+	if enhanced {
+		kind := strings.TrimSpace(lyrics.Kind)
+		if kind == "" {
+			kind = "main"
+		}
+		structured.Kind = kind
+		if len(cueLines) > 0 && len(responseAgents) > 0 {
+			structured.Agents = responseAgents
+		}
+	}
+
 	if structured.DisplayArtist == "" {
 		structured.DisplayArtist = mf.Artist
 	}
@@ -522,11 +599,86 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics) responses.St
 	return structured
 }
 
-func buildLyricsList(mf *model.MediaFile, lyricsList model.LyricList) *responses.LyricsList {
-	lyricList := make(responses.StructuredLyrics, len(lyricsList))
+func buildLyricCues(cues []model.Cue, lineEnd *int64) []responses.LyricCue {
+	if len(cues) == 0 {
+		return nil
+	}
 
-	for i, lyrics := range lyricsList {
-		lyricList[i] = buildStructuredLyric(mf, lyrics)
+	hasAnyEnd := false
+	for i := range cues {
+		if cues[i].End != nil {
+			hasAnyEnd = true
+			break
+		}
+	}
+	// When any cue has an end, a missing end borrows the next cue's start, else lineEnd; ends are clamped to [own start, next start].
+	normalized := make([]responses.LyricCue, 0, len(cues))
+	for i := range cues {
+		if cues[i].Start == nil {
+			continue
+		}
+
+		cue := responses.LyricCue{
+			Start:     *cues[i].Start,
+			Value:     cues[i].Value,
+			ByteStart: cues[i].ByteStart,
+			ByteEnd:   cues[i].ByteEnd,
+		}
+		if hasAnyEnd {
+			end := cues[i].End
+			if end == nil {
+				if i+1 < len(cues) && cues[i+1].Start != nil {
+					v := *cues[i+1].Start
+					end = &v
+				} else if lineEnd != nil {
+					v := *lineEnd
+					end = &v
+				}
+			}
+			if end != nil && i+1 < len(cues) && cues[i+1].Start != nil && *end > *cues[i+1].Start {
+				v := *cues[i+1].Start
+				end = &v
+			}
+			if end != nil && *end < cue.Start {
+				v := cue.Start
+				end = &v
+			}
+			cue.End = end
+		}
+		normalized = append(normalized, cue)
+	}
+	// All-or-nothing: if an end is still unknown for any cue, report no ends at all.
+	if hasAnyEnd {
+		for i := range normalized {
+			if normalized[i].End == nil {
+				for j := range normalized {
+					normalized[j].End = nil
+				}
+				break
+			}
+		}
+	}
+
+	return normalized
+}
+
+func buildLyricsList(mf *model.MediaFile, lyricsList model.LyricList, enhanced bool) *responses.LyricsList {
+	var filtered model.LyricList
+	if enhanced {
+		filtered = lyricsList
+	} else {
+		// Without enhanced, only return "main" kind entries
+		for _, l := range lyricsList {
+			kind := strings.TrimSpace(l.Kind)
+			if kind == "" || kind == "main" {
+				filtered = append(filtered, l)
+			}
+		}
+	}
+
+	lyricList := make(responses.StructuredLyrics, len(filtered))
+	for i, lyrics := range filtered {
+		lyricList[i] = buildStructuredLyric(mf, lyrics, enhanced)
 	}
 
 	res := &responses.LyricsList{
diff --git a/server/subsonic/media_retrieval.go b/server/subsonic/media_retrieval.go
index 3faae1650..c3c6d98ea 100644
--- a/server/subsonic/media_retrieval.go
+++ b/server/subsonic/media_retrieval.go
@@ -10,6 +10,7 @@ import (
 
 	"github.com/navidrome/navidrome/conf"
 	"github.com/navidrome/navidrome/consts"
+	lyricssvc "github.com/navidrome/navidrome/core/lyrics"
 	"github.com/navidrome/navidrome/log"
 	"github.com/navidrome/navidrome/model"
 	"github.com/navidrome/navidrome/resources"
@@ -19,6 +20,8 @@ import (
 	"github.com/navidrome/navidrome/utils/req"
 )
 
+const maxLegacyLyricsCandidates = 10 // max artist/title matches GetLyrics fetches as lyrics candidates (set as the query's Max)
+
 func (api *Router) GetAvatar(w http.ResponseWriter, r *http.Request) (*responses.Subsonic, error) {
 	if !conf.Server.EnableGravatar {
 		return api.getPlaceHolderAvatar(w, r)
@@ -98,7 +101,11 @@ func (api *Router) GetLyrics(r *http.Request) (*responses.Subsonic, error) {
 	response := newResponse()
 	lyricsResponse := responses.Lyrics{}
 	response.Lyrics = &lyricsResponse
-	mediaFiles, err := api.ds.MediaFile(r.Context()).GetAll(filter.SongsByArtistTitleWithLyricsFirst(artist, title))
+	opts := filter.SongsByArtistTitleWithLyricsFirst(artist, title)
+	// Search a bounded duplicate window so source-priority fallback can still
+	// reach older matches without turning legacy getLyrics into an unbounded scan.
+	opts.Max = maxLegacyLyricsCandidates
+	mediaFiles, err := api.ds.MediaFile(r.Context()).GetAll(opts)
 
 	if err != nil {
 		return nil, err
@@ -108,9 +115,22 @@ func (api *Router) GetLyrics(r *http.Request) (*responses.Subsonic, error) {
 		return response, nil
 	}
 
-	structuredLyrics, err := api.lyrics.GetLyrics(r.Context(), &mediaFiles[0])
-	if err != nil {
-		return nil, err
+	var structuredLyrics model.LyricList
+	if batchLyrics, ok := api.lyrics.(lyricssvc.BatchLyrics); ok {
+		structuredLyrics, err = batchLyrics.GetLyricsForMediaFiles(r.Context(), mediaFiles)
+		if err != nil {
+			return nil, err
+		}
+	} else {
+		for i := range mediaFiles {
+			structuredLyrics, err = api.lyrics.GetLyrics(r.Context(), &mediaFiles[i])
+			if err != nil {
+				return nil, err
+			}
+			if len(structuredLyrics) > 0 {
+				break
+			}
+		}
 	}
 
 	if len(structuredLyrics) == 0 {
@@ -124,7 +144,6 @@ func (api *Router) GetLyrics(r *http.Request) (*responses.Subsonic, error) {
 	for _, line := range structuredLyrics[0].Line {
 		lyricsText.WriteString(line.Value + "\n")
 	}
-
 	lyricsResponse.Value = lyricsText.String()
 
 	return response, nil
@@ -146,8 +165,10 @@ func (api *Router) GetLyricsBySongId(r *http.Request) (*responses.Subsonic, erro
 		return nil, err
 	}
 
+	enhanced, _ := req.Params(r).Bool("enhanced")
+
 	response := newResponse()
-	response.LyricsList = buildLyricsList(mediaFile, structuredLyrics)
+	response.LyricsList = buildLyricsList(mediaFile, structuredLyrics, enhanced)
 
 	return response, nil
 }
diff --git a/server/subsonic/media_retrieval_test.go b/server/subsonic/media_retrieval_test.go
index 589a609da..45a475da6 100644
--- a/server/subsonic/media_retrieval_test.go
+++ b/server/subsonic/media_retrieval_test.go
@@ -186,6 +186,41 @@ var _ = Describe("MediaRetrievalController", func() {
 			Expect(response.Lyrics.Title).To(Equal("Never Gonna Give You Up"))
 			Expect(response.Lyrics.Value).To(Equal("We're no strangers to love\nYou know the rules and so do I\n"))
 		})
+
+		It("should prefer higher-priority sidecar lyrics across duplicate candidates", func() {
+			conf.Server.LyricsPriority = ".ttml,embedded"
+			r := newGetRequest("artist=Rick+Astley", "title=Never+Gonna+Give+You+Up")
+			baseTime := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC)
+			embedded, err := model.ToLyrics("eng", "Newest duplicate embedded lyrics")
+			Expect(err).ToNot(HaveOccurred())
+			embeddedJSON, err := json.Marshal(model.LyricList{*embedded})
+			Expect(err).ToNot(HaveOccurred())
+			mockRepo.SetData(model.MediaFiles{
+				{
+					ID:        "1",
+					Path:      "tests/fixtures/01 Invisible (RED) Edit Version.mp3",
+					Artist:    "Rick Astley",
+					Title:     "Never Gonna Give You Up",
+					Lyrics:    string(embeddedJSON),
+					UpdatedAt: baseTime.Add(2 * time.Hour), // Newer duplicate with embedded lyrics only
+				},
+				{
+					ID:        "2",
+					Path:      "tests/fixtures/test.mp3",
+					Artist:    "Rick Astley",
+					Title:     "Never Gonna Give You Up",
+					Lyrics:    "[]",
+					UpdatedAt: baseTime.Add(1 * time.Hour), // Older, but has TTML sidecar
+				},
+			})
+
+			response, err := router.GetLyrics(r)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(response.Lyrics.Artist).To(Equal("Rick Astley"))
+			Expect(response.Lyrics.Title).To(Equal("Never Gonna Give You Up"))
+			Expect(response.Lyrics.Value).To(Equal("We're no strangers to love\nYou know the rules and so do I\n"))
+			Expect(mockRepo.Options.Max).To(Equal(maxLegacyLyricsCandidates))
+		})
 	})
 
 	Describe("GetLyricsBySongId", func() {
@@ -202,8 +237,10 @@ var _ = Describe("MediaRetrievalController", func() {
 
 				Expect(realLyric.DisplayArtist).To(Equal(expectedLyric.DisplayArtist))
 				Expect(realLyric.DisplayTitle).To(Equal(expectedLyric.DisplayTitle))
+				Expect(realLyric.Kind).To(Equal(expectedLyric.Kind))
 				Expect(realLyric.Lang).To(Equal(expectedLyric.Lang))
 				Expect(realLyric.Synced).To(Equal(expectedLyric.Synced))
+				Expect(realLyric.Agents).To(Equal(expectedLyric.Agents))
 
 				if expectedLyric.Offset == nil {
 					Expect(realLyric.Offset).To(BeNil())
@@ -222,6 +259,38 @@ var _ = Describe("MediaRetrievalController", func() {
 						Expect(*realLine.Start).To(Equal(*expectedLine.Start))
 					}
 				}
+
+				Expect(realLyric.CueLine).To(HaveLen(len(expectedLyric.CueLine)))
+				for j, realCueLine := range realLyric.CueLine {
+					expectedCueLine := expectedLyric.CueLine[j]
+					Expect(realCueLine.Index).To(Equal(expectedCueLine.Index))
+					Expect(realCueLine.Value).To(Equal(expectedCueLine.Value))
+					Expect(realCueLine.AgentID).To(Equal(expectedCueLine.AgentID))
+					if expectedCueLine.Start == nil {
+						Expect(realCueLine.Start).To(BeNil())
+					} else {
+						Expect(*realCueLine.Start).To(Equal(*expectedCueLine.Start))
+					}
+					if expectedCueLine.End == nil {
+						Expect(realCueLine.End).To(BeNil())
+					} else {
+						Expect(*realCueLine.End).To(Equal(*expectedCueLine.End))
+					}
+
+					Expect(realCueLine.Cue).To(HaveLen(len(expectedCueLine.Cue)))
+					for k, realCue := range realCueLine.Cue {
+						expectedCue := expectedCueLine.Cue[k]
+						Expect(realCue.Value).To(Equal(expectedCue.Value))
+						Expect(realCue.Start).To(Equal(expectedCue.Start))
+						Expect(realCue.ByteStart).To(Equal(expectedCue.ByteStart))
+						Expect(realCue.ByteEnd).To(Equal(expectedCue.ByteEnd))
+						if expectedCue.End == nil {
+							Expect(realCue.End).To(BeNil())
+						} else {
+							Expect(*realCue.End).To(Equal(*expectedCue.End))
+						}
+					}
+				}
 			}
 		}
 
@@ -323,6 +392,427 @@ var _ = Describe("MediaRetrievalController", func() {
 				},
 			})
 		})
+
+		It("should return multilingual TTML sidecar lyrics", func() {
+			conf.Server.LyricsPriority = ".ttml,embedded"
+			r := newGetRequest("id=1")
+
+			mockRepo.SetData(model.MediaFiles{
+				{
+					ID:     "1",
+					Path:   "tests/fixtures/test.mp3",
+					Artist: "Rick Astley",
+					Title:  "Never Gonna Give You Up",
+					Lyrics: "[]",
+				},
+			})
+
+			response, err := router.GetLyricsBySongId(r)
+			Expect(err).ToNot(HaveOccurred())
+
+			porTime := int64(18800)
+			ttmlTime := int64(22800)
+			compareResponses(response.LyricsList, responses.LyricsList{
+				StructuredLyrics: responses.StructuredLyrics{
+					{
+						DisplayArtist: "Rick Astley",
+						DisplayTitle:  "Never Gonna Give You Up",
+						Lang:          "eng",
+						Synced:        true,
+						Line: []responses.Line{
+							{
+								Start: &times[0],
+								Value: "We're no strangers to love",
+							},
+							{
+								Start: &ttmlTime,
+								Value: "You know the rules and so do I",
+							},
+						},
+					},
+					{
+						DisplayArtist: "Rick Astley",
+						DisplayTitle:  "Never Gonna Give You Up",
+						Lang:          "por",
+						Synced:        true,
+						Line: []responses.Line{
+							{
+								Start: &porTime,
+								Value: "Nao somos estranhos ao amor",
+							},
+						},
+					},
+				},
+			})
+		})
+
+		It("should return metadata-linked translation and pronunciation tracks from TTML", func() {
+			conf.Server.LyricsPriority = ".ttml,embedded"
+			r := newGetRequest("id=1&enhanced=true")
+
+			mockRepo.SetData(model.MediaFiles{
+				{
+					ID:     "1",
+					Path:   "tests/fixtures/test-metadata.mp3",
+					Artist: "Rick Astley",
+					Title:  "Never Gonna Give You Up",
+					Lyrics: "[]",
+				},
+			})
+
+			response, err := router.GetLyricsBySongId(r)
+			Expect(err).ToNot(HaveOccurred())
+
+			mainStartA := int64(1000)
+			mainStartB := int64(2000)
+			tokenStartA := int64(2000)
+			tokenEndA := int64(2300)
+			tokenStartB := int64(2300)
+			tokenEndB := int64(2600)
+			compareResponses(response.LyricsList, responses.LyricsList{
+				StructuredLyrics: responses.StructuredLyrics{
+					{
+						DisplayArtist: "Rick Astley",
+						DisplayTitle:  "Never Gonna Give You Up",
+						Kind:          "main",
+						Lang:          "ja",
+						Synced:        true,
+						Line: []responses.Line{
+							{
+								Start: &mainStartA,
+								Value: "こんにちは",
+							},
+							{
+								Start: &mainStartB,
+								Value: "こんばんは",
+							},
+						},
+					},
+					{
+						DisplayArtist: "Rick Astley",
+						DisplayTitle:  "Never Gonna Give You Up",
+						Kind:          "translation",
+						Lang:          "es",
+						Synced:        true,
+						Line: []responses.Line{
+							{
+								Start: &mainStartA,
+								Value: "Hola",
+							},
+						},
+					},
+					{
+						DisplayArtist: "Rick Astley",
+						DisplayTitle:  "Never Gonna Give You Up",
+						Kind:          "pronunciation",
+						Lang:          "ja-latn",
+						Synced:        true,
+						Line: []responses.Line{
+							{
+								Start: &mainStartB,
+								Value: "konni",
+							},
+						},
+						CueLine: []responses.CueLine{
+							{
+								Index: 0,
+								Start: &mainStartB,
+								End:   &tokenEndB,
+								Value: "konni",
+								Cue: []responses.LyricCue{
+									{
+										Start:     tokenStartA,
+										End:       &tokenEndA,
+										ByteStart: 0,
+										ByteEnd:   1,
+										Value:     "ko",
+									},
+									{
+										Start:     tokenStartB,
+										End:       &tokenEndB,
+										ByteStart: 2,
+										ByteEnd:   4,
+										Value:     "nni",
+									},
+								},
+							},
+						},
+					},
+				},
+			})
+		})
+
+		It("should return cue lines for songLyrics v2 clients with enhanced=true", func() {
+			r := newGetRequest("id=1&enhanced=true")
+
+			lineStart := int64(1000)
+			lineEnd := int64(3000)
+			tokenStartA := int64(1000)
+			tokenEndA := int64(1400)
+			tokenStartB := int64(2000)
+			tokenEndB := int64(2500)
+			lyricsJson, err := json.Marshal(model.LyricList{
+				{
+					Lang:   "eng",
+					Agents: []model.Agent{{ID: "lead", Role: "main"}, {ID: "__nd_bg__|lead", Role: "bg"}},
+					Synced: true,
+					Line: []model.Line{
+						{
+							Start: &lineStart,
+							End:   &lineEnd,
+							Value: "Hello echo",
+							Cue: []model.Cue{
+								{
+									Start:     &tokenStartA,
+									End:       &tokenEndA,
+									Value:     "Hello",
+									ByteStart: 0,
+									ByteEnd:   4,
+									AgentID:   "lead",
+								},
+								{
+									Start:     &tokenStartB,
+									End:       &tokenEndB,
+									Value:     "echo",
+									ByteStart: 6,
+									ByteEnd:   9,
+									AgentID:   "__nd_bg__|lead",
+								},
+							},
+						},
+					},
+				},
+			})
+			Expect(err).ToNot(HaveOccurred())
+
+			mockRepo.SetData(model.MediaFiles{
+				{
+					ID:     "1",
+					Artist: "Rick Astley",
+					Title:  "Never Gonna Give You Up",
+					Lyrics: string(lyricsJson),
+				},
+			})
+
+			response, err := router.GetLyricsBySongId(r)
+			Expect(err).ToNot(HaveOccurred())
+			compareResponses(response.LyricsList, responses.LyricsList{
+				StructuredLyrics: responses.StructuredLyrics{
+					{
+						DisplayArtist: "Rick Astley",
+						DisplayTitle:  "Never Gonna Give You Up",
+						Kind:          "main",
+						Lang:          "eng",
+						Synced:        true,
+						Agents: []responses.Agent{
+							{ID: "lead", Role: "main"},
+							{ID: "__nd_bg__|lead", Role: "bg"},
+						},
+						Line: []responses.Line{
+							{
+								Start: &lineStart,
+								Value: "Hello echo",
+							},
+						},
+						CueLine: []responses.CueLine{
+							{
+								Index:   0,
+								Start:   &lineStart,
+								End:     &lineEnd,
+								Value:   "Hello echo",
+								AgentID: "lead",
+								Cue: []responses.LyricCue{
+									{
+										Start:     tokenStartA,
+										End:       &tokenEndA,
+										ByteStart: 0,
+										ByteEnd:   4,
+										Value:     "Hello",
+									},
+								},
+							},
+							{
+								Index:   0,
+								Start:   &lineStart,
+								End:     &lineEnd,
+								Value:   "Hello echo",
+								AgentID: "__nd_bg__|lead",
+								Cue: []responses.LyricCue{
+									{
+										Start:     tokenStartB,
+										End:       &tokenEndB,
+										ByteStart: 6,
+										ByteEnd:   9,
+										Value:     "echo",
+									},
+								},
+							},
+						},
+					},
+				},
+			})
+		})
+
+		It("should keep enhanced line-level lyrics when no cue data is available", func() {
+			r := newGetRequest("id=1&enhanced=true") // same enhanced=true query as the cue tests in this Describe
+
+			lineStart := int64(1000)
+			lineEnd := int64(3000)
+			lyricsJSON, err := json.Marshal(model.LyricList{
+				{
+					Kind:   "main",
+					Lang:   "eng",
+					Synced: true,
+					Line: []model.Line{ // the only line carries Start/End/Value and no Cue entries
+						{
+							Start: &lineStart,
+							End:   &lineEnd,
+							Value: "Line without word timing",
+						},
+					},
+				},
+			})
+			Expect(err).ToNot(HaveOccurred())
+
+			mockRepo.SetData(model.MediaFiles{
+				{
+					ID:     "1",
+					Artist: "Rick Astley",
+					Title:  "Never Gonna Give You Up",
+					Lyrics: string(lyricsJSON), // lyrics are stored on the media file as the marshalled JSON
+				},
+			})
+
+			response, err := router.GetLyricsBySongId(r)
+			Expect(err).ToNot(HaveOccurred())
+			compareResponses(response.LyricsList, responses.LyricsList{
+				StructuredLyrics: responses.StructuredLyrics{
+					{
+						DisplayArtist: "Rick Astley",
+						DisplayTitle:  "Never Gonna Give You Up",
+						Kind:          "main",
+						Lang:          "eng",
+						Synced:        true,
+						Line: []responses.Line{ // expectation sets plain lines only: no Agents or CueLine
+							{
+								Start: &lineStart,
+								Value: "Line without word timing",
+							},
+						},
+					},
+				},
+			})
+		})
+
+		It("should return required cue byte offsets for ambiguous and multibyte cue lines", func() {
+			r := newGetRequest("id=1&enhanced=true")
+
+			asciiLineStart := int64(0)
+			asciiLineEnd := int64(2400)
+			asciiCueStartA := int64(0)
+			asciiCueEndA := int64(300)
+			asciiCueStartB := int64(900)
+			asciiCueEndB := int64(1300)
+			asciiCueStartC := int64(1300)
+			asciiCueEndC := int64(1600)
+			asciiCueStartD := int64(1600) // deliberately has no matching asciiCueEndD
+
+			utfLineStart := int64(2747)
+			utfLineEnd := int64(6214)
+			utfCueStartA := int64(2747)
+			utfCueEndA := int64(3018)
+			utfCueStartB := int64(3018)
+			utfCueEndB := int64(3179)
+			utfCueStartC := int64(3582)
+			utfCueEndC := int64(4100)
+			utfCueStartD := int64(4500)
+			utfCueEndD := int64(6214)
+
+			lyricsJSON, err := json.Marshal(model.LyricList{
+				{
+					Lang:   "eng",
+					Synced: true,
+					Line: []model.Line{
+						{
+							Start: &asciiLineStart,
+							End:   &asciiLineEnd,
+							Value: "Oh love love me tonight",
+							Cue: []model.Cue{
+								{Start: &asciiCueStartA, End: &asciiCueEndA, Value: "Oh", ByteStart: 0, ByteEnd: 1},
+								{Start: &asciiCueStartB, End: &asciiCueEndB, Value: "love", ByteStart: 8, ByteEnd: 11}, // the second "love" (bytes 8..11), not the first at 3..6
+								{Start: &asciiCueStartC, End: &asciiCueEndC, Value: "me", ByteStart: 13, ByteEnd: 14},
+								{Start: &asciiCueStartD, Value: "tonight", ByteStart: 16, ByteEnd: 22}, // End left unset on the last cue
+							},
+						},
+						{
+							Start: &utfLineStart,
+							End:   &utfLineEnd,
+							Value: "눈을 뜬 순간",
+							Cue: []model.Cue{
+								{Start: &utfCueStartA, End: &utfCueEndA, Value: "눈", ByteStart: 0, ByteEnd: 2}, // offsets count UTF-8 bytes: one syllable spans 3 bytes
+								{Start: &utfCueStartB, End: &utfCueEndB, Value: "을", ByteStart: 3, ByteEnd: 5},
+								{Start: &utfCueStartC, End: &utfCueEndC, Value: "뜬", ByteStart: 7, ByteEnd: 9},
+								{Start: &utfCueStartD, End: &utfCueEndD, Value: "순간", ByteStart: 11, ByteEnd: 16},
+							},
+						},
+					},
+				},
+			})
+			Expect(err).ToNot(HaveOccurred())
+
+			mockRepo.SetData(model.MediaFiles{
+				{
+					ID:     "1",
+					Artist: "Rick Astley",
+					Title:  "Never Gonna Give You Up",
+					Lyrics: string(lyricsJSON),
+				},
+			})
+
+			response, err := router.GetLyricsBySongId(r)
+			Expect(err).ToNot(HaveOccurred())
+			compareResponses(response.LyricsList, responses.LyricsList{
+				StructuredLyrics: responses.StructuredLyrics{
+					{
+						DisplayArtist: "Rick Astley",
+						DisplayTitle:  "Never Gonna Give You Up",
+						Kind:          "main", // the stored list sets no Kind; the response is expected to report "main"
+						Lang:          "eng",
+						Synced:        true,
+						Line: []responses.Line{
+							{Start: &asciiLineStart, Value: "Oh love love me tonight"},
+							{Start: &utfLineStart, Value: "눈을 뜬 순간"},
+						},
+						CueLine: []responses.CueLine{
+							{
+								Index: 0,
+								Start: &asciiLineStart,
+								End:   &asciiLineEnd,
+								Value: "Oh love love me tonight",
+								Cue: []responses.LyricCue{
+									{Start: asciiCueStartA, End: &asciiCueEndA, Value: "Oh", ByteStart: 0, ByteEnd: 1},
+									{Start: asciiCueStartB, End: &asciiCueEndB, Value: "love", ByteStart: 8, ByteEnd: 11},
+									{Start: asciiCueStartC, End: &asciiCueEndC, Value: "me", ByteStart: 13, ByteEnd: 14},
+									{Start: asciiCueStartD, End: &asciiLineEnd, Value: "tonight", ByteStart: 16, ByteEnd: 22}, // missing cue End is expected to fall back to the line End
+								},
+							},
+							{
+								Index: 1,
+								Start: &utfLineStart,
+								End:   &utfLineEnd,
+								Value: "눈을 뜬 순간",
+								Cue: []responses.LyricCue{
+									{Start: utfCueStartA, End: &utfCueEndA, Value: "눈", ByteStart: 0, ByteEnd: 2},
+									{Start: utfCueStartB, End: &utfCueEndB, Value: "을", ByteStart: 3, ByteEnd: 5},
+									{Start: utfCueStartC, End: &utfCueEndC, Value: "뜬", ByteStart: 7, ByteEnd: 9},
+									{Start: utfCueStartD, End: &utfCueEndD, Value: "순간", ByteStart: 11, ByteEnd: 16},
+								},
+							},
+						},
+					},
+				},
+			})
+		})
 	})
 })
 
diff --git a/server/subsonic/opensubsonic.go b/server/subsonic/opensubsonic.go
index 6c54d36d0..16bd6805f 100644
--- a/server/subsonic/opensubsonic.go
+++ b/server/subsonic/opensubsonic.go
@@ -11,7 +11,7 @@ func (api *Router) GetOpenSubsonicExtensions(_ *http.Request) (*responses.Subson
 	extensions := responses.OpenSubsonicExtensions{
 		{Name: "transcodeOffset", Versions: []int32{1}},
 		{Name: "formPost", Versions: []int32{1}},
-		{Name: "songLyrics", Versions: []int32{1}},
+		{Name: "songLyrics", Versions: []int32{1, 2}},
 		{Name: "indexBasedQueue", Versions: []int32{1}},
 		{Name: "transcoding", Versions: []int32{1}},
 	}
diff --git a/server/subsonic/opensubsonic_test.go b/server/subsonic/opensubsonic_test.go
index d98599f8f..5031f1c69 100644
--- a/server/subsonic/opensubsonic_test.go
+++ b/server/subsonic/opensubsonic_test.go
@@ -58,7 +58,7 @@ var _ = Describe("GetOpenSubsonicExtensions", func() {
 				HaveLen(5),
 				ContainElement(responses.OpenSubsonicExtension{Name: "transcodeOffset", Versions: []int32{1}}),
 				ContainElement(responses.OpenSubsonicExtension{Name: "formPost", Versions: []int32{1}}),
-				ContainElement(responses.OpenSubsonicExtension{Name: "songLyrics", Versions: []int32{1}}),
+				ContainElement(responses.OpenSubsonicExtension{Name: "songLyrics", Versions: []int32{1, 2}}),
 				ContainElement(responses.OpenSubsonicExtension{Name: "indexBasedQueue", Versions: []int32{1}}),
 				ContainElement(responses.OpenSubsonicExtension{Name: "transcoding", Versions: []int32{1}}),
 			))
@@ -87,7 +87,7 @@ var _ = Describe("GetOpenSubsonicExtensions", func() {
 				HaveLen(6),
 				ContainElement(responses.OpenSubsonicExtension{Name: "transcodeOffset", Versions: []int32{1}}),
 				ContainElement(responses.OpenSubsonicExtension{Name: "formPost", Versions: []int32{1}}),
-				ContainElement(responses.OpenSubsonicExtension{Name: "songLyrics", Versions: []int32{1}}),
+				ContainElement(responses.OpenSubsonicExtension{Name: "songLyrics", Versions: []int32{1, 2}}),
 				ContainElement(responses.OpenSubsonicExtension{Name: "indexBasedQueue", Versions: []int32{1}}),
 				ContainElement(responses.OpenSubsonicExtension{Name: "transcoding", Versions: []int32{1}}),
 				ContainElement(responses.OpenSubsonicExtension{Name: "sonicSimilarity", Versions: []int32{1}}),
diff --git a/server/subsonic/responses/responses.go b/server/subsonic/responses/responses.go
index c2e863b0f..8d3279d87 100644
--- a/server/subsonic/responses/responses.go
+++ b/server/subsonic/responses/responses.go
@@ -543,13 +543,39 @@ type Line struct {
 	Value string `xml:",chardata"            json:"value"`
 }
 
+type LyricCue struct { // one timed fragment of a CueLine (element type of CueLine.Cue)
+	Start     int64  `xml:"start,attr"           json:"start"` // value type without omitempty: always serialized
+	End       *int64 `xml:"end,attr,omitempty"   json:"end,omitempty"`
+	ByteStart int    `xml:"byteStart,attr"       json:"byteStart"`
+	ByteEnd   int    `xml:"byteEnd,attr"         json:"byteEnd"` // the API tests use an inclusive byte range into the line value ("Oh" is 0..1)
+	Value     string `xml:",chardata"            json:"value"`
+}
+
+type Agent struct { // element type of StructuredLyric.Agents; CueLine.AgentID carries the same IDs in the API tests
+	ID   string `xml:"id,attr"                 json:"id"`
+	Role string `xml:"role,attr"               json:"role"` // the API tests use "main" and "bg"
+	Name string `xml:"name,attr,omitempty"     json:"name,omitempty"`
+}
+
+type CueLine struct { // a lyric line together with its timed cues
+	Index   int32      `xml:"index,attr"                    json:"index"` // in the API tests: position of the matching StructuredLyric.Line entry
+	Start   *int64     `xml:"start,attr,omitempty"          json:"start,omitempty"`
+	End     *int64     `xml:"end,attr,omitempty"            json:"end,omitempty"`
+	Value   string     `xml:"value,attr"                    json:"value"` // an XML attribute here, whereas Line and LyricCue use chardata
+	AgentID string     `xml:"agentId,attr,omitempty"        json:"agentId,omitempty"`
+	Cue     []LyricCue `xml:"cue,omitempty"                 json:"cue,omitempty"`
+}
+
 type StructuredLyric struct {
-	DisplayArtist string `xml:"displayArtist,attr,omitempty" json:"displayArtist,omitempty"`
-	DisplayTitle  string `xml:"displayTitle,attr,omitempty"  json:"displayTitle,omitempty"`
-	Lang          string `xml:"lang,attr"                    json:"lang"`
-	Line          []Line `xml:"line"                         json:"line"`
-	Offset        *int64 `xml:"offset,attr,omitempty"        json:"offset,omitempty"`
-	Synced        bool   `xml:"synced,attr"                  json:"synced"`
+	DisplayArtist string    `xml:"displayArtist,attr,omitempty" json:"displayArtist,omitempty"`
+	DisplayTitle  string    `xml:"displayTitle,attr,omitempty"  json:"displayTitle,omitempty"`
+	Kind          string    `xml:"kind,attr,omitempty"          json:"kind,omitempty"` // new optional field; the API tests expect "main"
+	Lang          string    `xml:"lang,attr"                    json:"lang"`
+	Line          []Line    `xml:"line"                         json:"line"`
+	Agents        []Agent   `xml:"agent,omitempty"              json:"agents,omitempty"`
+	CueLine       []CueLine `xml:"cueLine,omitempty"     json:"cueLine,omitempty"` // new optional field; absent in the API test whose lines have no cues
+	Offset        *int64    `xml:"offset,attr,omitempty"        json:"offset,omitempty"`
+	Synced        bool      `xml:"synced,attr"                  json:"synced"`
 }
 
 type StructuredLyrics []StructuredLyric
diff --git a/tests/fixtures/bom-test.ttml b/tests/fixtures/bom-test.ttml
new file mode 100644
index 000000000..319ab1f07
--- /dev/null
+++ b/tests/fixtures/bom-test.ttml
@@ -0,0 +1,2 @@
+﻿<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml"><body><div xml:lang="eng"><p begin="00:00:00.00">BOM test line</p></div></body></tt>
diff --git a/tests/fixtures/bom-utf16-test.ttml b/tests/fixtures/bom-utf16-test.ttml
new file mode 100644
index 000000000..a5621ef5d
Binary files /dev/null and b/tests/fixtures/bom-utf16-test.ttml differ
diff --git a/tests/fixtures/test-enhanced.lrc b/tests/fixtures/test-enhanced.lrc
new file mode 100644
index 000000000..8f7b60f8c
--- /dev/null
+++ b/tests/fixtures/test-enhanced.lrc
@@ -0,0 +1,6 @@
+[ar:Test Artist]
+[ti:Enhanced Test]
+[lang:eng]
+[00:01.00]<00:01.00>Some <00:01.50>lyrics <00:02.00>here
+[00:03.00]<00:03.00>More <00:03.50>words
+[00:05.00]Plain line without inline markers
diff --git a/tests/fixtures/test-metadata.ttml b/tests/fixtures/test-metadata.ttml
new file mode 100644
index 000000000..c0243c18f
--- /dev/null
+++ b/tests/fixtures/test-metadata.ttml
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:itunes="http://music.apple.com/lyric-ttml-internal">
+  <head>
+    <metadata>
+      <iTunesMetadata xmlns="http://music.apple.com/lyric-ttml-internal">
+        <translations>
+          <translation xml:lang="es">
+            <text for="L1">Hola</text>
+          </translation>
+        </translations>
+        <transliterations>
+          <transliteration xml:lang="ja-Latn">
+            <text for="L2"><span begin="00:02.000" end="00:02.300" xmlns="http://www.w3.org/ns/ttml">ko</span><span begin="00:02.300" end="00:02.600" xmlns="http://www.w3.org/ns/ttml">nni</span></text>
+          </transliteration>
+        </transliterations>
+      </iTunesMetadata>
+    </metadata>
+  </head>
+  <body xml:lang="ja">
+    <div>
+      <p begin="00:01.000" end="00:01.500" itunes:key="L1">こんにちは</p>
+      <p begin="00:02.000" end="00:02.700" itunes:key="L2">こんばんは</p>
+    </div>
+  </body>
+</tt>
diff --git a/tests/fixtures/test.elrc b/tests/fixtures/test.elrc
new file mode 100644
index 000000000..01c3d2cdd
--- /dev/null
+++ b/tests/fixtures/test.elrc
@@ -0,0 +1,5 @@
+[ar:ELRC Artist]
+[ti:ELRC Song]
+[lang:eng]
+[00:01.00]<00:01.00>Lead <00:01.50>words
+[00:03.00]Fallback line
diff --git a/tests/fixtures/test.srt b/tests/fixtures/test.srt
new file mode 100644
index 000000000..3c9c09a39
--- /dev/null
+++ b/tests/fixtures/test.srt
@@ -0,0 +1,7 @@
+1
+00:00:18,800 --> 00:00:22,800
+We're from subtitles
+
+2
+00:00:22,801 --> 00:00:26,000
+Another subtitle line
diff --git a/tests/fixtures/test.ttml b/tests/fixtures/test.ttml
new file mode 100644
index 000000000..a85673a1b
--- /dev/null
+++ b/tests/fixtures/test.ttml
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttp="http://www.w3.org/ns/ttml#parameter" ttp:frameRate="30" ttp:subFrameRate="2" ttp:tickRate="10">
+  <body>
+    <div xml:lang="eng">
+      <p begin="00:00:18.80">We're no strangers to love</p>
+      <p begin="00:00:22:24">You know the rules and so do I</p>
+    </div>
+    <div xml:lang="por">
+      <p begin="188t">Nao somos estranhos ao amor</p>
+    </div>
+  </body>
+</tt>
diff --git a/ui/src/actions/player.js b/ui/src/actions/player.js
index 9056abeb6..f55102207 100644
--- a/ui/src/actions/player.js
+++ b/ui/src/actions/player.js
@@ -9,6 +9,7 @@ export const PLAYER_SET_VOLUME = 'PLAYER_SET_VOLUME'
 export const PLAYER_SET_MODE = 'PLAYER_SET_MODE'
 export const TRANSCODING_SET_PROFILE = 'TRANSCODING_SET_PROFILE'
 export const PLAYER_REFRESH_QUEUE = 'PLAYER_REFRESH_QUEUE'
+export const PLAYER_UPDATE_LYRIC = 'PLAYER_UPDATE_LYRIC'
 
 export const setTrack = (data) => ({
   type: PLAYER_SET_TRACK,
@@ -114,3 +115,8 @@ export const refreshQueue = (resolvedUrls) => ({
   type: PLAYER_REFRESH_QUEUE,
   data: resolvedUrls,
 })
+
+export const updateQueueLyric = (trackId, lyric) => ({
+  type: PLAYER_UPDATE_LYRIC,
+  data: { trackId, lyric },
+})
diff --git a/ui/src/audioplayer/KaraokeLyricsOverlay.jsx b/ui/src/audioplayer/KaraokeLyricsOverlay.jsx
new file mode 100644
index 000000000..aefb0127e
--- /dev/null
+++ b/ui/src/audioplayer/KaraokeLyricsOverlay.jsx
@@ -0,0 +1,1745 @@
+import IconButton from '@material-ui/core/IconButton'
+import Popover from '@material-ui/core/Popover'
+import Slider from '@material-ui/core/Slider'
+import { makeStyles, useTheme } from '@material-ui/core/styles'
+import Tooltip from '@material-ui/core/Tooltip'
+import Typography from '@material-ui/core/Typography'
+import CloseIcon from '@material-ui/icons/Close'
+import RestoreIcon from '@material-ui/icons/Restore'
+import TuneIcon from '@material-ui/icons/Tune'
+import clsx from 'clsx'
+import React, {
+  memo,
+  useCallback,
+  useEffect,
+  useMemo,
+  useRef,
+  useState,
+} from 'react'
+import {
+  buildHighlightedAuxLine,
+  buildHighlightedMainLine,
+  buildKaraokeLines,
+  getActiveKaraokeState,
+  hasUsableKaraokeTiming,
+  hasStructuredLyricContent,
+  resolveKaraokeTokenWindow,
+  resolveLayerLineForMain,
+  utf8ByteRangeToCodeUnitRange,
+} from './lyrics'
+
+const KARAOKE_RENDER_LEAD_MS = 80
+const KARAOKE_CLOCK_DRIFT_RESET_MS = 140
+const KARAOKE_CLOCK_RESET_THRESHOLD_MS = 320
+const KARAOKE_MONOTONIC_JITTER_MS = 60
+const KARAOKE_RENDER_UPDATE_EPSILON_MS = 6
+const KARAOKE_WORD_SETTLE_MS = 96
+const KARAOKE_ANIMATION_MS = 150 // duration of the CSS transitions declared in useStyles
+const KARAOKE_DEFAULT_HEIGHT_PX = 300 // overlay height used when no valid preference is stored
+const KARAOKE_MIN_HEIGHT_PX = 150 // lower clamp for the stored height; also the overlay's CSS minHeight
+const KARAOKE_MAX_HEIGHT_RATIO = 0.72
+const KARAOKE_MAX_HEIGHT_PX = 760 // upper clamp for the stored overlay height
+const KARAOKE_CENTER_SPACER_RATIO = 0.5
+const KARAOKE_CENTER_SPACER_MIN_PX = 132
+const KARAOKE_DEFAULT_LINE_HEIGHT = 1.3 // fallback when a line-height value is not a finite number
+const KARAOKE_MIN_LINE_HEIGHT = 1 // slider and clamp lower bound
+const KARAOKE_MAX_LINE_HEIGHT = 2.2 // slider and clamp upper bound
+const KARAOKE_LINE_HEIGHT_STEP = 0.02 // slider increment
+const KARAOKE_GROUP_SPACING_BASE_PX = 14 // multiplied by the clamped line height in getLineGapPx
+const KARAOKE_AUX_LINE_HEIGHT = 1.2 // CSS line-height of the inlineTr style
+const KARAOKE_MAIN_INACTIVE_FONT_FACTOR = 0.8
+const KARAOKE_AUX_INACTIVE_FONT_FACTOR = 0.88
+
+const TOKEN_DONE_ALPHA = 1
+const TOKEN_FUTURE_ALPHA = 0.34
+const TOKEN_ACTIVE_ALPHA = 1
+const TOKEN_WIPE_SOFT_SPREAD_PCT = 12
+const TOKEN_WIPE_EDGE_PCT = 8
+
+const COLOR_PRESETS = [ // swatches offered per layer; `key` is what gets stored, `value` is the CSS color
+  { key: 'white', label: 'White', value: 'rgba(255, 255, 255, 0.92)' }, // first entry doubles as getColorValue's fallback
+  { key: 'black', label: 'Black', value: 'rgba(0, 0, 0, 0.87)' },
+  { key: 'blue', label: 'Blue', value: 'rgba(120, 160, 220, 0.75)' },
+  { key: 'green', label: 'Green', value: 'rgba(100, 200, 130, 0.7)' },
+  { key: 'pink', label: 'Pink', value: 'rgba(240, 140, 170, 0.75)' },
+  { key: 'purple', label: 'Purple', value: 'rgba(180, 140, 240, 0.75)' },
+  { key: 'orange', label: 'Orange', value: 'rgba(240, 180, 100, 0.75)' },
+  { key: 'cyan', label: 'Cyan', value: 'rgba(100, 210, 220, 0.75)' },
+  { key: 'yellow', label: 'Yellow', value: 'rgba(240, 230, 110, 0.75)' },
+]
+
+const DEFAULT_LYRICS_SETTINGS = { // baseline merged under stored values by normalizeLyricsSettings
+  lineHeight: KARAOKE_DEFAULT_LINE_HEIGHT,
+  overlayHeight: KARAOKE_DEFAULT_HEIGHT_PX,
+  tr: { fontSize: 18, colorKey: 'blue' }, // translation layer (badge label "TR")
+  main: { fontSize: 30, colorKey: 'white' }, // colorKey values refer to COLOR_PRESETS keys
+  pr: { fontSize: 18, colorKey: 'green' }, // pronunciation layer (badge label "PR")
+}
+
+const SETTINGS_STORAGE_KEY = 'karaoke-lyrics-settings' // localStorage key used by loadLyricsSettings / saveLyricsSettings
+
+const createDefaultLyricsSettings = (isDark = true) => ({
+  lineHeight: KARAOKE_DEFAULT_LINE_HEIGHT,
+  overlayHeight: KARAOKE_DEFAULT_HEIGHT_PX,
+  tr: { ...DEFAULT_LYRICS_SETTINGS.tr },
+  main: { ...DEFAULT_LYRICS_SETTINGS.main, colorKey: isDark ? 'white' : 'black' },
+  pr: { ...DEFAULT_LYRICS_SETTINGS.pr },
+})
+
+const clampLineHeight = (value) => {
+  const numeric = Number(value)
+  if (!Number.isFinite(numeric)) {
+    return KARAOKE_DEFAULT_LINE_HEIGHT
+  }
+  return clamp(numeric, KARAOKE_MIN_LINE_HEIGHT, KARAOKE_MAX_LINE_HEIGHT)
+}
+
+const clampOverlayHeightPreference = (value) => {
+  const numeric = Number(value)
+  if (!Number.isFinite(numeric)) {
+    return KARAOKE_DEFAULT_HEIGHT_PX
+  }
+  return clamp(numeric, KARAOKE_MIN_HEIGHT_PX, KARAOKE_MAX_HEIGHT_PX)
+}
+
+const normalizeLyricsSettings = (settings) => ({
+  lineHeight: clampLineHeight(settings?.lineHeight),
+  overlayHeight: clampOverlayHeightPreference(settings?.overlayHeight),
+  tr: { ...DEFAULT_LYRICS_SETTINGS.tr, ...settings?.tr },
+  main: { ...DEFAULT_LYRICS_SETTINGS.main, ...settings?.main },
+  pr: { ...DEFAULT_LYRICS_SETTINGS.pr, ...settings?.pr },
+})
+
+const loadLyricsSettings = () => {
+  try {
+    const raw = localStorage.getItem(SETTINGS_STORAGE_KEY)
+    if (raw) {
+      return normalizeLyricsSettings(JSON.parse(raw))
+    }
+  } catch {
+    /* ignore */
+  }
+  return normalizeLyricsSettings()
+}
+
+const saveLyricsSettings = (settings) => {
+  try {
+    localStorage.setItem(
+      SETTINGS_STORAGE_KEY,
+      JSON.stringify(normalizeLyricsSettings(settings)),
+    )
+  } catch {
+    /* ignore */
+  }
+}
+
+const getColorValue = (colorKey) =>
+  COLOR_PRESETS.find((c) => c.key === colorKey)?.value || COLOR_PRESETS[0].value
+
+const hexToRgba = (hex, alpha) => {
+  const m = (hex || '').match(/#([0-9a-f]{2})([0-9a-f]{2})([0-9a-f]{2})/i)
+  if (m) return `rgba(${parseInt(m[1], 16)}, ${parseInt(m[2], 16)}, ${parseInt(m[3], 16)}, ${alpha})`
+  const rm = (hex || '').match(/rgba?\((\d+),\s*(\d+),\s*(\d+)/)
+  if (rm) return `rgba(${rm[1]}, ${rm[2]}, ${rm[3]}, ${alpha})`
+  return `rgba(48, 48, 48, ${alpha})`
+}
+
+const useStyles = makeStyles((theme) => { // all colors below derive from the active theme's palette
+  const isDark = theme.palette.type === 'dark'
+  const overlayBg = hexToRgba(theme.palette.background.default, 0.85) // theme background at 85% opacity
+  const primaryMain = theme.palette.primary.main
+  const primaryRgb = (() => { // [r, g, b] of the primary color, used to build rgba() strings below
+    const m = (primaryMain || '').match(/#([0-9a-f]{2})([0-9a-f]{2})([0-9a-f]{2})/i)
+    if (m) return [parseInt(m[1], 16), parseInt(m[2], 16), parseInt(m[3], 16)]
+    const rm = (primaryMain || '').match(/rgba?\((\d+),\s*(\d+),\s*(\d+)/)
+    if (rm) return [parseInt(rm[1]), parseInt(rm[2]), parseInt(rm[3])]
+    return [144, 202, 249] // fallback when the primary color matches neither pattern
+  })()
+  const textPrimary = isDark ? 'rgba(255, 255, 255, 0.92)' : 'rgba(0, 0, 0, 0.87)'
+  const textSecondary = isDark ? 'rgba(255, 255, 255, 0.55)' : 'rgba(0, 0, 0, 0.54)'
+  const borderSubtle = isDark ? 'rgba(255, 255, 255, 0.12)' : 'rgba(0, 0, 0, 0.12)'
+
+  return ({
+  overlay: {
+    position: 'fixed',
+    left: '50%',
+    bottom: 100,
+    transform: 'translateX(-50%)',
+    zIndex: 1400, // the settings Popover in this file is rendered at zIndex 1500
+    width: 'min(1000px, calc(100vw - 32px))',
+    minHeight: KARAOKE_MIN_HEIGHT_PX,
+    background: overlayBg,
+    borderRadius: 12,
+    border: `1px solid ${borderSubtle}`,
+    boxShadow: '0 18px 48px rgba(0, 0, 0, 0.42)',
+    backdropFilter: 'blur(20px)',
+    color: textPrimary,
+    display: 'flex',
+    flexDirection: 'column',
+    overflow: 'hidden',
+    '@media (max-width:810px)': {
+      bottom: 78,
+      width: 'calc(100vw - 12px)',
+      borderRadius: 12,
+      minHeight: 180,
+      maxHeight: '65vh',
+    },
+  },
+  overlayInline: { // overrides that make the overlay fill its positioned parent instead of floating
+    position: 'absolute',
+    inset: 0,
+    width: '100%',
+    height: '100%',
+    minHeight: 0,
+    maxHeight: '100%',
+    transform: 'none',
+    borderRadius: 'inherit',
+    border: 'none',
+    boxShadow: 'none',
+    background: 'transparent',
+    backdropFilter: 'blur(16px)',
+    WebkitBackdropFilter: 'blur(16px)',
+    zIndex: 1,
+  },
+  resizeHandle: {
+    height: 14,
+    cursor: 'ns-resize',
+    flexShrink: 0,
+    position: 'relative',
+    '&::after': {
+      content: '""',
+      position: 'absolute',
+      left: '50%',
+      top: 4,
+      transform: 'translateX(-50%)',
+      width: 56,
+      height: 3,
+      borderRadius: 999,
+      background: `rgba(${primaryRgb.join(', ')}, 0.22)`,
+    },
+    '@media (max-width:810px)': {
+      display: 'none', // no resize handle on narrow viewports
+    },
+  },
+  header: {
+    display: 'flex',
+    alignItems: 'center',
+    justifyContent: 'space-between',
+    gap: theme.spacing(1),
+    padding: theme.spacing(0.3, 1.3, 0.4, 1.3),
+  },
+  headerInline: {
+    padding: theme.spacing(0.25, 0.65, 0.35, 0.65),
+    gap: theme.spacing(0.65),
+  },
+  headerLeft: {
+    display: 'flex',
+    alignItems: 'center',
+    gap: theme.spacing(1),
+    minWidth: 0,
+  },
+  languageBadges: {
+    display: 'flex',
+    alignItems: 'center',
+    gap: theme.spacing(0.5),
+    flexWrap: 'wrap',
+    minWidth: 0,
+  },
+  languageBadge: {
+    display: 'inline-flex',
+    alignItems: 'center',
+    justifyContent: 'center',
+    gap: theme.spacing(0.35),
+    padding: theme.spacing(0.2, 0.7),
+    borderRadius: 999,
+    border: `1px solid ${borderSubtle}`,
+    background: isDark ? 'rgba(15, 23, 42, 0.42)' : 'rgba(0, 0, 0, 0.06)',
+    color: isDark ? 'rgba(226, 232, 240, 0.8)' : 'rgba(0, 0, 0, 0.6)',
+    fontSize: 10,
+    lineHeight: 1,
+    letterSpacing: '0.04em',
+    whiteSpace: 'nowrap',
+    transition: `all ${KARAOKE_ANIMATION_MS}ms ease-in-out`,
+    userSelect: 'none',
+  },
+  languageBadgeToggle: {
+    cursor: 'pointer',
+    '&:hover': {
+      borderColor: `rgba(${primaryRgb.join(', ')}, 0.35)`,
+      background: isDark ? 'rgba(15, 23, 42, 0.56)' : 'rgba(0, 0, 0, 0.1)',
+    },
+  },
+  languageBadgeActive: {
+    borderColor: `rgba(${primaryRgb.join(', ')}, 0.46)`,
+    background: `rgba(${primaryRgb.join(', ')}, 0.18)`,
+    color: isDark ? 'rgba(248, 250, 252, 0.94)' : 'rgba(0, 0, 0, 0.87)',
+  },
+  languageBadgeLabel: {
+    fontWeight: 700,
+    textTransform: 'uppercase',
+    opacity: 0.78,
+  },
+  languageBadgeValue: {
+    opacity: 0.9,
+  },
+  closeButton: {
+    color: textSecondary,
+  },
+  lineGroup: {
+    display: 'flex',
+    flexDirection: 'column',
+    alignItems: 'center',
+    gap: theme.spacing(0.35),
+  },
+  inlineTr: {
+    margin: 0,
+    display: 'inline-block',
+    maxWidth: '100%',
+    textAlign: 'center',
+    fontWeight: 400,
+    lineHeight: KARAOKE_AUX_LINE_HEIGHT,
+    letterSpacing: '0.01em',
+    transition: `opacity ${KARAOKE_ANIMATION_MS}ms ease-in-out, color ${KARAOKE_ANIMATION_MS}ms ease-in-out, font-size 280ms ease-in-out, max-width 280ms ease-in-out`,
+  },
+  inlinePr: {
+    margin: 0,
+    display: 'inline-flex',
+    alignItems: 'center',
+    justifyContent: 'center',
+    flexWrap: 'wrap',
+    alignSelf: 'center',
+    width: 'fit-content',
+    maxWidth: '100%',
+    boxSizing: 'border-box',
+    textAlign: 'center',
+    fontWeight: 400,
+    lineHeight: 1,
+    letterSpacing: '0.01em',
+    transition: `opacity ${KARAOKE_ANIMATION_MS}ms ease-in-out, color ${KARAOKE_ANIMATION_MS}ms ease-in-out, font-size 280ms ease-in-out, max-width 280ms ease-in-out`,
+    padding: theme.spacing(0.15, 0.9),
+    borderRadius: 999,
+    background: isDark ? 'rgba(255, 255, 255, 0.08)' : 'rgba(0, 0, 0, 0.05)',
+    border: `1px solid ${borderSubtle}`,
+  },
+  bodyWrapper: {
+    position: 'relative',
+    flex: 1,
+    overflow: 'hidden',
+  },
+  body: {
+    padding: theme.spacing(0.5, 2, 1.4, 2),
+    overflowY: 'auto',
+    overflowX: 'hidden',
+    height: '100%',
+    overscrollBehavior: 'contain',
+    scrollbarWidth: 'none', // scrollbar hidden here, via msOverflowStyle and via the ::-webkit-scrollbar rule below
+    msOverflowStyle: 'none',
+    maskImage: 'linear-gradient(to bottom, transparent 0%, black 8%, black 92%, transparent 100%)', // fades the top and bottom 8%
+    WebkitMaskImage: 'linear-gradient(to bottom, transparent 0%, black 8%, black 92%, transparent 100%)',
+    '&::-webkit-scrollbar': {
+      display: 'none',
+      width: 0,
+      height: 0,
+    },
+    '@media (max-width:810px)': {
+      padding: theme.spacing(0.35, 1.2, 1.2, 1.2),
+    },
+  },
+  bodyInline: {
+    padding: theme.spacing(0.25, 0.8, 0.85, 0.8),
+  },
+  lines: {
+    display: 'flex',
+    flexDirection: 'column',
+    gap: theme.spacing(1.24),
+    paddingBottom: theme.spacing(1),
+  },
+  line: {
+    margin: 0,
+    display: 'inline-block',
+    maxWidth: '100%',
+    fontWeight: 600,
+    lineHeight: 1.24,
+    letterSpacing: '0.01em',
+    textAlign: 'center',
+    color: isDark ? 'rgba(255, 255, 255, 0.62)' : 'rgba(0, 0, 0, 0.52)',
+    transition: `opacity ${KARAOKE_ANIMATION_MS}ms ease-in-out, color ${KARAOKE_ANIMATION_MS}ms ease-in-out, font-size 280ms ease-in-out, max-width 280ms ease-in-out`,
+  },
+  token: {
+    display: 'inline-block',
+    whiteSpace: 'pre-wrap',
+    transition: `color ${KARAOKE_ANIMATION_MS}ms ease-in-out, text-shadow ${KARAOKE_ANIMATION_MS}ms ease-in-out`,
+  },
+  settingsButton: {
+    color: textSecondary,
+    padding: 4,
+    '&:hover': {
+      color: textPrimary,
+    },
+  },
+  settingsPanel: {
+    background: isDark ? 'rgba(12, 14, 20, 0.96)' : 'rgba(255, 255, 255, 0.96)',
+    border: `1px solid ${borderSubtle}`,
+    borderRadius: 10,
+    padding: theme.spacing(1.5, 2),
+    width: 278,
+    backdropFilter: 'blur(12px)',
+  },
+  settingsHeader: {
+    display: 'flex',
+    alignItems: 'center',
+    justifyContent: 'space-between',
+    gap: theme.spacing(1),
+    marginBottom: theme.spacing(1.25),
+  },
+  settingsSection: {
+    marginBottom: theme.spacing(1.2),
+    '&:last-child': {
+      marginBottom: 0,
+    },
+  },
+  settingsTitle: {
+    fontSize: 11,
+    fontWeight: 700,
+    letterSpacing: '0.08em',
+    textTransform: 'uppercase',
+    color: isDark ? 'rgba(255, 255, 255, 0.78)' : 'rgba(0, 0, 0, 0.72)',
+  },
+  settingsLabel: {
+    fontSize: 10,
+    fontWeight: 600,
+    letterSpacing: '0.1em',
+    textTransform: 'uppercase',
+    color: isDark ? 'rgba(255, 255, 255, 0.55)' : 'rgba(0, 0, 0, 0.5)',
+    marginBottom: 4,
+  },
+  settingsRow: {
+    display: 'flex',
+    alignItems: 'center',
+    gap: theme.spacing(1),
+  },
+  settingsSlider: {
+    flex: 1,
+    color: `rgba(${primaryRgb.join(', ')}, 0.6)`,
+    '& .MuiSlider-thumb': {
+      width: 12,
+      height: 12,
+    },
+    '& .MuiSlider-rail': {
+      opacity: 0.3,
+    },
+  },
+  settingsSliderValue: {
+    fontSize: 11,
+    color: isDark ? 'rgba(255, 255, 255, 0.5)' : 'rgba(0, 0, 0, 0.45)',
+    minWidth: 22,
+    textAlign: 'right',
+  },
+  settingsControlLabel: {
+    fontSize: 10,
+    letterSpacing: '0.06em',
+    textTransform: 'uppercase',
+    color: isDark ? 'rgba(255, 255, 255, 0.45)' : 'rgba(0, 0, 0, 0.42)',
+    minWidth: 72,
+    whiteSpace: 'nowrap',
+  },
+  resetButton: {
+    color: textSecondary,
+    padding: 4,
+    '&:hover': {
+      color: textPrimary,
+    },
+  },
+  colorDots: {
+    display: 'flex',
+    gap: 5,
+    marginTop: 4,
+  },
+  colorDot: {
+    width: 16,
+    height: 16,
+    borderRadius: '50%',
+    border: '2px solid transparent', // transparent border reserves space for the colorDotActive ring
+    cursor: 'pointer',
+    transition: 'border-color 120ms ease, transform 120ms ease',
+    '&:hover': {
+      transform: 'scale(1.2)',
+    },
+  },
+  colorDotActive: {
+    borderColor: isDark ? 'rgba(255, 255, 255, 0.85)' : 'rgba(0, 0, 0, 0.7)',
+  },
+})})
+
+const clamp = (v, min, max) => Math.max(min, Math.min(max, v))
+const lerp = (from, to, t) => from + (to - from) * t
+const formatLineHeight = (value) => clampLineHeight(value).toFixed(2)
+const getLineGapPx = (lineHeight) =>
+  `${Math.round(clampLineHeight(lineHeight) * KARAOKE_GROUP_SPACING_BASE_PX)}px`
+
+const normalizeForComparison = (text) =>
+  (text || '').replace(/[\s\p{P}]/gu, '').toLowerCase()
+
+const shouldShowAuxLine = (mainLine, auxLine) => {
+  if (!auxLine || !auxLine.value) return false
+  return (
+    normalizeForComparison(auxLine.value) !==
+    normalizeForComparison(mainLine.value)
+  )
+}
+
+const buildLanguageBadges = ({
+  mainLyric,
+  translationLyric,
+  pronunciationLyric,
+  showTranslation,
+  showPronunciation,
+  translationEnabled,
+  pronunciationEnabled,
+}) =>
+  [
+    {
+      key: 'main',
+      label: 'Main',
+      lang: mainLyric?.lang,
+      active: true,
+      toggleable: false,
+    },
+    pronunciationEnabled && {
+      key: 'pr',
+      label: 'PR',
+      lang: pronunciationLyric?.lang,
+      active: showPronunciation,
+      toggleable: true,
+      tooltip: showPronunciation ? 'Hide pronunciation' : 'Show pronunciation',
+    },
+    translationEnabled && {
+      key: 'tr',
+      label: 'TR',
+      lang: translationLyric?.lang,
+      active: showTranslation,
+      toggleable: true,
+      tooltip: showTranslation ? 'Hide translation' : 'Show translation',
+    },
+  ].filter((badge) => badge && badge.lang)
+
+const SettingsSection = ({ label, layer, settings, onChange, classes }) => {
+  const s = settings[layer]
+  return (
+    <div className={classes.settingsSection}>
+      <div className={classes.settingsLabel}>{label}</div>
+      <div className={classes.settingsRow}>
+        <Slider
+          className={classes.settingsSlider}
+          min={8}
+          max={40}
+          step={1}
+          value={s.fontSize}
+          onChange={(_, val) =>
+            onChange({ ...settings, [layer]: { ...s, fontSize: val } })
+          }
+        />
+        <span className={classes.settingsSliderValue}>{s.fontSize}</span>
+      </div>
+      <div className={classes.colorDots}>
+        {COLOR_PRESETS.map((preset) => (
+          <div
+            key={preset.key}
+            className={clsx(classes.colorDot, {
+              [classes.colorDotActive]: s.colorKey === preset.key,
+            })}
+            style={{ background: preset.value }}
+            title={preset.label}
+            onClick={() =>
+              onChange({ ...settings, [layer]: { ...s, colorKey: preset.key } })
+            }
+          />
+        ))}
+      </div>
+    </div>
+  )
+}
+
+const LineHeightSetting = ({ settings, onChange, classes }) => (
+  <div className={classes.settingsSection}>
+    <div className={classes.settingsLabel}>Spacing</div>
+    <div className={classes.settingsRow}>
+      <div className={classes.settingsControlLabel}>Line height</div>
+      <Slider
+        className={classes.settingsSlider}
+        min={KARAOKE_MIN_LINE_HEIGHT}
+        max={KARAOKE_MAX_LINE_HEIGHT}
+        step={KARAOKE_LINE_HEIGHT_STEP}
+        value={settings.lineHeight}
+        aria-label="Line height"
+        data-testid="lyrics-line-height-slider"
+        onChange={(_, val) =>
+          onChange({
+            ...settings,
+            lineHeight: clampLineHeight(Array.isArray(val) ? val[0] : val),
+          })
+        }
+      />
+      <span
+        className={classes.settingsSliderValue}
+        data-testid="lyrics-line-height-value"
+      >
+        {formatLineHeight(settings.lineHeight)}
+      </span>
+    </div>
+  </div>
+)
+
+const LyricsSettingsPopover = ({ settings, onChange, onReset }) => {
+  const classes = useStyles()
+  const [anchorEl, setAnchorEl] = useState(null)
+
+  const handleToggle = useCallback((e) => {
+    e.stopPropagation()
+    setAnchorEl((prev) => (prev ? null : e.currentTarget))
+  }, [])
+
+  const handleClose = useCallback(() => setAnchorEl(null), [])
+
+  return (
+    <>
+      <Tooltip title="Appearance">
+        <span>
+          <IconButton
+            className={classes.settingsButton}
+            size="small"
+            onClick={handleToggle}
+            aria-label="Lyrics settings"
+            data-testid="lyrics-settings-button"
+          >
+            <TuneIcon style={{ fontSize: 18 }} />
+          </IconButton>
+        </span>
+      </Tooltip>
+      <Popover
+        open={Boolean(anchorEl)}
+        anchorEl={anchorEl}
+        onClose={handleClose}
+        anchorOrigin={{ vertical: 'top', horizontal: 'center' }}
+        transformOrigin={{ vertical: 'bottom', horizontal: 'center' }}
+        PaperProps={{ className: classes.settingsPanel }}
+        style={{ zIndex: 1500 }}
+      >
+        <div className={classes.settingsHeader}>
+          <Typography className={classes.settingsTitle}>Appearance</Typography>
+          <Tooltip title="Reset appearance">
+            <span>
+              <IconButton
+                className={classes.resetButton}
+                size="small"
+                onClick={onReset}
+                aria-label="Reset appearance"
+                data-testid="lyrics-reset-appearance"
+              >
+                <RestoreIcon style={{ fontSize: 18 }} />
+              </IconButton>
+            </span>
+          </Tooltip>
+        </div>
+        <LineHeightSetting
+          settings={settings}
+          onChange={onChange}
+          classes={classes}
+        />
+        <SettingsSection
+          label="Translation"
+          layer="tr"
+          settings={settings}
+          onChange={onChange}
+          classes={classes}
+        />
+        <SettingsSection
+          label="Main"
+          layer="main"
+          settings={settings}
+          onChange={onChange}
+          classes={classes}
+        />
+        <SettingsSection
+          label="Pronunciation"
+          layer="pr"
+          settings={settings}
+          onChange={onChange}
+          classes={classes}
+        />
+      </Popover>
+    </>
+  )
+}
+
+const easeInOut = (v) => {
+  const clamped = clamp(v, 0, 1)
+  return clamped < 0.5 ? 2 * clamped * clamped : 1 - (-2 * clamped + 2) ** 2 / 2
+}
+
+const getMaxHeightPx = () => {
+  if (typeof window === 'undefined') {
+    return KARAOKE_MAX_HEIGHT_PX
+  }
+  return Math.min(
+    Math.floor(window.innerHeight * KARAOKE_MAX_HEIGHT_RATIO),
+    KARAOKE_MAX_HEIGHT_PX,
+  )
+}
+
+const buildSegmentsFromLine = (line) => {
+  if (!line || !Array.isArray(line.tokens) || line.tokens.length === 0) {
+    return [{ text: line?.value || '', token: null, tokenIndex: -1 }]
+  }
+
+  const text = line.value || ''
+  const exactSegments = (() => {
+    if (!text) {
+      return null
+    }
+
+    const rangedTokens = line.tokens
+      .map((token, tokenIndex) => ({
+        token,
+        tokenIndex,
+        range: utf8ByteRangeToCodeUnitRange(
+          text,
+          token?.byteStart,
+          token?.byteEnd,
+        ),
+      }))
+      .filter((entry) => entry.range != null)
+
+    if (
+      rangedTokens.length !== line.tokens.length ||
+      rangedTokens.length === 0
+    ) {
+      return null
+    }
+
+    rangedTokens.sort(
+      (a, b) =>
+        a.range.start - b.range.start ||
+        a.range.end - b.range.end ||
+        a.tokenIndex - b.tokenIndex,
+    )
+
+    const segments = []
+    let cursor = 0
+    for (const entry of rangedTokens) {
+      if (entry.range.start < cursor) {
+        return null
+      }
+      if (entry.range.start > cursor) {
+        segments.push({
+          text: text.slice(cursor, entry.range.start),
+          token: null,
+          tokenIndex: -1,
+        })
+      }
+      segments.push({
+        text: entry.range.text,
+        token: entry.token,
+        tokenIndex: entry.tokenIndex,
+      })
+      cursor = entry.range.end
+    }
+
+    if (cursor < text.length) {
+      segments.push({
+        text: text.slice(cursor),
+        token: null,
+        tokenIndex: -1,
+      })
+    }
+
+    return segments
+  })()
+  if (exactSegments) {
+    return exactSegments
+  }
+
+  const matchedSegments = []
+  const fallbackSegments = []
+  let cursor = 0
+  let allMatched = text.length > 0
+  let anyMatched = false
+
+  const pushFallbackSeparatorIfNeeded = (nextTokenText) => {
+    if (fallbackSegments.length === 0) {
+      return
+    }
+    const prevText = fallbackSegments[fallbackSegments.length - 1].text || ''
+    if (!prevText || !nextTokenText) {
+      return
+    }
+    if (/\s$/.test(prevText) || /^\s/.test(nextTokenText)) {
+      return
+    }
+    if (/[A-Za-z0-9]$/.test(prevText) && /^[A-Za-z0-9]/.test(nextTokenText)) {
+      fallbackSegments.push({ text: ' ', token: null, tokenIndex: -1 })
+    }
+  }
+
+  for (let tokenIndex = 0; tokenIndex < line.tokens.length; tokenIndex += 1) {
+    const token = line.tokens[tokenIndex]
+    const tokenText = token.value || ''
+    if (!tokenText) {
+      continue
+    }
+
+    pushFallbackSeparatorIfNeeded(tokenText)
+    fallbackSegments.push({ text: tokenText, token, tokenIndex })
+
+    if (!text) {
+      allMatched = false
+      continue
+    }
+
+    const foundAt = text.indexOf(tokenText, cursor)
+    const normalizedFoundAt =
+      foundAt >= 0
+        ? foundAt
+        : text.toLowerCase().indexOf(tokenText.toLowerCase(), cursor)
+
+    if (normalizedFoundAt >= 0) {
+      anyMatched = true
+      if (normalizedFoundAt > cursor) {
+        matchedSegments.push({
+          text: text.slice(cursor, normalizedFoundAt),
+          token: null,
+          tokenIndex: -1,
+        })
+      }
+      const matchedTokenText = text.slice(
+        normalizedFoundAt,
+        normalizedFoundAt + tokenText.length,
+      )
+      matchedSegments.push({
+        text: matchedTokenText || tokenText,
+        token,
+        tokenIndex,
+      })
+      cursor = normalizedFoundAt + tokenText.length
+    } else {
+      allMatched = false
+    }
+  }
+
+  if (allMatched && anyMatched) {
+    if (cursor < text.length) {
+      matchedSegments.push({
+        text: text.slice(cursor),
+        token: null,
+        tokenIndex: -1,
+      })
+    }
+    return matchedSegments
+  }
+
+  if (fallbackSegments.length > 0) {
+    return fallbackSegments
+  }
+
+  return [{ text, token: null, tokenIndex: -1 }]
+}
+
+const getLineRenderWindow = (line, nextLineStart) => {
+  let start = Number.isFinite(Number(line?.start)) ? Number(line.start) : null
+  let end = Number.isFinite(Number(line?.end)) ? Number(line.end) : null
+  const fallbackEnd = Number.isFinite(Number(nextLineStart))
+    ? Number(nextLineStart)
+    : null
+
+  if (end == null) {
+    end = fallbackEnd
+  }
+
+  const tokens = Array.isArray(line?.tokens) ? line.tokens : []
+  if (tokens.length > 0) {
+    const firstWindow = resolveKaraokeTokenWindow(line, 0, nextLineStart)
+    const lastWindow = resolveKaraokeTokenWindow(
+      line,
+      tokens.length - 1,
+      nextLineStart,
+    )
+
+    if (
+      firstWindow.start != null &&
+      (start == null || firstWindow.start < start)
+    ) {
+      start = firstWindow.start
+    }
+    if (lastWindow.end != null && (end == null || lastWindow.end > end)) {
+      end = lastWindow.end
+    }
+  }
+
+  return { start, end }
+}
+
+const shouldSkipLineFrame = (
+  prevPlaybackMs,
+  nextPlaybackMs,
+  line,
+  nextLineStart,
+) => {
+  if (prevPlaybackMs === nextPlaybackMs) {
+    return true
+  }
+
+  const { start, end } = getLineRenderWindow(line, nextLineStart)
+
+  if (start != null) {
+    const activationStart = start - 220
+    if (prevPlaybackMs < activationStart && nextPlaybackMs < activationStart) {
+      return true
+    }
+  }
+
+  if (end != null) {
+    const settleEnd = end + KARAOKE_WORD_SETTLE_MS + 160
+    if (prevPlaybackMs > settleEnd && nextPlaybackMs > settleEnd) {
+      return true
+    }
+  }
+
+  return false
+}
+
+const areLineStylesEqual = (prevStyle, nextStyle) => {
+  const a = prevStyle || {}
+  const b = nextStyle || {}
+  return (
+    a.opacity === b.opacity &&
+    a.color === b.color &&
+    a.fontSize === b.fontSize &&
+    a.fontWeight === b.fontWeight &&
+    a.lineHeight === b.lineHeight &&
+    a.maxWidth === b.maxWidth
+  )
+}
+
+const parseColorRGB = (rgba) => {
+  const m = (rgba || '').match(/rgba?\((\d+),\s*(\d+),\s*(\d+)/)
+  return m ? [parseInt(m[1]), parseInt(m[2]), parseInt(m[3])] : [255, 255, 255]
+}
+
+const buildTokenWipeStyle = ({
+  fillProgress,
+  highlightAlpha,
+  futureAlpha,
+  rgb,
+}) => {
+  const [r, g, b] = rgb || [255, 255, 255]
+  const fillPct = clamp(fillProgress, 0, 1) * 100
+  const doneColor = `rgba(${r}, ${g}, ${b}, ${clamp(highlightAlpha, TOKEN_DONE_ALPHA, TOKEN_ACTIVE_ALPHA)})`
+  const futureColor = `rgba(${r}, ${g}, ${b}, ${futureAlpha})`
+
+  if (fillPct <= 0) {
+    return { color: futureColor, textShadow: 'none' }
+  }
+
+  const edgeStart = clamp(fillPct - TOKEN_WIPE_EDGE_PCT, 0, 100)
+  const softEnd = clamp(fillPct + TOKEN_WIPE_SOFT_SPREAD_PCT, 0, 100)
+  return {
+    color: 'transparent',
+    WebkitTextFillColor: 'transparent',
+    backgroundImage: `linear-gradient(90deg, ${doneColor} 0%, ${doneColor} ${edgeStart}%, ${doneColor} ${fillPct}%, ${futureColor} ${softEnd}%, ${futureColor} 100%)`,
+    backgroundClip: 'text',
+    WebkitBackgroundClip: 'text',
+    textShadow: 'none',
+  }
+}
+
+// Renders one lyric line as a row of spans, one per segment returned by
+// `buildSegmentsFromLine`. Token-backed segments get a wipe style derived from
+// `renderPlaybackMs`; plain segments, and all segments when `highlightTokens`
+// is false, are rendered as unstyled text.
+//
+// The component is memoised with a custom comparator (second argument to
+// `memo`) so that a playback-time change alone does not always re-render it.
+const KaraokeLineRow = memo(
+  ({
+    line,
+    nextLineStart,
+    renderPlaybackMs,
+    className,
+    style,
+    tokenClassName,
+    highlightTokens = true,
+  }) => {
+    const segments = buildSegmentsFromLine(line)
+    // Token colours reuse the RGB of the line's own colour; white if unset.
+    const tokenRGB = useMemo(
+      () => (style?.color ? parseColorRGB(style.color) : [255, 255, 255]),
+      [style?.color],
+    )
+
+    return (
+      <Typography className={className} component="div" style={style}>
+        {segments.map((segment, idx) => {
+          if (!segment.token) {
+            return <span key={`text-${idx}`}>{segment.text}</span>
+          }
+
+          if (!highlightTokens) {
+            return <span key={`token-plain-${idx}`}>{segment.text}</span>
+          }
+
+          const { start: tokenStart, end: tokenEnd } =
+            resolveKaraokeTokenWindow(line, segment.tokenIndex, nextLineStart)
+
+          // A token without a known end is never "done"; one without a known
+          // start is never "active".
+          const isDone = tokenEnd != null ? renderPlaybackMs >= tokenEnd : false
+          const isActive =
+            !isDone && tokenStart != null && renderPlaybackMs >= tokenStart
+
+          // Fraction of the token's window that has elapsed. Falls back to
+          // 1 (done) or 0 when the window is missing or not positive.
+          const progress =
+            isDone ||
+            tokenStart == null ||
+            tokenEnd == null ||
+            tokenEnd <= tokenStart
+              ? isDone
+                ? 1
+                : 0
+              : clamp(
+                  (renderPlaybackMs - tokenStart) / (tokenEnd - tokenStart),
+                  0,
+                  1,
+                )
+
+          // True for KARAOKE_WORD_SETTLE_MS after the token ends.
+          const justEnded =
+            tokenEnd != null &&
+            renderPlaybackMs > tokenEnd &&
+            renderPlaybackMs <= tokenEnd + KARAOKE_WORD_SETTLE_MS
+
+          const settleProgress =
+            justEnded && tokenEnd != null
+              ? clamp(
+                  (renderPlaybackMs - tokenEnd) / KARAOKE_WORD_SETTLE_MS,
+                  0,
+                  1,
+                )
+              : 0
+
+          // Alpha ramps future -> active while the token plays, then active ->
+          // done during the settle period. The settle branch runs last and so
+          // overrides the plain "done" alpha.
+          let alpha = TOKEN_FUTURE_ALPHA
+          if (isDone) {
+            alpha = TOKEN_DONE_ALPHA
+          } else if (isActive) {
+            alpha = lerp(
+              TOKEN_FUTURE_ALPHA,
+              TOKEN_ACTIVE_ALPHA,
+              easeInOut(progress),
+            )
+          }
+          if (justEnded) {
+            alpha = lerp(
+              TOKEN_ACTIVE_ALPHA,
+              TOKEN_DONE_ALPHA,
+              easeInOut(settleProgress),
+            )
+          }
+          alpha = clamp(alpha, TOKEN_FUTURE_ALPHA, TOKEN_ACTIVE_ALPHA)
+          const fillProgress = isDone ? 1 : isActive ? progress : 0
+          // Tokens with role 'bg' are drawn dimmer (x0.72) and in italics.
+          const isBgRole = segment.token?.role === 'bg'
+
+          return (
+            <span
+              key={`token-${idx}-${tokenStart ?? 'na'}`}
+              className={tokenClassName}
+              style={{
+                ...buildTokenWipeStyle({
+                  fillProgress,
+                  highlightAlpha: isBgRole ? alpha * 0.72 : alpha,
+                  futureAlpha: isBgRole
+                    ? TOKEN_FUTURE_ALPHA * 0.72
+                    : TOKEN_FUTURE_ALPHA,
+                  rgb: tokenRGB,
+                }),
+                ...(isBgRole ? { fontStyle: 'italic' } : undefined),
+              }}
+            >
+              {segment.text}
+            </span>
+          )
+        })}
+      </Typography>
+    )
+  },
+  // memo comparator: returning true skips the re-render.
+  (prevProps, nextProps) => {
+    // Any change other than the playback time always re-renders. `line` is
+    // compared by identity and `style` by the fields in areLineStylesEqual.
+    if (
+      prevProps.line !== nextProps.line ||
+      prevProps.nextLineStart !== nextProps.nextLineStart ||
+      prevProps.className !== nextProps.className ||
+      prevProps.tokenClassName !== nextProps.tokenClassName ||
+      prevProps.highlightTokens !== nextProps.highlightTokens ||
+      !areLineStylesEqual(prevProps.style, nextProps.style)
+    ) {
+      return false
+    }
+
+    // Only the playback time moved: skip when it stays outside the line's
+    // render window.
+    return shouldSkipLineFrame(
+      prevProps.renderPlaybackMs,
+      nextProps.renderPlaybackMs,
+      nextProps.line,
+      nextProps.nextLineStart,
+    )
+  },
+)
+
+KaraokeLineRow.displayName = 'KaraokeLineRow'
+
+const KaraokeLyricsOverlay = ({
+  visible,
+  mainLyric,
+  translationLyric,
+  pronunciationLyric,
+  showTranslation,
+  showPronunciation,
+  translationEnabled,
+  pronunciationEnabled,
+  onToggleTranslation,
+  onTogglePronunciation,
+  audioInstance,
+  onClose,
+  inline = false,
+}) => {
+  const classes = useStyles()
+  const theme = useTheme()
+  const isDark = theme.palette.type === 'dark'
+  const [playbackMs, setPlaybackMs] = useState(0)
+  const [maxHeightPx, setMaxHeightPx] = useState(getMaxHeightPx())
+  const [bodyViewportHeight, setBodyViewportHeight] = useState(0)
+  const [isCompact, setIsCompact] = useState(
+    typeof window !== 'undefined' ? window.innerWidth <= 810 : false,
+  )
+  const [lyricsSettings, setLyricsSettings] = useState(loadLyricsSettings)
+
+  const handleSettingsChange = useCallback((next) => {
+    const normalized = normalizeLyricsSettings(next)
+    setLyricsSettings(normalized)
+    saveLyricsSettings(normalized)
+  }, [])
+
+  const handleResetAppearance = useCallback(() => {
+    const defaults = createDefaultLyricsSettings(isDark)
+    setLyricsSettings(defaults)
+    saveLyricsSettings(defaults)
+  }, [isDark])
+
+  const bodyRef = useRef(null)
+  const activeLineRef = useRef(null)
+
+  const mainLines = useMemo(() => buildKaraokeLines(mainLyric), [mainLyric])
+  const translationLines = useMemo(
+    () => buildKaraokeLines(translationLyric),
+    [translationLyric],
+  )
+  const pronunciationLines = useMemo(
+    () => buildKaraokeLines(pronunciationLyric),
+    [pronunciationLyric],
+  )
+  const overlayHeight = clamp(
+    lyricsSettings.overlayHeight,
+    KARAOKE_MIN_HEIGHT_PX,
+    maxHeightPx,
+  )
+
+  useEffect(() => {
+    const onResize = () => {
+      const nextMaxHeight = getMaxHeightPx()
+      setIsCompact(window.innerWidth <= 810)
+      setMaxHeightPx(nextMaxHeight)
+    }
+
+    onResize()
+    window.addEventListener('resize', onResize)
+    return () => window.removeEventListener('resize', onResize)
+  }, [])
+
+  useEffect(() => {
+    setLyricsSettings((prev) => {
+      const currentColor = prev.main.colorKey
+      const shouldSwap =
+        (isDark && currentColor === 'black') ||
+        (!isDark && currentColor === 'white')
+      if (!shouldSwap) return prev
+      const newColorKey = isDark ? 'white' : 'black'
+      const updated = {
+        ...prev,
+        main: { ...prev.main, colorKey: newColorKey },
+      }
+      saveLyricsSettings(updated)
+      return updated
+    })
+  }, [isDark])
+
+  useEffect(() => {
+    const body = bodyRef.current
+    if (!body) {
+      return undefined
+    }
+
+    const updateViewportHeight = () => {
+      setBodyViewportHeight(body.clientHeight || 0)
+    }
+
+    updateViewportHeight()
+
+    if (typeof ResizeObserver !== 'undefined') {
+      const observer = new ResizeObserver(updateViewportHeight)
+      observer.observe(body)
+      return () => observer.disconnect()
+    }
+
+    window.addEventListener('resize', updateViewportHeight)
+    return () => window.removeEventListener('resize', updateViewportHeight)
+  }, [overlayHeight, isCompact, showTranslation, showPronunciation, visible])
+
+  const onResizeStart = useCallback(
+    (event) => {
+      if (isCompact) {
+        return
+      }
+
+      event.preventDefault()
+      const startY = event.clientY
+      const startHeight = overlayHeight
+
+      const onMove = (moveEvent) => {
+        const delta = startY - moveEvent.clientY
+        handleSettingsChange({
+          ...lyricsSettings,
+          overlayHeight: clamp(
+            startHeight + delta,
+            KARAOKE_MIN_HEIGHT_PX,
+            maxHeightPx,
+          ),
+        })
+      }
+
+      const onUp = () => {
+        window.removeEventListener('mousemove', onMove)
+        window.removeEventListener('mouseup', onUp)
+      }
+
+      window.addEventListener('mousemove', onMove)
+      window.addEventListener('mouseup', onUp)
+    },
+    [
+      handleSettingsChange,
+      isCompact,
+      lyricsSettings,
+      maxHeightPx,
+      overlayHeight,
+    ],
+  )
+
+  useEffect(() => {
+    if (!visible || !audioInstance) {
+      setPlaybackMs(0)
+      return
+    }
+
+    let rafId = 0
+    let cancelled = false
+    let anchorAudioMs = 0
+    let anchorPerfMs = 0
+    let lastRenderMs = 0
+
+    const readPlaybackMs = () => {
+      const seconds = Number(audioInstance.currentTime)
+      if (!Number.isFinite(seconds) || seconds < 0) {
+        return 0
+      }
+      return seconds * 1000
+    }
+
+    const resetAnchor = (perfNow, observedMs) => {
+      anchorAudioMs = observedMs
+      anchorPerfMs = perfNow
+    }
+
+    const tick = () => {
+      if (cancelled) {
+        return
+      }
+
+      const observedMs = readPlaybackMs()
+      const perfNow = performance.now()
+      const playbackRate = Number(audioInstance.playbackRate)
+      const canInterpolate =
+        !audioInstance.paused &&
+        !audioInstance.seeking &&
+        Number.isFinite(playbackRate) &&
+        playbackRate > 0
+
+      let nowMs = observedMs
+
+      if (!canInterpolate) {
+        resetAnchor(perfNow, observedMs)
+      } else if (anchorPerfMs === 0) {
+        resetAnchor(perfNow, observedMs)
+      } else {
+        const predicted =
+          anchorAudioMs + (perfNow - anchorPerfMs) * playbackRate
+        const drift = observedMs - predicted
+        if (Math.abs(drift) > KARAOKE_CLOCK_DRIFT_RESET_MS) {
+          nowMs = observedMs
+          resetAnchor(perfNow, observedMs)
+        } else {
+          nowMs = predicted
+        }
+      }
+
+      const backwardsDrift = lastRenderMs - nowMs
+      if (canInterpolate && backwardsDrift > 0) {
+        nowMs = lastRenderMs
+      }
+
+      if (canInterpolate && backwardsDrift > KARAOKE_CLOCK_RESET_THRESHOLD_MS) {
+        resetAnchor(perfNow, observedMs)
+      } else if (
+        !canInterpolate &&
+        backwardsDrift > 0 &&
+        backwardsDrift <= KARAOKE_MONOTONIC_JITTER_MS
+      ) {
+        nowMs = lastRenderMs
+      }
+
+      nowMs = Math.max(0, nowMs)
+      lastRenderMs = nowMs
+
+      setPlaybackMs((prev) =>
+        Math.abs(prev - nowMs) >= KARAOKE_RENDER_UPDATE_EPSILON_MS
+          ? nowMs
+          : prev,
+      )
+      rafId = window.requestAnimationFrame(tick)
+    }
+
+    const initialMs = readPlaybackMs()
+    resetAnchor(performance.now(), initialMs)
+    lastRenderMs = initialMs
+    setPlaybackMs(initialMs)
+    rafId = window.requestAnimationFrame(tick)
+
+    return () => {
+      cancelled = true
+      if (rafId) {
+        window.cancelAnimationFrame(rafId)
+      }
+    }
+  }, [audioInstance, visible])
+
+  const renderPlaybackMs = playbackMs + KARAOKE_RENDER_LEAD_MS
+  const hasTimedMainLines = useMemo(
+    () => hasUsableKaraokeTiming(mainLines),
+    [mainLines],
+  )
+
+  const { lineIndex } = useMemo(
+    () =>
+      hasTimedMainLines
+        ? getActiveKaraokeState(mainLines, renderPlaybackMs)
+        : { lineIndex: -1, tokenIndex: -1 },
+    [hasTimedMainLines, mainLines, renderPlaybackMs],
+  )
+
+  const activeIndex = hasTimedMainLines && lineIndex >= 0 ? lineIndex : -1
+  const lineHeight = lyricsSettings.lineHeight
+  const lineGap = getLineGapPx(lineHeight)
+  const languageBadges = buildLanguageBadges({
+    mainLyric,
+    translationLyric,
+    pronunciationLyric,
+    showTranslation,
+    showPronunciation,
+    translationEnabled,
+    pronunciationEnabled,
+  })
+
+  const trByMainIndex = useMemo(() => {
+    if (!showTranslation || translationLines.length === 0) return {}
+    const map = {}
+    for (let i = 0; i < mainLines.length; i++) {
+      const { line } = resolveLayerLineForMain(mainLines, translationLines, i)
+      if (line) map[i] = line
+    }
+    return map
+  }, [mainLines, translationLines, showTranslation])
+
+  const prByMainIndex = useMemo(() => {
+    if (!showPronunciation || pronunciationLines.length === 0) return {}
+    const map = {}
+    for (let i = 0; i < mainLines.length; i++) {
+      const { line } = resolveLayerLineForMain(mainLines, pronunciationLines, i)
+      if (line) map[i] = line
+    }
+    return map
+  }, [mainLines, pronunciationLines, showPronunciation])
+
+  const hasTranslationLine = showTranslation && translationLines.length > 0
+  const hasPronunciationLine =
+    showPronunciation && pronunciationLines.length > 0
+  const measuredViewportHeight = bodyRef.current?.clientHeight || 0
+  const estimatedViewportHeight =
+    measuredViewportHeight > 0
+      ? measuredViewportHeight
+      : bodyViewportHeight > 0
+        ? bodyViewportHeight
+        : isCompact
+          ? 260
+          : Math.max(220, overlayHeight - 170)
+  const centerSpacerPx = Math.max(
+    hasTimedMainLines ? KARAOKE_CENTER_SPACER_MIN_PX : 0,
+    hasTimedMainLines
+      ? Math.floor(estimatedViewportHeight * KARAOKE_CENTER_SPACER_RATIO)
+      : 0,
+  )
+
+  useEffect(() => {
+    if (!visible || !hasTimedMainLines) {
+      return
+    }
+
+    let animFrameId = null
+    let scrollAnimId = null
+
+    animFrameId = window.requestAnimationFrame(() => {
+      const body = bodyRef.current
+      const activeNode = activeLineRef.current
+      if (!body || !activeNode) {
+        return
+      }
+
+      const bodyRect = body.getBoundingClientRect()
+      const activeRect = activeNode.getBoundingClientRect()
+      const deltaWithinBody =
+        activeRect.top -
+        bodyRect.top -
+        (body.clientHeight - activeRect.height) / 2
+      const maxTop = Math.max(0, body.scrollHeight - body.clientHeight)
+      const targetTop = clamp(body.scrollTop + deltaWithinBody, 0, maxTop)
+      const distance = targetTop - body.scrollTop
+
+      if (Math.abs(distance) < 2) {
+        return
+      }
+
+      const startTop = body.scrollTop
+      const duration = 400
+      const startTime = performance.now()
+
+      const easeOutCubic = (t) => 1 - Math.pow(1 - t, 3)
+
+      const step = (now) => {
+        const elapsed = now - startTime
+        const progress = Math.min(elapsed / duration, 1)
+        const eased = easeOutCubic(progress)
+        body.scrollTop = startTop + distance * eased
+        if (progress < 1) {
+          scrollAnimId = window.requestAnimationFrame(step)
+        }
+      }
+
+      scrollAnimId = window.requestAnimationFrame(step)
+    })
+
+    return () => {
+      if (animFrameId) window.cancelAnimationFrame(animFrameId)
+      if (scrollAnimId) window.cancelAnimationFrame(scrollAnimId)
+    }
+  }, [
+    centerSpacerPx,
+    hasTimedMainLines,
+    hasPronunciationLine,
+    hasTranslationLine,
+    lineIndex,
+    overlayHeight,
+    visible,
+  ])
+
+  if (
+    !visible ||
+    !hasStructuredLyricContent(mainLyric) ||
+    mainLines.length === 0
+  ) {
+    return null
+  }
+
+  const getMainLineStyle = (idx) => {
+    const [r, g, b] = parseColorRGB(getColorValue(lyricsSettings.main.colorKey))
+    if (!hasTimedMainLines) {
+      return {
+        opacity: 1,
+        color: `rgba(${r}, ${g}, ${b}, 0.98)`,
+        fontSize: lyricsSettings.main.fontSize,
+        lineHeight,
+      }
+    }
+
+    const delta = idx - activeIndex
+    const isActive = delta === 0
+    let opacity = isActive ? 1 : delta < 0 ? 0.6 : 0.72
+    const color = isActive
+      ? `rgba(${r}, ${g}, ${b}, 0.98)`
+      : delta < 0
+        ? `rgba(${r}, ${g}, ${b}, 0.4)`
+        : `rgba(${r}, ${g}, ${b}, 0.54)`
+
+    if (delta > 1) {
+      const level = clamp(delta, 1, 6)
+      opacity = Math.max(0.36, 0.74 - level * 0.08)
+    }
+
+    if (delta < -1) {
+      const level = clamp(Math.abs(delta), 1, 6)
+      opacity = Math.max(0.28, 0.62 - level * 0.08)
+    }
+
+    const baseFontSize = lyricsSettings.main.fontSize
+    const fontSize = isActive
+      ? baseFontSize
+      : Math.round(baseFontSize * KARAOKE_MAIN_INACTIVE_FONT_FACTOR)
+
+    return {
+      opacity,
+      color,
+      fontSize,
+      lineHeight,
+      maxWidth: isActive
+        ? '100%'
+        : `${Math.round(KARAOKE_MAIN_INACTIVE_FONT_FACTOR * 100)}%`,
+    }
+  }
+
+  const getAuxLineStyle = (idx, layerKey) => {
+    const [r, g, b] = parseColorRGB(
+      getColorValue(lyricsSettings[layerKey].colorKey),
+    )
+    const baseFontSize = lyricsSettings[layerKey].fontSize
+    if (!hasTimedMainLines) {
+      return {
+        opacity: 0.94,
+        fontSize: baseFontSize,
+        color: `rgba(${r}, ${g}, ${b}, 0.94)`,
+        lineHeight: KARAOKE_AUX_LINE_HEIGHT,
+      }
+    }
+
+    const delta = idx - activeIndex
+    const isActive = delta === 0
+
+    let opacity = isActive ? 0.94 : delta < 0 ? 0.5 : 0.62
+    const color = isActive
+      ? `rgba(${r}, ${g}, ${b}, 0.94)`
+      : delta < 0
+        ? `rgba(${r}, ${g}, ${b}, 0.42)`
+        : `rgba(${r}, ${g}, ${b}, 0.56)`
+
+    if (delta > 1) {
+      const level = clamp(delta, 1, 6)
+      opacity = Math.max(0.28, 0.64 - level * 0.08)
+    }
+
+    if (delta < -1) {
+      const level = clamp(Math.abs(delta), 1, 6)
+      opacity = Math.max(0.22, 0.5 - level * 0.08)
+    }
+
+    const fontSize = isActive
+      ? baseFontSize
+      : Math.round(baseFontSize * KARAOKE_AUX_INACTIVE_FONT_FACTOR)
+
+    return {
+      opacity,
+      fontSize,
+      color,
+      lineHeight: KARAOKE_AUX_LINE_HEIGHT,
+      maxWidth: isActive
+        ? '100%'
+        : `${Math.round(KARAOKE_AUX_INACTIVE_FONT_FACTOR * 100)}%`,
+    }
+  }
+
+  const overlayStyle = inline
+    ? undefined
+    : isCompact
+    ? undefined
+    : {
+        height: overlayHeight,
+        maxHeight: maxHeightPx,
+      }
+
+  return (
+    <div
+      className={clsx(classes.overlay, {
+        [classes.overlayInline]: inline,
+      })}
+      data-testid="karaoke-lyrics-overlay"
+      data-inline={inline ? 'true' : 'false'}
+      style={overlayStyle}
+      onClick={inline ? (event) => event.stopPropagation() : undefined}
+    >
+      {!inline && (
+        <div
+          className={classes.resizeHandle}
+          onMouseDown={onResizeStart}
+          data-testid="lyrics-resize-handle"
+        />
+      )}
+
+      <div
+        className={clsx(classes.header, {
+          [classes.headerInline]: inline,
+        })}
+      >
+        <div className={classes.headerLeft}>
+          <div className={classes.languageBadges}>
+            {languageBadges.map((badge) => {
+              const badgeEl = (
+                <div
+                  key={badge.key}
+                  className={clsx(classes.languageBadge, {
+                    [classes.languageBadgeActive]: badge.active,
+                    [classes.languageBadgeToggle]: badge.toggleable,
+                  })}
+                  data-testid={`lyrics-language-badge-${badge.key}`}
+                  role={badge.toggleable ? 'button' : undefined}
+                  tabIndex={badge.toggleable ? 0 : undefined}
+                  onClick={
+                    badge.toggleable
+                      ? badge.key === 'tr'
+                        ? onToggleTranslation
+                        : onTogglePronunciation
+                      : undefined
+                  }
+                  onKeyDown={
+                    badge.toggleable
+                      ? (e) => {
+                          if (e.key === 'Enter' || e.key === ' ') {
+                            e.preventDefault()
+                            ;(badge.key === 'tr'
+                              ? onToggleTranslation
+                              : onTogglePronunciation)()
+                          }
+                        }
+                      : undefined
+                  }
+                >
+                  <span className={classes.languageBadgeLabel}>
+                    {badge.label}
+                  </span>
+                  <span className={classes.languageBadgeValue}>{badge.lang}</span>
+                </div>
+              )
+              return badge.toggleable ? (
+                <Tooltip key={badge.key} title={badge.tooltip}>
+                  {badgeEl}
+                </Tooltip>
+              ) : badgeEl
+            })}
+          </div>
+        </div>
+
+        <div style={{ display: 'flex', alignItems: 'center', gap: 2 }}>
+          <LyricsSettingsPopover
+            settings={lyricsSettings}
+            onChange={handleSettingsChange}
+            onReset={handleResetAppearance}
+          />
+          <IconButton
+            className={classes.closeButton}
+            size="small"
+            onClick={onClose}
+            aria-label="Close lyrics"
+          >
+            <CloseIcon fontSize="small" />
+          </IconButton>
+        </div>
+      </div>
+
+      <div className={classes.bodyWrapper}>
+      <div
+        className={clsx(classes.body, {
+          [classes.bodyInline]: inline,
+        })}
+        ref={bodyRef}
+      >
+        <div className={classes.lines} style={{ gap: lineGap }}>
+          <div aria-hidden style={{ height: centerSpacerPx }} />
+          {mainLines.map((line, idx) => {
+            const trLine = trByMainIndex[idx]
+            const prLine = prByMainIndex[idx]
+            const mainNextLineStart = mainLines[idx + 1]?.start ?? null
+            const highlightedMainLine = buildHighlightedMainLine(
+              line,
+              mainNextLineStart,
+            )
+            const highlightedTrLine = buildHighlightedAuxLine(
+              line,
+              trLine,
+              mainNextLineStart,
+            )
+            const highlightedPrLine = buildHighlightedAuxLine(
+              line,
+              prLine,
+              mainNextLineStart,
+            )
+            const showTr = shouldShowAuxLine(line, trLine)
+            const showPr = shouldShowAuxLine(line, prLine)
+            const lineStyle = getMainLineStyle(idx)
+            const trStyle = getAuxLineStyle(idx, 'tr')
+            const prStyle = getAuxLineStyle(idx, 'pr')
+            return (
+              <div
+                key={`line-${line.index}-${line.start ?? idx}`}
+                ref={
+                  idx === activeIndex && hasTimedMainLines
+                    ? activeLineRef
+                    : null
+                }
+                className={classes.lineGroup}
+                style={{ cursor: line.start != null ? 'pointer' : undefined }}
+                onClick={() => {
+                  if (audioInstance && line.start != null) {
+                    audioInstance.currentTime = line.start / 1000
+                  }
+                }}
+              >
+                <KaraokeLineRow
+                  line={highlightedMainLine}
+                  nextLineStart={mainNextLineStart}
+                  renderPlaybackMs={renderPlaybackMs}
+                  className={classes.line}
+                  style={lineStyle}
+                  tokenClassName={classes.token}
+                  highlightTokens={hasTimedMainLines}
+                />
+                {showPr && (
+                  <KaraokeLineRow
+                    line={highlightedPrLine}
+                    nextLineStart={null}
+                    renderPlaybackMs={renderPlaybackMs}
+                    className={classes.inlinePr}
+                    style={prStyle}
+                    tokenClassName={classes.token}
+                    highlightTokens={hasTimedMainLines}
+                  />
+                )}
+                {showTr && (
+                  <KaraokeLineRow
+                    line={highlightedTrLine}
+                    nextLineStart={null}
+                    renderPlaybackMs={renderPlaybackMs}
+                    className={classes.inlineTr}
+                    style={trStyle}
+                    tokenClassName={classes.token}
+                    highlightTokens={hasTimedMainLines}
+                  />
+                )}
+              </div>
+            )
+          })}
+          <div aria-hidden style={{ height: centerSpacerPx }} />
+        </div>
+      </div>
+      </div>
+    </div>
+  )
+}
+
+export default KaraokeLyricsOverlay
diff --git a/ui/src/audioplayer/KaraokeLyricsOverlay.test.jsx b/ui/src/audioplayer/KaraokeLyricsOverlay.test.jsx
new file mode 100644
index 000000000..dba354363
--- /dev/null
+++ b/ui/src/audioplayer/KaraokeLyricsOverlay.test.jsx
@@ -0,0 +1,514 @@
+import React from 'react'
+import {
+  cleanup,
+  fireEvent,
+  render,
+  screen,
+  waitFor,
+} from '@testing-library/react'
+import KaraokeLyricsOverlay from './KaraokeLyricsOverlay'
+
+const DEFAULT_LINE_HEIGHT_TEXT = '1.30'
+const NEXT_LINE_HEIGHT_TEXT = '1.32'
+
+const audioInstance = {
+  currentTime: 0,
+  paused: true,
+  seeking: false,
+  playbackRate: 1,
+}
+
+// Builds a synced lyric fixture for the given kind and language that holds
+// exactly one line: `value`, with `start: 1000`.
+const buildLyric = (kind, lang, value) => {
+  const line = [{ start: 1000, value }]
+  return { kind, lang, synced: true, line }
+}
+
+// Renders the overlay with default fixture props; `props` entries override.
+const renderOverlay = (props = {}) => {
+  const defaults = {
+    visible: true,
+    mainLyric: buildLyric('main', 'ja', 'こんにちは'),
+    translationLyric: buildLyric('translation', 'en', 'Hello'),
+    pronunciationLyric: buildLyric('pronunciation', 'ja-Latn', 'konnichiwa'),
+    showTranslation: false,
+    showPronunciation: true,
+    translationEnabled: true,
+    pronunciationEnabled: true,
+    onToggleTranslation: () => {},
+    onTogglePronunciation: () => {},
+    audioInstance,
+    onClose: () => {},
+  }
+  return render(<KaraokeLyricsOverlay {...defaults} {...props} />)
+}
+
+describe('<KaraokeLyricsOverlay /> behavior', () => {
+  beforeEach(() => {
+    localStorage.clear()
+    window.innerWidth = 1200
+    window.innerHeight = 900
+    vi.spyOn(window, 'requestAnimationFrame').mockImplementation(() => 1)
+    vi.spyOn(window, 'cancelAnimationFrame').mockImplementation(() => {})
+  })
+
+  afterEach(() => {
+    vi.restoreAllMocks()
+    cleanup()
+  })
+
+  it('shows tooltips for translation, pronunciation, and appearance controls', async () => {
+    renderOverlay()
+
+    fireEvent.mouseOver(screen.getByTestId('lyrics-language-badge-tr'))
+    expect(await screen.findByText('Show translation')).toBeInTheDocument()
+
+    fireEvent.mouseOver(screen.getByTestId('lyrics-language-badge-pr'))
+    expect(await screen.findByText('Hide pronunciation')).toBeInTheDocument()
+
+    fireEvent.mouseOver(screen.getByTestId('lyrics-settings-button'))
+    expect(await screen.findByText('Appearance')).toBeInTheDocument()
+  })
+
+  it('renders inline mode without the desktop resize handle', () => {
+    renderOverlay({ inline: true })
+
+    expect(screen.getByTestId('karaoke-lyrics-overlay')).toHaveAttribute(
+      'data-inline',
+      'true',
+    )
+    expect(screen.queryByTestId('lyrics-resize-handle')).not.toBeInTheDocument()
+  })
+
+  it('renders the appearance popup with Main label and default line height for older settings', async () => {
+    localStorage.setItem(
+      'karaoke-lyrics-settings',
+      JSON.stringify({
+        tr: { fontSize: 16, colorKey: 'blue' },
+        main: { fontSize: 26, colorKey: 'white' },
+        pr: { fontSize: 15, colorKey: 'green' },
+      }),
+    )
+
+    renderOverlay()
+
+    fireEvent.click(screen.getByTestId('lyrics-settings-button'))
+
+    expect(await screen.findByText('Appearance')).toBeInTheDocument()
+    expect(screen.getByText('Main', { selector: 'div' })).toBeInTheDocument()
+    expect(screen.queryByText('Default')).not.toBeInTheDocument()
+    expect(screen.getByTestId('lyrics-reset-appearance')).toBeInTheDocument()
+    expect(screen.getByTestId('lyrics-line-height-value')).toHaveTextContent(
+      DEFAULT_LINE_HEIGHT_TEXT,
+    )
+  })
+
+  it('renders the lyric group in main, pronunciation, translation order with layer badges', () => {
+    renderOverlay({
+      showTranslation: true,
+      showPronunciation: true,
+    })
+
+    const mainLine = screen.getByText('こんにちは')
+    const pronunciationLine = screen.getByText('konnichiwa')
+    const translationLine = screen.getByText('Hello')
+
+    expect(
+      mainLine.compareDocumentPosition(pronunciationLine) &
+        Node.DOCUMENT_POSITION_FOLLOWING,
+    ).toBeTruthy()
+    expect(
+      pronunciationLine.compareDocumentPosition(translationLine) &
+        Node.DOCUMENT_POSITION_FOLLOWING,
+    ).toBeTruthy()
+
+    expect(screen.getByTestId('lyrics-language-badge-main')).toHaveTextContent(
+      'Mainja',
+    )
+    expect(screen.getByTestId('lyrics-language-badge-pr')).toHaveTextContent(
+      'PRja-Latn',
+    )
+    expect(screen.getByTestId('lyrics-language-badge-tr')).toHaveTextContent(
+      'TRen',
+    )
+  })
+
+  it('renders line-timed rows as whole-line spans without synthetic token splits', () => {
+    renderOverlay({
+      mainLyric: {
+        kind: 'main',
+        lang: 'en',
+        synced: true,
+        line: [
+          { start: 1000, end: 2400, value: 'Batter up, batter up, batter up' },
+        ],
+      },
+      translationLyric: {
+        kind: 'translation',
+        lang: 'ja',
+        synced: true,
+        line: [
+          {
+            start: 1000,
+            end: 2400,
+            value: 'バッターアップ、バッターアップ、バッターアップ',
+          },
+        ],
+      },
+      pronunciationLyric: {
+        kind: 'pronunciation',
+        lang: 'ja-Latn',
+        synced: true,
+        line: [
+          {
+            start: 1000,
+            end: 2400,
+            value: 'Battaa appu, battaa appu, battaa appu',
+          },
+        ],
+      },
+      showTranslation: true,
+      showPronunciation: true,
+    })
+
+    const mainLine = screen.getByText(
+      'Batter up, batter up, batter up',
+    ).parentElement
+    const pronunciationLine = screen.getByText(
+      'Battaa appu, battaa appu, battaa appu',
+    ).parentElement
+    const translationLine = screen.getByText(
+      'バッターアップ、バッターアップ、バッターアップ',
+    ).parentElement
+
+    expect(mainLine.querySelectorAll('span')).toHaveLength(1)
+    expect(pronunciationLine.querySelectorAll('span')).toHaveLength(1)
+    expect(translationLine.querySelectorAll('span')).toHaveLength(1)
+  })
+
+  it('uses cue byte offsets to segment repeated words in the karaoke line', () => {
+    renderOverlay({
+      mainLyric: {
+        kind: 'main',
+        lang: 'en',
+        synced: true,
+        line: [{ start: 0, end: 2400, value: 'Oh love love me tonight' }],
+        cueLine: [
+          {
+            index: 0,
+            start: 0,
+            end: 2400,
+            value: 'Oh love love me tonight',
+            cue: [
+              { start: 0, end: 300, value: 'Oh', byteStart: 0, byteEnd: 1 },
+              {
+                start: 900,
+                end: 1300,
+                value: 'love',
+                byteStart: 8,
+                byteEnd: 11,
+              },
+              {
+                start: 1300,
+                end: 1600,
+                value: 'me',
+                byteStart: 13,
+                byteEnd: 14,
+              },
+              {
+                start: 1600,
+                end: 2400,
+                value: 'tonight',
+                byteStart: 16,
+                byteEnd: 22,
+              },
+            ],
+          },
+        ],
+      },
+      translationLyric: null,
+      pronunciationLyric: null,
+      showTranslation: false,
+      showPronunciation: false,
+      translationEnabled: false,
+      pronunciationEnabled: false,
+      audioInstance: {
+        ...audioInstance,
+        currentTime: 1.0,
+      },
+    })
+
+    const mainLine = screen.getByText('Oh').parentElement
+    const segments = Array.from(mainLine.querySelectorAll('span')).map(
+      (span) => span.textContent,
+    )
+
+    expect(segments).toEqual([
+      'Oh',
+      ' love ',
+      'love',
+      ' ',
+      'me',
+      ' ',
+      'tonight',
+    ])
+  })
+
+  it('uses cue byte offsets to preserve explicit space cues in multibyte karaoke lines', () => {
+    renderOverlay({
+      mainLyric: {
+        kind: 'main',
+        lang: 'ko',
+        synced: true,
+        line: [{ start: 0, end: 900, value: '눈을 뜬 순간' }],
+        cueLine: [
+          {
+            index: 0,
+            start: 0,
+            end: 900,
+            value: '눈을 뜬 순간',
+            cue: [
+              { start: 0, end: 150, value: '눈을', byteStart: 0, byteEnd: 5 },
+              { start: 150, end: 250, value: ' ', byteStart: 6, byteEnd: 6 },
+              { start: 250, end: 450, value: '뜬', byteStart: 7, byteEnd: 9 },
+              { start: 450, end: 550, value: ' ', byteStart: 10, byteEnd: 10 },
+              { start: 550, end: 900, value: '순간', byteStart: 11, byteEnd: 16 },
+            ],
+          },
+        ],
+      },
+      translationLyric: null,
+      pronunciationLyric: null,
+      showTranslation: false,
+      showPronunciation: false,
+      translationEnabled: false,
+      pronunciationEnabled: false,
+      audioInstance: {
+        ...audioInstance,
+        currentTime: 0.3,
+      },
+    })
+
+    const mainLine = screen.getByText('눈을').parentElement
+    const segments = Array.from(mainLine.querySelectorAll('span')).map(
+      (span) => span.textContent,
+    )
+
+    expect(segments).toEqual(['눈을', ' ', '뜬', ' ', '순간'])
+  })
+
+  it('highlights line-timed pronunciation and translation rows with the active main line', () => {
+    renderOverlay({
+      mainLyric: {
+        kind: 'main',
+        lang: 'en',
+        synced: true,
+        line: [
+          { start: 1000, end: 1800, value: 'Line one' },
+          { start: 2500, end: 3300, value: 'Line two' },
+        ],
+      },
+      translationLyric: {
+        kind: 'translation',
+        lang: 'ja',
+        synced: true,
+        line: [
+          { start: 1000, end: 1800, value: '一行目' },
+          { start: 2500, end: 3300, value: '二行目' },
+        ],
+      },
+      pronunciationLyric: {
+        kind: 'pronunciation',
+        lang: 'ja-Latn',
+        synced: true,
+        line: [
+          { start: 1000, end: 1800, value: 'ichigyoume' },
+          { start: 2500, end: 3300, value: 'nigyoume' },
+        ],
+      },
+      showTranslation: true,
+      showPronunciation: true,
+      audioInstance: {
+        ...audioInstance,
+        currentTime: 1.2,
+      },
+    })
+
+    const activePronunciation = screen.getByText('ichigyoume').parentElement
+    const inactivePronunciation = screen.getByText('nigyoume').parentElement
+    const activeTranslation = screen.getByText('一行目').parentElement
+    const inactiveTranslation = screen.getByText('二行目').parentElement
+
+    expect(parseFloat(activePronunciation.style.opacity)).toBeGreaterThan(
+      parseFloat(inactivePronunciation.style.opacity),
+    )
+    expect(parseFloat(activeTranslation.style.opacity)).toBeGreaterThan(
+      parseFloat(inactiveTranslation.style.opacity),
+    )
+  })
+
+  it('pre-wraps inactive main lines so the active line keeps the same wrap shape', () => {
+    renderOverlay({
+      mainLyric: {
+        kind: 'main',
+        lang: 'en',
+        synced: true,
+        line: [
+          { start: 1000, end: 1800, value: 'First line that is getting focus' },
+          { start: 2500, end: 3300, value: 'Second line waiting below' },
+        ],
+      },
+      translationLyric: null,
+      pronunciationLyric: null,
+      showTranslation: false,
+      showPronunciation: false,
+      translationEnabled: false,
+      pronunciationEnabled: false,
+      audioInstance: {
+        ...audioInstance,
+        currentTime: 1.2,
+      },
+    })
+
+    const activeLine = screen.getByText('First line that is getting focus')
+      .parentElement
+    const inactiveLine = screen.getByText('Second line waiting below')
+      .parentElement
+
+    expect(parseFloat(activeLine.style.fontSize)).toBeGreaterThan(
+      parseFloat(inactiveLine.style.fontSize),
+    )
+    expect(activeLine.style.maxWidth).toBe('100%')
+    expect(inactiveLine.style.maxWidth).toBe('80%')
+  })
+
+  it('centers pronunciation text inside the pill container', () => {
+    renderOverlay({
+      showTranslation: false,
+      showPronunciation: true,
+    })
+
+    const pronunciationLine = screen.getByText('konnichiwa').parentElement
+    const styles = window.getComputedStyle(pronunciationLine)
+
+    expect(styles.display).toBe('inline-flex')
+    expect(styles.justifyContent).toBe('center')
+    expect(styles.alignItems).toBe('center')
+  })
+
+  it('renders untimed text lyrics in manual reading mode without a pinned active line', () => {
+    renderOverlay({
+      mainLyric: {
+        kind: 'main',
+        lang: 'en',
+        synced: false,
+        line: [{ value: 'First plain line' }, { value: 'Second plain line' }],
+      },
+      translationLyric: null,
+      pronunciationLyric: null,
+      showTranslation: false,
+      showPronunciation: false,
+      translationEnabled: false,
+      pronunciationEnabled: false,
+    })
+
+    const firstLine = screen.getByText('First plain line').parentElement
+    const secondLine = screen.getByText('Second plain line').parentElement
+
+    expect(firstLine.style.opacity).toBe('1')
+    expect(secondLine.style.opacity).toBe('1')
+    expect(firstLine.style.color).toBe(secondLine.style.color)
+  })
+
+  it('persists line height changes, keeps aux line spacing fixed, and stores overlay height', async () => {
+    renderOverlay({
+      mainLyric: buildLyric('main', 'en', 'Hello world'),
+      translationLyric: buildLyric('translation', 'es', 'Hola'),
+      pronunciationLyric: buildLyric('pronunciation', 'en-Latn', 'heh-loh'),
+      showTranslation: true,
+      showPronunciation: true,
+      translationEnabled: true,
+      pronunciationEnabled: true,
+    })
+
+    const overlay = screen.getByTestId('karaoke-lyrics-overlay')
+    const mainLine = screen.getByText('Hello world').parentElement
+    const pronunciationLine = screen.getByText('heh-loh').parentElement
+    expect(mainLine).toHaveStyle(`line-height: ${DEFAULT_LINE_HEIGHT_TEXT}`)
+    expect(pronunciationLine).toHaveStyle('line-height: 1.2')
+
+    fireEvent.click(screen.getByTestId('lyrics-settings-button'))
+
+    const slider = screen.getByRole('slider', { name: 'Line height' })
+    slider.focus()
+    fireEvent.keyDown(slider, { key: 'ArrowRight' })
+
+    await waitFor(() =>
+      expect(screen.getByTestId('lyrics-line-height-value')).toHaveTextContent(
+        NEXT_LINE_HEIGHT_TEXT,
+      ),
+    )
+
+    await waitFor(() =>
+      expect(mainLine).toHaveStyle(`line-height: ${NEXT_LINE_HEIGHT_TEXT}`),
+    )
+    expect(pronunciationLine).toHaveStyle('line-height: 1.2')
+
+    fireEvent.mouseDown(screen.getByTestId('lyrics-resize-handle'), {
+      clientY: 400,
+    })
+    fireEvent.mouseMove(window, { clientY: 360 })
+    fireEvent.mouseUp(window)
+
+    await waitFor(() => expect(overlay).toHaveStyle('height: 340px'))
+
+    const stored = JSON.parse(localStorage.getItem('karaoke-lyrics-settings'))
+    expect(stored.lineHeight).toBeCloseTo(1.32, 2)
+    expect(stored.overlayHeight).toBe(340)
+  })
+
+  it('resets appearance back to the default spacing and overlay height', async () => {
+    localStorage.setItem(
+      'karaoke-lyrics-settings',
+      JSON.stringify({
+        lineHeight: 1.8,
+        overlayHeight: 420,
+        tr: { fontSize: 16, colorKey: 'yellow' },
+        main: { fontSize: 28, colorKey: 'cyan' },
+        pr: { fontSize: 15, colorKey: 'pink' },
+      }),
+    )
+
+    renderOverlay({
+      mainLyric: buildLyric('main', 'en', 'Hello world'),
+      translationLyric: null,
+      pronunciationLyric: null,
+      showPronunciation: false,
+      translationEnabled: false,
+      pronunciationEnabled: false,
+    })
+
+    const overlay = screen.getByTestId('karaoke-lyrics-overlay')
+    const mainLine = screen.getByText('Hello world').parentElement
+    expect(overlay).toHaveStyle('height: 420px')
+    expect(mainLine).toHaveStyle('line-height: 1.8')
+
+    fireEvent.click(screen.getByTestId('lyrics-settings-button'))
+    fireEvent.click(screen.getByTestId('lyrics-reset-appearance'))
+
+    await waitFor(() =>
+      expect(screen.getByTestId('lyrics-line-height-value')).toHaveTextContent(
+        DEFAULT_LINE_HEIGHT_TEXT,
+      ),
+    )
+    await waitFor(() => expect(overlay).toHaveStyle('height: 300px'))
+    await waitFor(() =>
+      expect(mainLine).toHaveStyle(`line-height: ${DEFAULT_LINE_HEIGHT_TEXT}`),
+    )
+
+    const stored = JSON.parse(localStorage.getItem('karaoke-lyrics-settings'))
+    expect(stored.lineHeight).toBeCloseTo(1.3, 2)
+    expect(stored.overlayHeight).toBe(300)
+  })
+})
diff --git a/ui/src/audioplayer/MobileKaraokeLyricsPortal.jsx b/ui/src/audioplayer/MobileKaraokeLyricsPortal.jsx
new file mode 100644
index 000000000..636107184
--- /dev/null
+++ b/ui/src/audioplayer/MobileKaraokeLyricsPortal.jsx
@@ -0,0 +1,65 @@
+import React, { useEffect, useState } from 'react'
+import { createPortal } from 'react-dom'
+
+export const MOBILE_KARAOKE_LYRICS_HOST_SELECTOR =
+  '.react-jinke-music-player-mobile-cover'
+export const MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS = 'nd-mobile-lyrics-active'
+
+// Finds the element matching MOBILE_KARAOKE_LYRICS_HOST_SELECTOR; yields null
+// when no `document` global exists or nothing matches the selector.
+const resolveMobileLyricsHost = () =>
+  typeof document === 'undefined'
+    ? null
+    : document.querySelector(MOBILE_KARAOKE_LYRICS_HOST_SELECTOR)
+
+/**
+ * Portals `children` into the element matched by
+ * MOBILE_KARAOKE_LYRICS_HOST_SELECTOR while `active` is truthy, and tags that
+ * element with MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS. Renders nothing when
+ * inactive or when no matching element is found.
+ *
+ * @param {object} props
+ * @param {boolean} props.active - Whether the lyrics should be portaled.
+ * @param {React.ReactNode} props.children - Content rendered inside the host.
+ */
+const MobileKaraokeLyricsPortal = ({ active, children }) => {
+  const [host, setHost] = useState(() =>
+    active ? resolveMobileLyricsHost() : null,
+  )
+
+  // Track the host element: re-query it whenever nodes are added or removed
+  // anywhere under <body>, for as long as the portal is active.
+  useEffect(() => {
+    // Without a DOM, or while inactive, there is nothing to track.
+    if (typeof document === 'undefined' || !active) {
+      setHost(null)
+      return undefined
+    }
+
+    const refreshHost = () => setHost(resolveMobileLyricsHost())
+    refreshHost()
+
+    const domWatcher = new MutationObserver(refreshHost)
+    domWatcher.observe(document.body, {
+      childList: true,
+      subtree: true,
+    })
+    return () => domWatcher.disconnect()
+  }, [active])
+
+  // Flag the host while lyrics are shown; always clear the flag on cleanup.
+  useEffect(() => {
+    if (host) {
+      host.classList.toggle(MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS, active)
+      return () => host.classList.remove(MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS)
+    }
+    return undefined
+  }, [active, host])
+
+  if (active && host) {
+    return createPortal(children, host)
+  }
+  return null
+}
+
+export default MobileKaraokeLyricsPortal
diff --git a/ui/src/audioplayer/MobileKaraokeLyricsPortal.test.jsx b/ui/src/audioplayer/MobileKaraokeLyricsPortal.test.jsx
new file mode 100644
index 000000000..8b237e184
--- /dev/null
+++ b/ui/src/audioplayer/MobileKaraokeLyricsPortal.test.jsx
@@ -0,0 +1,55 @@
+import React from 'react'
+import { cleanup, render, screen, waitFor } from '@testing-library/react'
+import MobileKaraokeLyricsPortal, {
+  MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS,
+} from './MobileKaraokeLyricsPortal'
+
+const HOST_CLASS = 'react-jinke-music-player-mobile-cover'
+
+describe('<MobileKaraokeLyricsPortal />', () => {
+  // Appends an element with the class the portal looks for and returns it.
+  const mountHost = () => {
+    const el = document.createElement('div')
+    el.className = HOST_CLASS
+    return document.body.appendChild(el)
+  }
+
+  // Portal element under test with a fixed, identifiable child.
+  const portal = (active) => (
+    <MobileKaraokeLyricsPortal active={active}>
+      <div data-testid="mobile-inline-lyrics">Lyrics</div>
+    </MobileKaraokeLyricsPortal>
+  )
+
+  afterEach(() => {
+    cleanup()
+    // Drop any host element a test attached directly to <body>.
+    document.body.innerHTML = ''
+  })
+
+  it('renders lyrics into the mobile cover host and toggles the active class', () => {
+    const host = mountHost()
+    const { rerender } = render(portal(true))
+
+    expect(host).toContainElement(screen.getByTestId('mobile-inline-lyrics'))
+    expect(host).toHaveClass(MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS)
+
+    // Deactivating must unmount the child and clear the class.
+    rerender(portal(false))
+
+    expect(screen.queryByTestId('mobile-inline-lyrics')).not.toBeInTheDocument()
+    expect(host).not.toHaveClass(MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS)
+  })
+
+  it('attaches when the mobile cover host appears after mount', async () => {
+    // Host is added only after the portal has mounted.
+    render(portal(true))
+
+    const host = mountHost()
+
+    await waitFor(() =>
+      expect(host).toContainElement(screen.getByTestId('mobile-inline-lyrics')),
+    )
+    expect(host).toHaveClass(MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS)
+  })
+})
diff --git a/ui/src/audioplayer/Player.jsx b/ui/src/audioplayer/Player.jsx
index 5599b9e1d..3603e263e 100644
--- a/ui/src/audioplayer/Player.jsx
+++ b/ui/src/audioplayer/Player.jsx
@@ -22,6 +22,7 @@ import {
   refreshQueue,
   setPlayMode,
   setTranscodingProfile,
+  updateQueueLyric,
   setVolume,
   syncQueue,
 } from '../actions'
@@ -33,6 +34,30 @@ import { keyMap } from '../hotkeys'
 import keyHandlers from './keyHandlers'
 import { calculateGain } from '../utils/calculateReplayGain'
 import { detectBrowserProfile, decisionService } from '../transcode'
+import {
+  getPreferredLyricLanguage,
+  hasStructuredLyricContent,
+  selectLyricLayers,
+  structuredLyricToLrc,
+} from './lyrics'
+import {
+  resolveLyricsOverlayState,
+  togglePronunciationPreference,
+} from './lyricsOverlayState'
+import KaraokeLyricsOverlay from './KaraokeLyricsOverlay'
+import MobileKaraokeLyricsPortal from './MobileKaraokeLyricsPortal'
+
+const emptyLyricLayers = {
+  main: null,
+  translation: null,
+  pronunciation: null,
+}
+
+// Returns a full main/translation/pronunciation object, null-filling gaps.
+const normalizeLyricLayers = (layers) =>
+  Object.fromEntries(
+    Object.keys(emptyLyricLayers).map((k) => [k, layers?.[k] || null]),
+  )
 
 const Player = () => {
   const theme = useCurrentTheme()
@@ -120,6 +145,83 @@ const Player = () => {
   const gainInfo = useSelector((state) => state.replayGain)
   const [context, setContext] = useState(null)
   const [gainNode, setGainNode] = useState(null)
+  const lyricCacheRef = useRef(new Map())
+  const lyricRequestIdRef = useRef(0)
+  const playerRef = useRef(null)
+  const [karaokeVisiblePreference, setKaraokeVisiblePreference] =
+    useState(false)
+  const [selectedLyricLayers, setSelectedLyricLayers] =
+    useState(emptyLyricLayers)
+  const [translationPreference, setTranslationPreference] = useState(false)
+  const [pronunciationPreference, setPronunciationPreference] = useState(null)
+  const currentTrackId = playerState.current?.trackId
+  const currentTrackIsRadio = playerState.current?.isRadio
+  const selectedStructuredLyric = selectedLyricLayers.main
+  const hasKaraokeLyric = hasStructuredLyricContent(selectedStructuredLyric)
+  const hasTranslationLyric = hasStructuredLyricContent(
+    selectedLyricLayers.translation,
+  )
+  const hasPronunciationLyric = hasStructuredLyricContent(
+    selectedLyricLayers.pronunciation,
+  )
+  const { karaokeVisible, showTranslation, showPronunciation } =
+    resolveLyricsOverlayState({
+      karaokeVisiblePreference,
+      translationPreference,
+      pronunciationPreference,
+      hasKaraokeLyric,
+      hasTranslationLyric,
+      hasPronunciationLyric,
+    })
+  const useInlineMobileLyrics = karaokeVisible && !isDesktop
+
+  const applyLyricToRuntimePlayer = useCallback((trackId, lyric) => {
+    if (!trackId) {
+      return
+    }
+
+    const player = playerRef.current
+    if (!player || typeof player.setState !== 'function') {
+      return
+    }
+
+    player.setState((prevState) => {
+      const prevLists = Array.isArray(prevState.audioLists)
+        ? prevState.audioLists
+        : []
+      let changed = false
+      const audioLists = prevLists.map((item) => {
+        if (item.trackId !== trackId) {
+          return item
+        }
+        if (item.lyric === lyric) {
+          return item
+        }
+        changed = true
+        return {
+          ...item,
+          lyric,
+        }
+      })
+
+      const currentItem = audioLists.find(
+        (item) => item.musicSrc === prevState.musicSrc,
+      )
+      const currentLyric =
+        typeof currentItem?.lyric === 'string'
+          ? currentItem.lyric
+          : prevState.lyric
+
+      if (!changed && currentLyric === prevState.lyric) {
+        return null
+      }
+
+      return {
+        audioLists,
+        lyric: currentLyric,
+      }
+    })
+  }, [])
 
   useEffect(() => {
     if (
@@ -166,6 +268,88 @@ const Player = () => {
     return () => window.removeEventListener('beforeunload', handleBeforeUnload)
   }, [playerState, audioInstance])
 
+  useEffect(() => {
+    if (!currentTrackId || currentTrackIsRadio) {
+      setSelectedLyricLayers(emptyLyricLayers)
+      return
+    }
+
+    const cached = lyricCacheRef.current.get(currentTrackId)
+    let layers = emptyLyricLayers
+    if (cached && typeof cached !== 'string') {
+      if (cached.layers) {
+        layers = normalizeLyricLayers(cached.layers)
+      } else if (cached.structuredLyric) {
+        layers = normalizeLyricLayers({
+          main: cached.structuredLyric,
+        })
+      }
+    }
+    setSelectedLyricLayers(layers)
+  }, [currentTrackId, currentTrackIsRadio])
+
+  useEffect(() => {
+    lyricRequestIdRef.current += 1
+    const requestId = lyricRequestIdRef.current
+
+    if (!currentTrackId || currentTrackIsRadio) {
+      return
+    }
+
+    const cached = lyricCacheRef.current.get(currentTrackId)
+    if (cached !== undefined) {
+      const cachedLyric =
+        typeof cached === 'string' ? cached : cached?.lrc || ''
+      const cachedLayers =
+        typeof cached === 'string'
+          ? emptyLyricLayers
+          : cached?.layers
+            ? normalizeLyricLayers(cached.layers)
+            : normalizeLyricLayers({ main: cached?.structuredLyric })
+
+      setSelectedLyricLayers(cachedLayers)
+      if (cachedLyric) {
+        dispatch(updateQueueLyric(currentTrackId, cachedLyric))
+        applyLyricToRuntimePlayer(currentTrackId, cachedLyric)
+      }
+      return
+    }
+
+    subsonic
+      .getLyricsBySongId(currentTrackId)
+      .then((resp) => {
+        if (lyricRequestIdRef.current !== requestId) {
+          return
+        }
+
+        const structuredLyrics =
+          resp?.json?.['subsonic-response']?.lyricsList?.structuredLyrics || []
+        const layers = selectLyricLayers(
+          structuredLyrics,
+          getPreferredLyricLanguage(),
+        )
+        const lyric = layers.main ? structuredLyricToLrc(layers.main) : ''
+        lyricCacheRef.current.set(currentTrackId, {
+          lrc: lyric,
+          layers,
+        })
+        setSelectedLyricLayers(layers)
+
+        if (lyric !== '') {
+          dispatch(updateQueueLyric(currentTrackId, lyric))
+          applyLyricToRuntimePlayer(currentTrackId, lyric)
+        }
+      })
+      .catch(() => {
+        if (lyricRequestIdRef.current !== requestId) {
+          return
+        }
+        setSelectedLyricLayers(emptyLyricLayers)
+        // Do not cache network/request failures as empty lyrics, so we can retry.
+        lyricCacheRef.current.delete(currentTrackId)
+      })
+  }, [dispatch, currentTrackId, currentTrackIsRadio, applyLyricToRuntimePlayer])
+
   const defaultOptions = useMemo(
     () => ({
       theme: playerTheme,
@@ -177,7 +361,7 @@ const Player = () => {
       clearPriorAudioLists: false,
       showDestroy: true,
       showDownload: false,
-      showLyric: true,
+      showLyric: false,
       showReload: false,
       toggleMode: !isDesktop,
       glassBg: false,
@@ -215,12 +399,26 @@ const Player = () => {
         (playerState.clear || playerState.playIndex === 0),
       clearPriorAudioLists: playerState.clear,
       extendsContent: (
-        <PlayerToolbar id={current.trackId} isRadio={current.isRadio} />
+        <PlayerToolbar
+          id={current.trackId}
+          isRadio={current.isRadio}
+          onToggleLyrics={() =>
+            setKaraokeVisiblePreference((visible) => !visible)
+          }
+          lyricsActive={karaokeVisible}
+          lyricsDisabled={!hasKaraokeLyric}
+        />
       ),
       defaultVolume: isMobilePlayer ? 1 : playerState.volume,
       showMediaSession: !current.isRadio,
     }
-  }, [playerState, defaultOptions, isMobilePlayer])
+  }, [
+    playerState,
+    defaultOptions,
+    isMobilePlayer,
+    karaokeVisible,
+    hasKaraokeLyric,
+  ])
 
   const onAudioListsChange = useCallback(
     (_, audioLists, audioInfo) => dispatch(syncQueue(audioInfo, audioLists)),
@@ -340,10 +538,13 @@ const Player = () => {
   )
 
   const onCoverClick = useCallback((mode, audioLists, audioInfo) => {
+    if (!isDesktop && karaokeVisible) {
+      return
+    }
     if (mode === 'full' && audioInfo?.song?.albumId) {
       window.location.href = `#/album/${audioInfo.song.albumId}/show`
     }
-  }, [])
+  }, [isDesktop, karaokeVisible])
 
   const onAudioError = useCallback(
     (error, currentPlayId, audioLists, audioInfo) => {
@@ -392,6 +593,7 @@ const Player = () => {
   return (
     <ThemeProvider theme={createMuiTheme(theme)}>
       <ReactJkMusicPlayer
+        ref={playerRef}
         {...options}
         className={classes.player}
         onAudioListsChange={onAudioListsChange}
@@ -407,6 +609,55 @@ const Player = () => {
         onBeforeDestroy={onBeforeDestroy}
         getAudioInstance={setAudioInstance}
       />
+      {isDesktop && (
+        <KaraokeLyricsOverlay
+          visible={karaokeVisible}
+          mainLyric={selectedLyricLayers.main}
+          translationLyric={selectedLyricLayers.translation}
+          pronunciationLyric={selectedLyricLayers.pronunciation}
+          showTranslation={showTranslation}
+          showPronunciation={showPronunciation}
+          translationEnabled={hasTranslationLyric}
+          pronunciationEnabled={hasPronunciationLyric}
+          onToggleTranslation={() =>
+            setTranslationPreference((previous) =>
+              hasTranslationLyric ? !previous : false,
+            )
+          }
+          onTogglePronunciation={() =>
+            setPronunciationPreference((previous) =>
+              togglePronunciationPreference(previous, hasPronunciationLyric),
+            )
+          }
+          audioInstance={audioInstance}
+          onClose={() => setKaraokeVisiblePreference(false)}
+        />
+      )}
+      <MobileKaraokeLyricsPortal active={useInlineMobileLyrics}>
+        <KaraokeLyricsOverlay
+          visible={useInlineMobileLyrics}
+          inline={true}
+          mainLyric={selectedLyricLayers.main}
+          translationLyric={selectedLyricLayers.translation}
+          pronunciationLyric={selectedLyricLayers.pronunciation}
+          showTranslation={showTranslation}
+          showPronunciation={showPronunciation}
+          translationEnabled={hasTranslationLyric}
+          pronunciationEnabled={hasPronunciationLyric}
+          onToggleTranslation={() =>
+            setTranslationPreference((previous) =>
+              hasTranslationLyric ? !previous : false,
+            )
+          }
+          onTogglePronunciation={() =>
+            setPronunciationPreference((previous) =>
+              togglePronunciationPreference(previous, hasPronunciationLyric),
+            )
+          }
+          audioInstance={audioInstance}
+          onClose={() => setKaraokeVisiblePreference(false)}
+        />
+      </MobileKaraokeLyricsPortal>
       <GlobalHotKeys handlers={handlers} keyMap={keyMap} allowChanges />
     </ThemeProvider>
   )
diff --git a/ui/src/audioplayer/Player.lyricsState.test.jsx b/ui/src/audioplayer/Player.lyricsState.test.jsx
new file mode 100644
index 000000000..c47abea76
--- /dev/null
+++ b/ui/src/audioplayer/Player.lyricsState.test.jsx
@@ -0,0 +1,77 @@
+import {
+  resolveLyricsOverlayState,
+  togglePronunciationPreference,
+} from './lyricsOverlayState'
+
+describe('Player lyrics state helpers', () => {
+  it('keeps the lyrics window preference across track changes in the session', () => {
+    const visibleOnCurrentTrack = resolveLyricsOverlayState({
+      karaokeVisiblePreference: true,
+      translationPreference: false,
+      pronunciationPreference: null,
+      hasKaraokeLyric: true,
+      hasTranslationLyric: true,
+      hasPronunciationLyric: true,
+    })
+    expect(visibleOnCurrentTrack.karaokeVisible).toBe(true)
+
+    const hiddenForTrackWithoutLyrics = resolveLyricsOverlayState({
+      karaokeVisiblePreference: true,
+      translationPreference: false,
+      pronunciationPreference: null,
+      hasKaraokeLyric: false,
+      hasTranslationLyric: false,
+      hasPronunciationLyric: false,
+    })
+    expect(hiddenForTrackWithoutLyrics.karaokeVisible).toBe(false)
+
+    const restoredOnNextLyricsTrack = resolveLyricsOverlayState({
+      karaokeVisiblePreference: true,
+      translationPreference: false,
+      pronunciationPreference: null,
+      hasKaraokeLyric: true,
+      hasTranslationLyric: false,
+      hasPronunciationLyric: false,
+    })
+    expect(restoredOnNextLyricsTrack.karaokeVisible).toBe(true)
+  })
+
+  it('restores translation and pronunciation preferences after tracks without those layers', () => {
+    const initialState = resolveLyricsOverlayState({
+      karaokeVisiblePreference: false,
+      translationPreference: false,
+      pronunciationPreference: null,
+      hasKaraokeLyric: true,
+      hasTranslationLyric: true,
+      hasPronunciationLyric: true,
+    })
+    expect(initialState.showTranslation).toBe(false)
+    expect(initialState.showPronunciation).toBe(true)
+
+    const translationPreference = true
+    const pronunciationPreference = togglePronunciationPreference(null, true)
+    expect(pronunciationPreference).toBe(false)
+
+    const hiddenOnTrackWithoutAuxLayers = resolveLyricsOverlayState({
+      karaokeVisiblePreference: false,
+      translationPreference,
+      pronunciationPreference,
+      hasKaraokeLyric: true,
+      hasTranslationLyric: false,
+      hasPronunciationLyric: false,
+    })
+    expect(hiddenOnTrackWithoutAuxLayers.showTranslation).toBe(false)
+    expect(hiddenOnTrackWithoutAuxLayers.showPronunciation).toBe(false)
+
+    const restoredOnTrackWithAuxLayers = resolveLyricsOverlayState({
+      karaokeVisiblePreference: false,
+      translationPreference,
+      pronunciationPreference,
+      hasKaraokeLyric: true,
+      hasTranslationLyric: true,
+      hasPronunciationLyric: true,
+    })
+    expect(restoredOnTrackWithAuxLayers.showTranslation).toBe(true)
+    expect(restoredOnTrackWithAuxLayers.showPronunciation).toBe(false)
+  })
+})
diff --git a/ui/src/audioplayer/PlayerToolbar.jsx b/ui/src/audioplayer/PlayerToolbar.jsx
index 4812141ab..8487b0655 100644
--- a/ui/src/audioplayer/PlayerToolbar.jsx
+++ b/ui/src/audioplayer/PlayerToolbar.jsx
@@ -4,7 +4,9 @@ import { useGetOne } from 'react-admin'
 import { GlobalHotKeys } from 'react-hotkeys'
 import IconButton from '@material-ui/core/IconButton'
 import { useMediaQuery } from '@material-ui/core'
+import Tooltip from '@material-ui/core/Tooltip'
 import { RiSaveLine } from 'react-icons/ri'
+import { RiFileMusicLine } from 'react-icons/ri'
 import { LoveButton, useToggleLove } from '../common'
 import { openSaveQueueDialog } from '../actions'
 import { keyMap } from '../hotkeys'
@@ -55,7 +57,13 @@ const useStyles = makeStyles((theme) => ({
   },
 }))
 
-const PlayerToolbar = ({ id, isRadio }) => {
+const PlayerToolbar = ({
+  id,
+  isRadio,
+  onToggleLyrics,
+  lyricsActive = false,
+  lyricsDisabled = false,
+}) => {
   const dispatch = useDispatch()
   const { data, loading } = useGetOne('song', id, { enabled: !!id && !isRadio })
   const [toggleLove, toggling] = useToggleLove('song', data)
@@ -99,6 +107,25 @@ const PlayerToolbar = ({ id, isRadio }) => {
     />
   )
 
+  const toggleLyricsButton = (
+    <Tooltip title="Toggle lyrics">
+      <span>
+        <IconButton
+          size={isDesktop ? 'small' : undefined}
+          onClick={onToggleLyrics}
+          disabled={!onToggleLyrics || lyricsDisabled}
+          data-testid="toggle-lyrics-button"
+          className={buttonClass}
+          color={lyricsActive ? 'primary' : 'default'}
+        >
+          <RiFileMusicLine
+            className={!isDesktop ? classes.mobileIcon : undefined}
+          />
+        </IconButton>
+      </span>
+    </Tooltip>
+  )
+
   return (
     <>
       <GlobalHotKeys keyMap={keyMap} handlers={handlers} allowChanges />
@@ -106,11 +133,13 @@ const PlayerToolbar = ({ id, isRadio }) => {
         <li className={`${listItemClass} item`}>
           {saveQueueButton}
           {loveButton}
+          {toggleLyricsButton}
         </li>
       ) : (
         <>
           <li className={`${listItemClass} item`}>{saveQueueButton}</li>
           <li className={`${listItemClass} item`}>{loveButton}</li>
+          <li className={`${listItemClass} item`}>{toggleLyricsButton}</li>
         </>
       )}
     </>
diff --git a/ui/src/audioplayer/PlayerToolbar.test.jsx b/ui/src/audioplayer/PlayerToolbar.test.jsx
index d0368b0f0..3041001eb 100644
--- a/ui/src/audioplayer/PlayerToolbar.test.jsx
+++ b/ui/src/audioplayer/PlayerToolbar.test.jsx
@@ -71,6 +71,7 @@ describe('<PlayerToolbar />', () => {
       // Verify both buttons are rendered
       expect(screen.getByTestId('save-queue-button')).toBeInTheDocument()
       expect(screen.getByTestId('love-button')).toBeInTheDocument()
+      expect(screen.getByTestId('toggle-lyrics-button')).toBeInTheDocument()
 
       // Verify desktop classes are applied
       expect(listItems[0].className).toContain('toolbar')
@@ -102,6 +103,14 @@ describe('<PlayerToolbar />', () => {
         type: 'OPEN_SAVE_QUEUE_DIALOG',
       })
     })
+
+    it('triggers lyric toggle callback when lyrics button is clicked', () => {
+      const onToggleLyrics = vi.fn()
+      render(<PlayerToolbar id="song-1" onToggleLyrics={onToggleLyrics} />)
+
+      fireEvent.click(screen.getByTestId('toggle-lyrics-button'))
+      expect(onToggleLyrics).toHaveBeenCalledTimes(1)
+    })
   })
 
   describe('Mobile layout', () => {
@@ -114,11 +123,12 @@ describe('<PlayerToolbar />', () => {
 
       // Each button should be in its own list item
       const listItems = screen.getAllByRole('listitem')
-      expect(listItems).toHaveLength(2)
+      expect(listItems).toHaveLength(3)
 
       // Verify both buttons are rendered
       expect(screen.getByTestId('save-queue-button')).toBeInTheDocument()
       expect(screen.getByTestId('love-button')).toBeInTheDocument()
+      expect(screen.getByTestId('toggle-lyrics-button')).toBeInTheDocument()
 
       // Verify mobile classes are applied
       expect(listItems[0].className).toContain('mobileListItem')
@@ -140,6 +150,13 @@ describe('<PlayerToolbar />', () => {
       const loveButton = screen.getByTestId('love-button')
       expect(loveButton).toBeDisabled()
     })
+
+    it('disables lyrics button when lyrics are unavailable', () => {
+      render(<PlayerToolbar id="song-1" lyricsDisabled={true} />)
+
+      const lyricsButton = screen.getByTestId('toggle-lyrics-button')
+      expect(lyricsButton).toBeDisabled()
+    })
   })
 
   describe('Common behavior', () => {
diff --git a/ui/src/audioplayer/lyrics.js b/ui/src/audioplayer/lyrics.js
new file mode 100644
index 000000000..98c638ab3
--- /dev/null
+++ b/ui/src/audioplayer/lyrics.js
@@ -0,0 +1,725 @@
+const normalizeLanguageTag = (language) =>
+  (language || '').toLowerCase().replace(/_/g, '-') // every "_", e.g. zh_Hant_TW
+
+// Roughly three 60fps frames (~16.7ms each); keeps line/token switching stable near tight boundaries.
+const KARAOKE_SWITCH_EPSILON_MS = 50
+const LYRIC_KIND_MAIN = 'main'
+const LYRIC_KIND_TRANSLATION = 'translation'
+const LYRIC_KIND_PRONUNCIATION = 'pronunciation'
+
+const padTime = (value) => {
+  const text = value.toString()
+  return text.length !== 1 ? text : `0${text}` // left-pad single characters only
+}
+
+const toTime = (value) => {
+  if (value === null || value === undefined || value === '') {
+    return null // absent or empty input carries no time
+  }
+  const parsed = Number(value)
+  return Number.isFinite(parsed) ? parsed : null
+}
+
+// Coerces a value to a non-negative integer byte offset.
+// Returns null for null/undefined/'' and for anything that is not a
+// non-negative integer after Number() conversion.
+const toByteOffset = (value) => {
+  if (value === null || value === undefined || value === '') {
+    return null
+  }
+  const offset = Number(value)
+  return Number.isInteger(offset) && offset >= 0 ? offset : null
+}
+
+// Orders nullable times ascending; missing values sort last and tie together.
+const compareNullableTime = (a, b) => {
+  const aMissing = a == null
+  const bMissing = b == null
+  if (aMissing && bMissing) {
+    return 0
+  }
+  if (aMissing || bMissing) {
+    return aMissing ? 1 : -1
+  }
+  return a - b
+}
+
+// Returns a new array of tokens ordered by start, then by end; tokens with
+// a missing time sort after timed ones. Each token's original position is
+// the final tie-breaker and is stripped again before returning.
+const sortTokensByStart = (tokens) => {
+  const keyed = tokens.map((token, order) => ({ ...token, order }))
+  keyed.sort((left, right) => {
+    const startDiff = compareNullableTime(left.start, right.start)
+    if (startDiff !== 0) {
+      return startDiff
+    }
+    const endDiff = compareNullableTime(left.end, right.end)
+    return endDiff !== 0 ? endDiff : left.order - right.order
+  })
+  return keyed.map(({ order, ...token }) => token)
+}
+
+// Tags match when equal, or when one extends the other with a "-" subtag
+// (e.g. "pt" and "pt-br"). An empty or missing tag never matches.
+const languageMatch = (candidate, preferred) => {
+  if (!candidate || !preferred) {
+    return false
+  }
+  const extendsTag = (tag, base) => tag.startsWith(`${base}-`)
+  const sameTag = candidate === preferred
+  return sameTag || extendsTag(candidate, preferred) || extendsTag(preferred, candidate)
+}
+
+// Truthy only for a synced lyric whose line array has a numeric start time.
+const hasTimedLines = (lyric) =>
+  lyric &&
+  lyric.synced &&
+  Array.isArray(lyric.line) && lyric.line.some(({ start }) => Number.isFinite(Number(start)))
+
+const preferTimedLyrics = (lyrics) => {
+  const timedOnly = lyrics.filter((lyric) => hasTimedLines(lyric))
+  return timedOnly.length === 0 ? lyrics : timedOnly // untimed list as fallback
+}
+
+// Normalizes a raw cue token into { start, end, value } plus optional
+// byteStart/byteEnd. Returns null when the token has no non-empty text.
+const normalizeToken = (token) => {
+  if (!token || typeof token.value !== 'string' || token.value === '') {
+    return null
+  }
+  const byteStart = toByteOffset(token.byteStart)
+  const byteEnd = toByteOffset(token.byteEnd)
+  return Object.assign(
+    {
+      start: toTime(token.start),
+      end: toTime(token.end),
+      value: token.value,
+    },
+    byteStart != null ? { byteStart } : null,
+    byteEnd != null ? { byteEnd } : null,
+  )
+}
+
+// Number of bytes UTF-8 needs for a code point: 1 up to U+007F, 2 up to
+// U+07FF, 3 up to U+FFFF and 4 for everything above.
+const utf8BytesForCodePoint = (codePoint) => {
+  const upperBounds = [0x7f, 0x7ff, 0xffff]
+  for (let bytes = 1; bytes <= upperBounds.length; bytes += 1) {
+    if (codePoint <= upperBounds[bytes - 1]) {
+      return bytes
+    }
+  }
+  // Above U+FFFF (or not comparable at all): the widest encoding.
+  return 4
+}
+
+export const utf8ByteOffsetToCodeUnitIndex = (text, targetByteOffset) => {
+  if (typeof text !== 'string' || text.length === 0) {
+    return 0
+  }
+  // Invalid or zero offsets all map to the start of the string.
+  const target = toByteOffset(targetByteOffset)
+  if (target == null || target <= 0) {
+    return 0
+  }
+  // Walk code points, accumulating UTF-8 bytes until the target is reached.
+  let byteOffset = 0
+  let index = 0
+  while (index < text.length) {
+    if (byteOffset >= target) {
+      return index
+    }
+    const codePoint = text.codePointAt(index)
+    byteOffset += utf8BytesForCodePoint(codePoint)
+    index += codePoint > 0xffff ? 2 : 1
+  }
+  // Offsets at or past the end of the text clamp to text.length.
+  return text.length
+}
+
+export const utf8ByteRangeToCodeUnitRange = (text, byteStart, byteEnd) => {
+  if (typeof text !== 'string') {
+    return null
+  }
+  // Both offsets must be non-negative integers with byteEnd >= byteStart.
+  const start = toByteOffset(byteStart)
+  const end = toByteOffset(byteEnd)
+  if (start == null || end == null || end < start) {
+    return null
+  }
+  // end + 1 makes byteEnd inclusive: the slice covers the character at byteEnd.
+  const startIndex = utf8ByteOffsetToCodeUnitIndex(text, start)
+  const endIndex = utf8ByteOffsetToCodeUnitIndex(text, end + 1)
+  if (
+    startIndex >= endIndex ||
+    startIndex > text.length ||
+    endIndex > text.length
+  ) {
+    return null
+  }
+  // Code-unit range [start, end) together with the matching substring.
+  return {
+    start: startIndex,
+    end: endIndex,
+    text: text.slice(startIndex, endIndex),
+  }
+}
+
+// Builds a Map of agent id -> { id, role, name } from structuredLyric.agents.
+// Entries without a non-empty string id are skipped; the first id wins.
+const buildAgentLookup = (structuredLyric) => {
+  const lookup = new Map()
+  const asText = (value) => (typeof value === 'string' ? value : '')
+  const agents = Array.isArray(structuredLyric?.agents) ? structuredLyric.agents : []
+  agents.forEach((agent) => {
+    const id = asText(agent?.id)
+    if (id && !lookup.has(id)) {
+      lookup.set(id, {
+        id,
+        role: asText(agent?.role),
+        name: asText(agent?.name),
+      })
+    }
+  })
+  return lookup
+}
+
+// UI role for an agent: '' for a missing or "main" role, else the role itself.
+const deriveUiRole = (agent) => {
+  const role = agent?.role
+  const isPlain = !role || role === 'main'
+  return isPlain ? '' : role
+}
+
+const normalizeCueLine = (cueLine, fallbackIndex, agentLookup) => {
+  const index = Number.isFinite(Number(cueLine?.index))
+    ? Number(cueLine.index)
+    : fallbackIndex
+  const agentId = typeof cueLine?.agentId === 'string' ? cueLine.agentId : ''
+  const agent = agentId ? agentLookup.get(agentId) || null : null
+  const fallbackRole = typeof cueLine?.role === 'string' ? cueLine.role : ''
+  const tokens = sortTokensByStart(
+    Array.isArray(cueLine?.cue)
+      ? cueLine.cue.map(normalizeToken).filter(Boolean)
+      : [],
+  )
+  // The cue line's own role is used when agentId is absent or not in the lookup.
+  return {
+    index,
+    start: toTime(cueLine?.start),
+    end: toTime(cueLine?.end),
+    value: typeof cueLine?.value === 'string' ? cueLine.value : '',
+    role: agent ? deriveUiRole(agent) : fallbackRole,
+    agentId,
+    agentRole: agent?.role || fallbackRole,
+    agentName: agent?.name || '',
+    tokens,
+  }
+}
+
+// Maps a raw kind to one of the known layer kinds (case/space-insensitive);
+// anything unrecognized, including empty input, is treated as "main".
+const normalizeLyricKind = (kind) => {
+  const cleaned = (kind || '').toLowerCase().trim()
+  const isAuxiliary =
+    cleaned === LYRIC_KIND_TRANSLATION || cleaned === LYRIC_KIND_PRONUNCIATION
+  if (isAuxiliary) {
+    return cleaned
+  }
+  return LYRIC_KIND_MAIN
+}
+
+const pickLyricByLanguage = (lyrics, preferredLanguage) => {
+  if (!Array.isArray(lyrics) || lyrics.length === 0) {
+    return null
+  }
+  // Priority: preferred tag, its base language, English, then the first entry.
+  const preferred = normalizeLanguageTag(preferredLanguage)
+  const preferredBase = preferred.split('-')[0]
+  // 'eng' (ISO 639-2) never matches 'en' via languageMatch, so test both.
+  return (
+    lyrics.find((lyric) =>
+      languageMatch(normalizeLanguageTag(lyric.lang), preferred),
+    ) ||
+    lyrics.find((lyric) =>
+      languageMatch(normalizeLanguageTag(lyric.lang), preferredBase),
+    ) ||
+    lyrics.find((lyric) =>
+      ['en', 'eng'].some((en) => languageMatch(normalizeLanguageTag(lyric.lang), en)),
+    ) ||
+    lyrics[0]
+  )
+}
+
+const lineTimeWindow = (lines, index) => {
+  const current = lines[index]
+  if (current) {
+    return {
+      start: toTime(current.start),
+      end: toTime(current.end) ?? toTime(lines[index + 1]?.start), // next start as fallback
+    }
+  }
+  return { start: null, end: null } // no line at this index
+}
+
+// True when any cueLine carries at least one cue with a numeric start time.
+export const hasCueTiming = (structuredLyric) => {
+  const cueLines = structuredLyric?.cueLine
+  if (!Array.isArray(cueLines)) {
+    return false
+  }
+  const isTimedCue = (cue) => Number.isFinite(Number(cue?.start))
+  const cuesOf = (cueLine) => (Array.isArray(cueLine?.cue) ? cueLine.cue : [])
+  return cueLines.some((cueLine) => cuesOf(cueLine).some(isTimedCue))
+}
+
+// True when the lyric has any non-blank text line or any timed cue.
+export const hasStructuredLyricContent = (structuredLyric) => {
+  if (!structuredLyric) {
+    return false
+  }
+  const hasText = (line) => typeof line?.value === 'string' && line.value.trim() !== ''
+  const lines = Array.isArray(structuredLyric.line) ? structuredLyric.line : []
+  return lines.some(hasText) || hasCueTiming(structuredLyric)
+}
+
+// Preferred language: localStorage 'locale', else navigator.language, else 'en'.
+export const getPreferredLyricLanguage = () => {
+  const storage = typeof window !== 'undefined' ? window.localStorage : null
+  const stored = storage ? storage.getItem('locale') : null
+  if (stored) {
+    return stored
+  }
+  if (typeof navigator !== 'undefined' && navigator.language) {
+    return navigator.language
+  }
+  return 'en'
+}
+
+export const selectLyricLayers = (structuredLyrics, preferredLanguage) => {
+  if (!Array.isArray(structuredLyrics)) {
+    return {
+      main: null,
+      translation: null,
+      pronunciation: null,
+    }
+  }
+  // Ignore entries that have neither non-blank text lines nor timed cues.
+  const available = structuredLyrics.filter(hasStructuredLyricContent)
+  if (available.length === 0) {
+    return {
+      main: null,
+      translation: null,
+      pronunciation: null,
+    }
+  }
+  // Bucket the remaining lyrics by normalized kind.
+  const grouped = {
+    [LYRIC_KIND_MAIN]: [],
+    [LYRIC_KIND_TRANSLATION]: [],
+    [LYRIC_KIND_PRONUNCIATION]: [],
+  }
+  // Unknown or missing kinds land in the "main" bucket.
+  for (const lyric of available) {
+    grouped[normalizeLyricKind(lyric?.kind)].push(lyric)
+  }
+  // With no "main" entries at all, any available lyric may serve as main.
+  const mainCandidates = grouped[LYRIC_KIND_MAIN].length
+    ? grouped[LYRIC_KIND_MAIN]
+    : available
+  // Within each layer prefer timed lyrics, then pick by language.
+  return {
+    main: pickLyricByLanguage(
+      preferTimedLyrics(mainCandidates),
+      preferredLanguage,
+    ),
+    translation: pickLyricByLanguage(
+      preferTimedLyrics(grouped[LYRIC_KIND_TRANSLATION]),
+      preferredLanguage,
+    ),
+    pronunciation: pickLyricByLanguage(
+      preferTimedLyrics(grouped[LYRIC_KIND_PRONUNCIATION]),
+      preferredLanguage,
+    ),
+  }
+}
+
+export const pickStructuredLyric = (structuredLyrics, preferredLanguage) =>
+  selectLyricLayers(structuredLyrics, preferredLanguage).main // main layer only
+
+export const structuredLyricToLrc = (structuredLyric) => {
+  if (!structuredLyric || !Array.isArray(structuredLyric.line)) {
+    return ''
+  }
+  // Emits one "[mm:ss.xx] text" row per line whose start (ms) is valid.
+  let lyricText = ''
+  for (const line of structuredLyric.line) {
+    const start = Number(line.start)
+    if (!Number.isFinite(start) || start < 0) {
+      continue
+    }
+    // Split the start into centiseconds, seconds and total minutes.
+    let time = Math.floor(start / 10)
+    const ms = time % 100
+    time = Math.floor(time / 100)
+    const sec = time % 60
+    time = Math.floor(time / 60)
+    const min = time
+    // Minutes are not wrapped at 60, so lines past the first hour stay ordered.
+    lyricText += `[${padTime(min)}:${padTime(sec)}.${padTime(ms)}] ${line.value || ''}\n`
+  }
+  return lyricText
+}
+
+// Renders the main lyric chosen for preferredLanguage as LRC text.
+// Returns '' when no lyric can be selected.
+export const structuredLyricsToLrc = (structuredLyrics, preferredLanguage) => {
+  const chosen = pickStructuredLyric(structuredLyrics, preferredLanguage)
+  const hasChoice = Boolean(chosen)
+  return hasChoice ? structuredLyricToLrc(chosen) : ''
+}
+
+const buildBaseKaraokeLines = (baseLines) =>
+  baseLines.map(({ start, end, value }, index) => ({
+    index,
+    start: toTime(start),
+    end: toTime(end),
+    value: typeof value === 'string' ? value : '',
+    tokens: [], // plain lines carry no per-token cues
+  }))
+
+export const buildKaraokeLinesFromCueLines = (
+  rawCueLines,
+  baseLines,
+  agentLookup,
+) => {
+  const normalizedCueLines = rawCueLines.map((cueLine, fallbackIndex) => {
+    const normalized = normalizeCueLine(cueLine, fallbackIndex, agentLookup)
+    return {
+      ...normalized,
+      tokens: normalized.tokens.map((token) => ({
+        ...token,
+        role: normalized.role,
+        agentId: normalized.agentId,
+        agentName: normalized.agentName,
+        agentRole: normalized.agentRole,
+      })),
+    }
+  })
+  // Group cue lines by line index; several cue lines may share one index.
+  const byIndex = new Map()
+  for (const cueLine of normalizedCueLines) {
+    if (!byIndex.has(cueLine.index)) {
+      byIndex.set(cueLine.index, [])
+    }
+    byIndex.get(cueLine.index).push(cueLine)
+  }
+  // Merge each group; timing and text fall back to the base line, then tokens.
+  return Array.from(byIndex.entries()).map(([index, group]) => {
+    const first = group[0]
+    const baseLine = baseLines[index] || {}
+    const tokens = sortTokensByStart(group.flatMap((cueLine) => cueLine.tokens))
+    const fallbackStart =
+      tokens.find((token) => token.start != null)?.start ?? null
+    const fallbackEnd =
+      [...tokens].reverse().find((token) => token.end != null)?.end ?? null
+    const value =
+      first.value ||
+      (typeof baseLine.value === 'string' ? baseLine.value : '') ||
+      tokens.map((token) => token.value).join('')
+    // Agent metadata is taken from the first cue line of the group.
+    return {
+      index,
+      start: first.start ?? toTime(baseLine.start) ?? fallbackStart,
+      end: first.end ?? toTime(baseLine.end) ?? fallbackEnd,
+      value,
+      agentId: first.agentId,
+      agentName: first.agentName,
+      agentRole: first.agentRole,
+      tokens,
+    }
+  })
+}
+
+export const buildKaraokeLines = (structuredLyric) => {
+  if (!structuredLyric) {
+    return []
+  }
+  // Missing or non-array line/cueLine collections are treated as empty.
+  const agentLookup = buildAgentLookup(structuredLyric)
+  const baseLines = Array.isArray(structuredLyric.line)
+    ? structuredLyric.line
+    : []
+  const rawCueLines = Array.isArray(structuredLyric.cueLine)
+    ? structuredLyric.cueLine
+    : []
+  // Cue lines, when present, take precedence over the plain line list.
+  const lines =
+    rawCueLines.length > 0
+      ? buildKaraokeLinesFromCueLines(rawCueLines, baseLines, agentLookup)
+      : buildBaseKaraokeLines(baseLines)
+  // Drop empty lines, then order by start (untimed last), ties by index.
+  const normalized = lines
+    .filter((line) => line.value || line.tokens.length > 0)
+    .sort((a, b) => {
+      if (a.start == null && b.start == null) {
+        return a.index - b.index
+      }
+      if (a.start == null) {
+        return 1
+      }
+      if (b.start == null) {
+        return -1
+      }
+      if (a.start !== b.start) {
+        return a.start - b.start
+      }
+      return a.index - b.index
+    })
+  // Close open-ended lines at the start of the following line, when known.
+  for (let i = 0; i < normalized.length; i += 1) {
+    if (normalized[i].end == null) {
+      const nextStart = normalized[i + 1]?.start
+      if (nextStart != null) {
+        normalized[i].end = nextStart
+      }
+    }
+  }
+  // The loop above mutates the line objects built here, not the input lyric.
+  return normalized
+}
+
+export const resolveKaraokeTokenWindow = (
+  line,
+  tokenIndex,
+  lineEndFallback = null,
+) => {
+  const tokens = Array.isArray(line?.tokens) ? line.tokens : []
+  const token = tokens[tokenIndex]
+  if (!token) {
+    return { start: null, end: null }
+  }
+  // Neighbouring tokens supply fallback boundaries when this one lacks timing.
+  const prevToken = tokenIndex > 0 ? tokens[tokenIndex - 1] : null
+  const nextToken =
+    tokenIndex + 1 < tokens.length ? tokens[tokenIndex + 1] : null
+  // The line window (its end may come from lineEndFallback) is split evenly.
+  const lineStart = toTime(line?.start)
+  const lineEnd = toTime(line?.end) ?? toTime(lineEndFallback)
+  const tokenCount = tokens.length
+  const hasLineWindow =
+    lineStart != null &&
+    lineEnd != null &&
+    Number.isFinite(lineStart) &&
+    Number.isFinite(lineEnd) &&
+    lineEnd > lineStart
+  const estimatedStart =
+    hasLineWindow && tokenCount > 0
+      ? lineStart + ((lineEnd - lineStart) * tokenIndex) / tokenCount
+      : null
+  const estimatedEnd =
+    hasLineWindow && tokenCount > 0
+      ? lineStart + ((lineEnd - lineStart) * (tokenIndex + 1)) / tokenCount
+      : null
+  // Count explicit token times and how many distinct values they take.
+  let explicitStartCount = 0
+  let explicitEndCount = 0
+  const uniqueStarts = new Set()
+  const uniqueEnds = new Set()
+
+  for (let i = 0; i < tokenCount; i += 1) {
+    const explicitStart = toTime(tokens[i]?.start)
+    if (explicitStart != null) {
+      explicitStartCount += 1
+      uniqueStarts.add(explicitStart)
+    }
+
+    const explicitEnd = toTime(tokens[i]?.end)
+    if (explicitEnd != null) {
+      explicitEndCount += 1
+      uniqueEnds.add(explicitEnd)
+    }
+  }
+  // Times count as "collapsed" when tokens share very few distinct values.
+  const collapsedStarts =
+    explicitStartCount > 1 && uniqueStarts.size <= Math.max(1, tokenCount / 4)
+  const collapsedEnds =
+    explicitEndCount > 1 && uniqueEnds.size <= Math.max(1, tokenCount / 4)
+  const shouldForceEstimated =
+    hasLineWindow && tokenCount > 1 && (collapsedStarts || collapsedEnds)
+  // In that case explicit times are ignored in favour of the even split.
+  if (shouldForceEstimated) {
+    return {
+      start: estimatedStart,
+      end: estimatedEnd,
+    }
+  }
+  const prevEnd = toTime(prevToken?.end) ?? toTime(prevToken?.start)
+  // Missing start: previous token's end/start, then estimate, then line start.
+  let start = toTime(token.start)
+  if (start == null) {
+    start = prevEnd ?? estimatedStart ?? lineStart
+  }
+  // Missing end: next token's start, then the estimates, then line end.
+  let end = toTime(token.end)
+  if (end == null) {
+    const nextDirectStart = toTime(nextToken?.start)
+    const nextEstimatedStart =
+      hasLineWindow && tokenIndex + 1 < tokenCount
+        ? lineStart + ((lineEnd - lineStart) * (tokenIndex + 1)) / tokenCount
+        : null
+    end = nextDirectStart ?? nextEstimatedStart ?? estimatedEnd ?? lineEnd
+  }
+  // A lone token without a usable span takes over the whole line window.
+  if (
+    tokenCount === 1 &&
+    hasLineWindow &&
+    (start == null || end == null || end <= start + 1)
+  ) {
+    start = lineStart
+    end = lineEnd
+  }
+  // Never return an inverted window.
+  if (start != null && end != null && end < start) {
+    end = start
+  }
+
+  return { start, end }
+}
+
+export const getActiveKaraokeState = (lines, currentTimeMs) => {
+  if (!Array.isArray(lines) || lines.length === 0) {
+    return { lineIndex: -1, tokenIndex: -1 }
+  }
+  // A non-numeric playback time is treated as 0.
+  const current = Number.isFinite(Number(currentTimeMs))
+    ? Number(currentTimeMs)
+    : 0
+  let lineIndex = 0
+  for (let i = 0; i < lines.length; i += 1) {
+    const lineStart = toTime(lines[i]?.start)
+    if (lineStart == null || lineStart <= current + KARAOKE_SWITCH_EPSILON_MS) {
+      lineIndex = i
+      continue
+    }
+    break
+  }
+  // Walk backwards to the nearest started line that has not ended (if any).
+  for (let i = lineIndex; i >= 0; i -= 1) {
+    const lineStart = toTime(lines[i]?.start)
+    const lineEnd = toTime(lines[i]?.end) ?? toTime(lines[i + 1]?.start)
+    if (lineStart != null && current + KARAOKE_SWITCH_EPSILON_MS < lineStart) {
+      continue
+    }
+    if (lineEnd == null || current <= lineEnd + KARAOKE_SWITCH_EPSILON_MS) {
+      lineIndex = i
+      break
+    }
+  }
+  // Within the active line, advance to the last token that has started.
+  const activeLine = lines[lineIndex] || null
+  const tokens = Array.isArray(activeLine?.tokens) ? activeLine.tokens : []
+  let tokenIndex = -1
+  for (let i = 0; i < tokens.length; i += 1) {
+    const { start: tokenStart, end: tokenEnd } = resolveKaraokeTokenWindow(
+      activeLine,
+      i,
+      lines[lineIndex + 1]?.start,
+    )
+    if (
+      tokenStart == null ||
+      tokenStart <= current + KARAOKE_SWITCH_EPSILON_MS
+    ) {
+      tokenIndex = i
+      if (tokenEnd != null && current <= tokenEnd + KARAOKE_SWITCH_EPSILON_MS) {
+        break
+      }
+      continue
+    }
+    break
+  }
+  // tokenIndex stays -1 when the line has no tokens or none has started.
+  return { lineIndex, tokenIndex }
+}
+
+// True when any line has a start time or any token has a start or end time.
+export const hasUsableKaraokeTiming = (lines) => {
+  const isTimed = (entry) => toTime(entry?.start) != null
+  const isTimedToken = (token) => isTimed(token) || toTime(token?.end) != null
+  const hasTimedToken = (line) =>
+    Array.isArray(line?.tokens) && line.tokens.some(isTimedToken)
+  return (
+    Array.isArray(lines) && lines.some((line) => isTimed(line) || hasTimedToken(line))
+  )
+}
+
+export const findLayerLineIndexForMain = (mainLines, layerLines, mainIndex) => {
+  if (
+    !Array.isArray(mainLines) ||
+    !Array.isArray(layerLines) ||
+    mainLines.length === 0 ||
+    layerLines.length === 0 ||
+    mainIndex < 0 ||
+    mainIndex >= mainLines.length
+  ) {
+    return -1
+  }
+
+  const { start: mainStart, end: mainEnd } = lineTimeWindow(
+    mainLines,
+    mainIndex,
+  )
+  // A main line without a start time cannot be aligned.
+  if (mainStart == null) {
+    return -1
+  }
+  const mainWindowEnd = mainEnd ?? mainStart
+  const mainWindowDuration = Math.max(0, mainWindowEnd - mainStart)
+  const maxDelta = Math.max(550, Math.min(1400, mainWindowDuration + 420))
+  // Lowest score wins: start-time distance plus an index-distance penalty.
+  let bestIdx = -1
+  let bestScore = Number.POSITIVE_INFINITY
+
+  for (let i = 0; i < layerLines.length; i += 1) {
+    const { start, end } = lineTimeWindow(layerLines, i)
+    // Layer lines whose window overlaps the main window are always eligible.
+    if (start != null && end != null) {
+      const overlap = Math.min(end, mainEnd ?? end) - Math.max(start, mainStart)
+      if (overlap >= 0) {
+        const score = Math.abs(start - mainStart) + Math.abs(i - mainIndex) * 30
+        if (score < bestScore) {
+          bestScore = score
+          bestIdx = i
+        }
+        continue
+      }
+    }
+    // Otherwise accept only starts within maxDelta of the main start.
+    if (start != null) {
+      if (Math.abs(start - mainStart) > maxDelta) {
+        continue
+      }
+      const score = Math.abs(start - mainStart) + Math.abs(i - mainIndex) * 45
+      if (score < bestScore) {
+        bestScore = score
+        bestIdx = i
+      }
+    }
+  }
+
+  return bestIdx
+}
+
+export const resolveLayerLineForMain = (mainLines, layerLines, mainIndex) => {
+  const index = findLayerLineIndexForMain(mainLines, layerLines, mainIndex)
+  if (index < 0) {
+    return { index, line: null } // no matching layer line
+  }
+  return { index, line: layerLines[index] }
+}
+
+export const buildHighlightedMainLine = (line) => line // pass-through: returns its argument
+
+export const buildHighlightedAuxLine = (_referenceLine, auxiliaryLine) =>
+  auxiliaryLine ?? null // reference line is ignored; null/undefined become null
diff --git a/ui/src/audioplayer/lyrics.test.js b/ui/src/audioplayer/lyrics.test.js
new file mode 100644
index 000000000..1abea57a5
--- /dev/null
+++ b/ui/src/audioplayer/lyrics.test.js
@@ -0,0 +1,786 @@
+import {
+  buildHighlightedAuxLine,
+  buildHighlightedMainLine,
+  buildKaraokeLines,
+  buildKaraokeLinesFromCueLines,
+  findLayerLineIndexForMain,
+  getActiveKaraokeState,
+  getPreferredLyricLanguage,
+  hasUsableKaraokeTiming,
+  hasStructuredLyricContent,
+  pickStructuredLyric,
+  resolveKaraokeTokenWindow,
+  resolveLayerLineForMain,
+  selectLyricLayers,
+  structuredLyricsToLrc,
+  structuredLyricToLrc,
+  utf8ByteOffsetToCodeUnitIndex,
+  utf8ByteRangeToCodeUnitRange,
+} from './lyrics'
+
+describe('lyrics helpers', () => {
+  beforeEach(() => {
+    localStorage.clear()
+  })
+
+  it('prefers a lyric track that matches the locale', () => {
+    const selected = pickStructuredLyric(
+      [
+        {
+          lang: 'eng',
+          synced: true,
+          line: [{ start: 1000, value: 'English line' }],
+        },
+        {
+          lang: 'pt-BR',
+          synced: true,
+          line: [{ start: 1000, value: 'Linha em portugues' }],
+        },
+      ],
+      'pt-BR',
+    )
+
+    expect(selected.lang).toBe('pt-BR')
+  })
+
+  it('falls back to english when preferred locale is not available', () => {
+    const selected = pickStructuredLyric(
+      [
+        {
+          lang: 'eng',
+          synced: true,
+          line: [{ start: 1000, value: 'English line' }],
+        },
+        {
+          lang: 'deu',
+          synced: true,
+          line: [{ start: 1000, value: 'Deutsche Zeile' }],
+        },
+      ],
+      'pt-BR',
+    )
+
+    expect(selected.lang).toBe('eng')
+  })
+
+  it('falls back to first synced track when english is missing', () => {
+    const selected = pickStructuredLyric(
+      [
+        {
+          lang: 'jpn',
+          synced: true,
+          line: [{ start: 1000, value: 'Nihongo' }],
+        },
+        {
+          lang: 'deu',
+          synced: true,
+          line: [{ start: 1000, value: 'Deutsch' }],
+        },
+      ],
+      'pt-BR',
+    )
+
+    expect(selected.lang).toBe('jpn')
+  })
+
+  it('selects translation and pronunciation layers by kind', () => {
+    const layers = selectLyricLayers(
+      [
+        {
+          kind: 'main',
+          lang: 'ja',
+          synced: true,
+          line: [{ start: 1000, value: 'こんにちは' }],
+        },
+        {
+          kind: 'translation',
+          lang: 'es',
+          synced: true,
+          line: [{ start: 1000, value: 'Hola' }],
+        },
+        {
+          kind: 'pronunciation',
+          lang: 'ja-Latn',
+          synced: true,
+          line: [{ start: 1000, value: 'konnichiwa' }],
+        },
+      ],
+      'es-MX',
+    )
+
+    expect(layers.main.lang).toBe('ja')
+    expect(layers.translation.lang).toBe('es')
+    expect(layers.pronunciation.lang).toBe('ja-Latn')
+  })
+
+  it('treats missing kind as main for backward compatibility', () => {
+    const layers = selectLyricLayers(
+      [
+        {
+          lang: 'eng',
+          synced: true,
+          line: [{ start: 1000, value: 'Main' }],
+        },
+      ],
+      'eng',
+    )
+
+    expect(layers.main.lang).toBe('eng')
+    expect(layers.translation).toBeNull()
+    expect(layers.pronunciation).toBeNull()
+  })
+
+  it('falls back to unsynced lyric content when no timed track exists', () => {
+    const layers = selectLyricLayers(
+      [
+        {
+          lang: 'eng',
+          synced: false,
+          line: [{ value: 'Plain embedded lyric' }],
+        },
+      ],
+      'eng',
+    )
+
+    expect(layers.main).toEqual({
+      lang: 'eng',
+      synced: false,
+      line: [{ value: 'Plain embedded lyric' }],
+    })
+  })
+
+  it('still prefers timed lyrics when both timed and untimed tracks exist', () => {
+    const layers = selectLyricLayers(
+      [
+        {
+          lang: 'eng',
+          synced: false,
+          line: [{ value: 'Plain lyric' }],
+        },
+        {
+          lang: 'eng',
+          synced: true,
+          line: [{ start: 1000, value: 'Timed lyric' }],
+        },
+      ],
+      'eng',
+    )
+
+    expect(layers.main).toEqual({
+      lang: 'eng',
+      synced: true,
+      line: [{ start: 1000, value: 'Timed lyric' }],
+    })
+  })
+
+  it('matches layer line by timing for the active main line', () => {
+    const mainLines = [
+      { index: 0, start: 1000, end: 1800, value: 'Line A', tokens: [] },
+      { index: 1, start: 2000, end: 2800, value: 'Line B', tokens: [] },
+    ]
+    const layerLines = [
+      { index: 0, start: 900, end: 1750, value: 'A2', tokens: [] },
+      { index: 1, start: 2050, end: 2900, value: 'B2', tokens: [] },
+    ]
+
+    expect(findLayerLineIndexForMain(mainLines, layerLines, 1)).toBe(1)
+    expect(resolveLayerLineForMain(mainLines, layerLines, 0).line.value).toBe(
+      'A2',
+    )
+  })
+
+  it('matches metadata layers by nearest timing even when indexes differ', () => {
+    const mainLines = [
+      { index: 0, start: 1000, end: 1800, value: 'Line A', tokens: [] },
+      { index: 1, start: 2000, end: 2800, value: 'Line B', tokens: [] },
+      { index: 2, start: 3000, end: 3800, value: 'Line C', tokens: [] },
+    ]
+    const layerLines = [
+      { index: 2, start: 3020, end: 3820, value: 'C2', tokens: [] },
+      { index: 0, start: 980, end: 1760, value: 'A2', tokens: [] },
+      { index: 1, start: 2010, end: 2810, value: 'B2', tokens: [] },
+    ]
+
+    expect(findLayerLineIndexForMain(mainLines, layerLines, 1)).toBe(2)
+    expect(resolveLayerLineForMain(mainLines, layerLines, 2).line.value).toBe(
+      'C2',
+    )
+  })
+
+  it('keeps translation lines line-level when they do not have real cue timing', () => {
+    const mainLine = {
+      index: 0,
+      start: 1000,
+      end: 2200,
+      value: '불을 질러라',
+      tokens: [
+        { start: 1000, end: 1300, value: '불을 ' },
+        { start: 1300, end: 1650, value: '질' },
+        { start: 1650, end: 2200, value: '러라' },
+      ],
+    }
+    const translationLine = {
+      index: 0,
+      start: 1000,
+      end: 2200,
+      value: 'Set it on fire',
+      tokens: [],
+    }
+
+    const highlighted = buildHighlightedAuxLine(mainLine, translationLine, 2600)
+
+    expect(highlighted).toBe(translationLine)
+    expect(highlighted.tokens).toEqual([])
+  })
+
+  it('keeps pronunciation lines line-level when they do not have real cue timing', () => {
+    const mainLine = {
+      index: 0,
+      start: 1000,
+      end: 2200,
+      value: 'You もっと強く 素早く 吹き飛ばせ',
+      tokens: [],
+    }
+    const pronunciationLine = {
+      index: 0,
+      start: 1000,
+      end: 2200,
+      value: 'You motto tsuyoku subayaku fukitobase',
+      tokens: [],
+    }
+
+    const highlighted = buildHighlightedAuxLine(
+      mainLine,
+      pronunciationLine,
+      2600,
+    )
+
+    expect(highlighted).toBe(pronunciationLine)
+    expect(highlighted.tokens).toEqual([])
+  })
+
+  it('keeps main lines line-level when they do not have real cue timing', () => {
+    const line = {
+      index: 0,
+      start: 1000,
+      end: 2200,
+      value: 'Youもっと強く 素早く 吹き飛ばせ',
+      tokens: [],
+    }
+
+    const highlighted = buildHighlightedMainLine(line, 2600)
+
+    expect(highlighted).toBe(line)
+    expect(highlighted.tokens).toEqual([])
+  })
+
+  it('keeps auxiliary lines line-level when end time is missing and they lack cues', () => {
+    const mainLine = {
+      index: 0,
+      start: 1000,
+      end: null,
+      value: 'Hello there',
+      tokens: [],
+    }
+    const translationLine = {
+      index: 0,
+      start: 1000,
+      end: null,
+      value: 'Bonjour toi',
+      tokens: [],
+    }
+
+    const highlighted = buildHighlightedAuxLine(mainLine, translationLine, 2400)
+
+    expect(highlighted).toBe(translationLine)
+    expect(highlighted.tokens).toEqual([])
+  })
+
+  it('keeps main lines line-level when end time is missing and they lack cues', () => {
+    const line = {
+      index: 0,
+      start: 1000,
+      end: null,
+      value: 'One more time',
+      tokens: [],
+    }
+
+    const highlighted = buildHighlightedMainLine(line, 2400)
+
+    expect(highlighted).toBe(line)
+    expect(highlighted.tokens).toEqual([])
+  })
+
+  it('returns no layer match when the nearest line is too far in time', () => {
+    const mainLines = [
+      { index: 0, start: 1000, end: 1800, value: 'Line A', tokens: [] },
+      { index: 1, start: 2000, end: 2800, value: 'Line B', tokens: [] },
+    ]
+    const layerLines = [
+      { index: 0, start: 60000, end: 60800, value: 'Far line', tokens: [] },
+    ]
+
+    expect(findLayerLineIndexForMain(mainLines, layerLines, 1)).toBe(-1)
+    expect(resolveLayerLineForMain(mainLines, layerLines, 1).line).toBeNull()
+  })
+
+  it('converts a structured lyric track to LRC', () => {
+    const lrc = structuredLyricToLrc({
+      lang: 'eng',
+      synced: true,
+      line: [
+        { start: 18800, value: "We're no strangers to love" },
+        { start: 22801, value: 'You know the rules and so do I' },
+      ],
+    })
+
+    expect(lrc).toBe(
+      "[00:18.80] We're no strangers to love\n[00:22.80] You know the rules and so do I\n",
+    )
+  })
+
+  it('returns empty text when no synced lyrics are available', () => {
+    const lrc = structuredLyricsToLrc(
+      [{ lang: 'eng', synced: false, line: [{ value: 'Unsynced line' }] }],
+      'eng',
+    )
+
+    expect(lrc).toBe('')
+  })
+
+  it('reads preferred language from localStorage first', () => {
+    localStorage.setItem('locale', 'pt-BR')
+    expect(getPreferredLyricLanguage()).toBe('pt-BR')
+  })
+
+  it('builds karaoke lines from agent-based cueLine payload', () => {
+    const lines = buildKaraokeLines({
+      lang: 'eng',
+      synced: true,
+      line: [{ start: 1000, end: 3000, value: 'Hello world' }],
+      agents: [
+        { id: 'lead', role: 'main', name: 'Lead Vocal' },
+        { id: 'backing', role: 'bg' },
+      ],
+      cueLine: [
+        {
+          index: 0,
+          start: 1000,
+          end: 3000,
+          value: 'Hello world',
+          agentId: 'lead',
+          cue: [{ start: 1000, end: 1500, value: 'Hello' }],
+        },
+        {
+          index: 0,
+          start: 1000,
+          end: 3000,
+          value: 'Hello world',
+          agentId: 'backing',
+          cue: [{ start: 2000, end: 2500, value: 'world' }],
+        },
+      ],
+    })
+
+    expect(lines).toEqual([
+      {
+        agentId: 'lead',
+        agentName: 'Lead Vocal',
+        agentRole: 'main',
+        index: 0,
+        start: 1000,
+        end: 3000,
+        value: 'Hello world',
+        tokens: [
+          {
+            start: 1000,
+            end: 1500,
+            value: 'Hello',
+            role: '',
+            agentId: 'lead',
+            agentName: 'Lead Vocal',
+            agentRole: 'main',
+          },
+          {
+            start: 2000,
+            end: 2500,
+            value: 'world',
+            role: 'bg',
+            agentId: 'backing',
+            agentName: '',
+            agentRole: 'bg',
+          },
+        ],
+      },
+    ])
+  })
+
+  it('builds grouped karaoke lines directly from cue lines', () => {
+    const agentLookup = new Map([
+      ['lead', { id: 'lead', role: 'main', name: 'Lead Vocal' }],
+      ['backing', { id: 'backing', role: 'bg', name: '' }],
+    ])
+
+    const lines = buildKaraokeLinesFromCueLines(
+      [
+        {
+          index: 0,
+          start: 1000,
+          end: 3000,
+          value: 'Hello world',
+          agentId: 'lead',
+          cue: [{ start: 1000, end: 1500, value: 'Hello' }],
+        },
+        {
+          index: 0,
+          start: 1000,
+          end: 3000,
+          value: 'Hello world',
+          agentId: 'backing',
+          cue: [{ start: 2000, end: 2500, value: 'world' }],
+        },
+      ],
+      [{ start: 1000, end: 3000, value: 'Hello world' }],
+      agentLookup,
+    )
+
+    expect(lines).toEqual([
+      {
+        agentId: 'lead',
+        agentName: 'Lead Vocal',
+        agentRole: 'main',
+        index: 0,
+        start: 1000,
+        end: 3000,
+        value: 'Hello world',
+        tokens: [
+          {
+            start: 1000,
+            end: 1500,
+            value: 'Hello',
+            role: '',
+            agentId: 'lead',
+            agentName: 'Lead Vocal',
+            agentRole: 'main',
+          },
+          {
+            start: 2000,
+            end: 2500,
+            value: 'world',
+            role: 'bg',
+            agentId: 'backing',
+            agentName: '',
+            agentRole: 'bg',
+          },
+        ],
+      },
+    ])
+  })
+
+  it('preserves cue byte offsets on karaoke tokens', () => {
+    const lines = buildKaraokeLines({
+      lang: 'eng',
+      synced: true,
+      line: [{ start: 0, end: 2400, value: 'Oh love love me tonight' }],
+      cueLine: [
+        {
+          index: 0,
+          start: 0,
+          end: 2400,
+          value: 'Oh love love me tonight',
+          cue: [
+            { start: 0, end: 300, value: 'Oh', byteStart: 0, byteEnd: 1 },
+            { start: 900, end: 1300, value: 'love', byteStart: 8, byteEnd: 11 },
+            { start: 1300, end: 1600, value: 'me', byteStart: 13, byteEnd: 14 },
+            {
+              start: 1600,
+              end: 2400,
+              value: 'tonight',
+              byteStart: 16,
+              byteEnd: 22,
+            },
+          ],
+        },
+      ],
+    })
+
+    expect(
+      lines[0].tokens.map((token) => [
+        token.value,
+        token.byteStart,
+        token.byteEnd,
+      ]),
+    ).toEqual([
+      ['Oh', 0, 1],
+      ['love', 8, 11],
+      ['me', 13, 14],
+      ['tonight', 16, 22],
+    ])
+  })
+
+  it('preserves whitespace-only cues for exact byte-range rendering', () => {
+    const lines = buildKaraokeLines({
+      lang: 'kor',
+      synced: true,
+      line: [{ start: 0, end: 900, value: '눈을 뜬 순간' }],
+      cueLine: [
+        {
+          index: 0,
+          start: 0,
+          end: 900,
+          value: '눈을 뜬 순간',
+          cue: [
+            { start: 0, end: 150, value: '눈을', byteStart: 0, byteEnd: 5 },
+            { start: 150, end: 250, value: ' ', byteStart: 6, byteEnd: 6 },
+            { start: 250, end: 450, value: '뜬', byteStart: 7, byteEnd: 9 },
+            { start: 450, end: 550, value: ' ', byteStart: 10, byteEnd: 10 },
+            { start: 550, end: 900, value: '순간', byteStart: 11, byteEnd: 16 },
+          ],
+        },
+      ],
+    })
+
+    expect(
+      lines[0].tokens.map((token) => [
+        token.value,
+        token.byteStart,
+        token.byteEnd,
+      ]),
+    ).toEqual([
+      ['눈을', 0, 5],
+      [' ', 6, 6],
+      ['뜬', 7, 9],
+      [' ', 10, 10],
+      ['순간', 11, 16],
+    ])
+  })
+
+  it('maps UTF-8 byte offsets to string ranges for multibyte lyrics', () => {
+    const text = '눈을 뜬 순간'
+
+    expect(utf8ByteOffsetToCodeUnitIndex(text, 0)).toBe(0)
+    expect(utf8ByteOffsetToCodeUnitIndex(text, 3)).toBe(1)
+    expect(utf8ByteOffsetToCodeUnitIndex(text, 7)).toBe(3)
+    expect(utf8ByteRangeToCodeUnitRange(text, 11, 16)).toEqual({
+      start: 5,
+      end: 7,
+      text: '순간',
+    })
+  })
+
+  it('falls back to legacy cueLine role values when agents are absent', () => {
+    const lines = buildKaraokeLines({
+      lang: 'eng',
+      synced: true,
+      line: [{ start: 1000, end: 3000, value: 'Hello world' }],
+      cueLine: [
+        {
+          index: 0,
+          start: 1000,
+          end: 3000,
+          value: 'Hello world',
+          role: 'bg',
+          cue: [{ start: 1000, end: 1500, value: 'Hello' }],
+        },
+      ],
+    })
+
+    expect(lines[0].tokens[0].role).toBe('bg')
+    expect(lines[0].tokens[0].agentId).toBe('')
+    expect(lines[0].tokens[0].agentName).toBe('')
+  })
+
+  it('sorts token timing by start to keep playback stable', () => {
+    const lines = buildKaraokeLines({
+      lang: 'eng',
+      synced: true,
+      line: [{ start: 1000, end: 3000, value: 'Hello world' }],
+      cueLine: [
+        {
+          index: 0,
+          start: 1000,
+          end: 3000,
+          value: 'Hello world',
+          role: '',
+          cue: [
+            { start: 2000, end: 2500, value: 'world' },
+            { start: 1000, end: 1500, value: 'Hello' },
+          ],
+        },
+      ],
+    })
+
+    expect(lines[0].tokens.map((token) => token.value)).toEqual([
+      'Hello',
+      'world',
+    ])
+  })
+
+  it('keeps a single full-line token unchanged instead of expanding it synthetically', () => {
+    const lines = buildKaraokeLines({
+      lang: 'ko-Latn',
+      synced: true,
+      line: [{ start: 1000, end: 2000, value: 'Da-la-lun, dun' }],
+      cueLine: [
+        {
+          index: 0,
+          start: 1000,
+          end: 2000,
+          value: 'Da-la-lun, dun',
+          role: '',
+          cue: [{ start: 1000, end: 2000, value: 'Da-la-lun, dun' }],
+        },
+      ],
+    })
+
+    expect(lines).toHaveLength(1)
+    expect(lines[0].tokens).toHaveLength(1)
+    expect(lines[0].tokens[0].value).toBe('Da-la-lun, dun')
+
+    const firstWindow = resolveKaraokeTokenWindow(lines[0], 0)
+
+    expect(firstWindow.start).toBeCloseTo(1000)
+    expect(firstWindow.end).toBeCloseTo(2000)
+  })
+
+  it('detects active line and token for karaoke timing', () => {
+    const state = getActiveKaraokeState(
+      [
+        {
+          index: 0,
+          start: 1000,
+          end: 3000,
+          value: 'Hello world',
+          tokens: [
+            { start: 1000, end: 1500, value: 'Hello', role: '' },
+            { start: 2000, end: 2500, value: 'world', role: '' },
+          ],
+        },
+        {
+          index: 1,
+          start: 3500,
+          end: 5000,
+          value: 'Second line',
+          tokens: [],
+        },
+      ],
+      2200,
+    )
+
+    expect(state).toEqual({ lineIndex: 0, tokenIndex: 1 })
+  })
+
+  it('resolves token window fallback boundaries from neighboring tokens', () => {
+    const line = {
+      start: 1000,
+      end: 3000,
+      value: 'Hello world',
+      tokens: [
+        { start: 1200, value: 'Hello', role: '' },
+        { start: 1800, value: 'world', role: '' },
+      ],
+    }
+
+    expect(resolveKaraokeTokenWindow(line, 0)).toEqual({
+      start: 1200,
+      end: 1800,
+    })
+    expect(resolveKaraokeTokenWindow(line, 1)).toEqual({
+      start: 1800,
+      end: 3000,
+    })
+  })
+
+  it('infers sequential token windows when token timings are missing', () => {
+    const line = {
+      start: 1000,
+      end: 2000,
+      value: 'A B C',
+      tokens: [
+        { value: 'A', role: '' },
+        { value: 'B', role: '' },
+        { value: 'C', role: '' },
+      ],
+    }
+
+    const first = resolveKaraokeTokenWindow(line, 0)
+    const second = resolveKaraokeTokenWindow(line, 1)
+    const third = resolveKaraokeTokenWindow(line, 2)
+
+    expect(first.start).toBeCloseTo(1000)
+    expect(first.end).toBeCloseTo(1333.3333333333333)
+
+    expect(second.start).toBeCloseTo(1333.3333333333333)
+    expect(second.end).toBeCloseTo(1666.6666666666667)
+
+    expect(third.start).toBeCloseTo(1666.6666666666667)
+    expect(third.end).toBeCloseTo(2000)
+  })
+
+  it('falls back to sequential windows when token timings are collapsed', () => {
+    const line = {
+      start: 1000,
+      end: 2000,
+      value: 'A B C',
+      tokens: [
+        { start: 1000, end: 2000, value: 'A', role: '' },
+        { start: 1000, end: 2000, value: 'B', role: '' },
+        { start: 1000, end: 2000, value: 'C', role: '' },
+      ],
+    }
+
+    const first = resolveKaraokeTokenWindow(line, 0)
+    const second = resolveKaraokeTokenWindow(line, 1)
+    const third = resolveKaraokeTokenWindow(line, 2)
+
+    expect(first.start).toBeCloseTo(1000)
+    expect(first.end).toBeCloseTo(1333.3333333333333)
+    expect(second.start).toBeCloseTo(1333.3333333333333)
+    expect(second.end).toBeCloseTo(1666.6666666666667)
+    expect(third.start).toBeCloseTo(1666.6666666666667)
+    expect(third.end).toBeCloseTo(2000)
+  })
+
+  it('keeps token selection stable near tight token boundaries', () => {
+    const state = getActiveKaraokeState(
+      [
+        {
+          index: 0,
+          start: 1000,
+          end: 2000,
+          value: 'A B',
+          tokens: [
+            { start: 1000, end: 1100, value: 'A', role: '' },
+            { start: 1110, end: 1300, value: 'B', role: '' },
+          ],
+        },
+      ],
+      1108,
+    )
+
+    expect(state).toEqual({ lineIndex: 0, tokenIndex: 0 })
+  })
+
+  it('reports structured lyric content when token timing exists', () => {
+    expect(
+      hasStructuredLyricContent({
+        cueLine: [{ cue: [{ start: 100, value: 'a' }] }],
+      }),
+    ).toBe(true)
+  })
+
+  it('detects when built karaoke lines have no usable timing', () => {
+    expect(
+      hasUsableKaraokeTiming([
+        { index: 0, value: 'First line', tokens: [] },
+        { index: 1, value: 'Second line', tokens: [] },
+      ]),
+    ).toBe(false)
+
+    expect(
+      hasUsableKaraokeTiming([
+        { index: 0, start: 1000, value: 'Timed line', tokens: [] },
+      ]),
+    ).toBe(true)
+  })
+})
diff --git a/ui/src/audioplayer/lyricsOverlayState.js b/ui/src/audioplayer/lyricsOverlayState.js
new file mode 100644
index 000000000..e8ff0e0a8
--- /dev/null
+++ b/ui/src/audioplayer/lyricsOverlayState.js
@@ -0,0 +1,27 @@
+// Combines the stored overlay preferences with per-layer lyric availability.
+// Each flag is ANDed with availability, so it is falsy when a lyric is missing.
+export const resolveLyricsOverlayState = ({
+  karaokeVisiblePreference,
+  translationPreference,
+  pronunciationPreference,
+  hasKaraokeLyric,
+  hasTranslationLyric,
+  hasPronunciationLyric,
+}) => {
+  // A null/undefined pronunciation preference follows availability.
+  const pronunciationWanted = pronunciationPreference ?? hasPronunciationLyric
+  return {
+    karaokeVisible: karaokeVisiblePreference && hasKaraokeLyric,
+    showTranslation: translationPreference && hasTranslationLyric,
+    showPronunciation: pronunciationWanted && hasPronunciationLyric,
+  }
+}
+
+// Flips the pronunciation preference; always false when no such lyric exists.
+export const togglePronunciationPreference = (
+  previousPreference,
+  hasPronunciationLyric,
+) =>
+  hasPronunciationLyric
+    ? !(previousPreference ?? hasPronunciationLyric)
+    : false
diff --git a/ui/src/audioplayer/styles.js b/ui/src/audioplayer/styles.js
index 30a14d4db..30ccf7afb 100644
--- a/ui/src/audioplayer/styles.js
+++ b/ui/src/audioplayer/styles.js
@@ -62,12 +62,30 @@ const useStyle = makeStyles(
           // Fix cover display when image is not square
           aspectRatio: '1/1',
           display: 'flex',
+          position: 'relative',
+        },
+      '& .react-jinke-music-player-mobile .react-jinke-music-player-mobile-cover.nd-mobile-lyrics-active':
+        {
+          width: '100%',
+          maxWidth: 'none',
+          height: 'clamp(280px, 42vh, 460px)',
+          aspectRatio: 'auto',
+          borderRadius: 12,
+          border: 'none',
+          boxShadow: 'none',
+          background: 'transparent',
+          cursor: 'default',
         },
       '& .react-jinke-music-player-mobile .react-jinke-music-player-mobile-cover img.cover':
         {
           animationDuration: (props) => !props.enableCoverAnimation && '0s',
           objectFit: 'contain', // Fix cover display when image is not square
         },
+      '& .react-jinke-music-player-mobile .react-jinke-music-player-mobile-cover.nd-mobile-lyrics-active img.cover':
+        {
+          opacity: 0,
+          pointerEvents: 'none',
+        },
       // Hide old singer display
       '& .react-jinke-music-player-mobile .react-jinke-music-player-mobile-singer':
         {
diff --git a/ui/src/reducers/playerReducer.js b/ui/src/reducers/playerReducer.js
index d6ab7484b..449dcd294 100644
--- a/ui/src/reducers/playerReducer.js
+++ b/ui/src/reducers/playerReducer.js
@@ -7,6 +7,7 @@ import {
   PLAYER_CURRENT,
   PLAYER_PLAY_NEXT,
   PLAYER_PLAY_TRACKS,
+  PLAYER_UPDATE_LYRIC,
   PLAYER_SET_TRACK,
   PLAYER_SET_VOLUME,
   PLAYER_SYNC_QUEUE,
@@ -60,21 +61,25 @@ const mapToAudioLists = (item) => {
   let lyricText = ''
 
   if (lyrics) {
-    const structured = JSON.parse(lyrics)
-    for (const structuredLyric of structured) {
-      if (structuredLyric.synced) {
-        for (const line of structuredLyric.line) {
-          let time = Math.floor(line.start / 10)
-          const ms = time % 100
-          time = Math.floor(time / 100)
-          const sec = time % 60
-          time = Math.floor(time / 60)
-          const min = time % 60
+    try {
+      const structured = JSON.parse(lyrics)
+      for (const structuredLyric of structured) {
+        if (structuredLyric.synced) {
+          for (const line of structuredLyric.line) {
+            let time = Math.floor(line.start / 10)
+            const ms = time % 100
+            time = Math.floor(time / 100)
+            const sec = time % 60
+            time = Math.floor(time / 60)
+            const min = time % 60
 
-          ms.toString()
-          lyricText += `[${pad(min)}:${pad(sec)}.${pad(ms)}] ${line.value}\n`
+            ms.toString()
+            lyricText += `[${pad(min)}:${pad(sec)}.${pad(ms)}] ${line.value}\n`
+          }
         }
       }
+    } catch {
+      lyricText = ''
     }
   }
 
@@ -208,6 +213,45 @@ const reduceMode = (state, { data: { mode } }) => {
   }
 }
 
+// Applies a refreshed lyric to every queue entry with the given trackId and
+// mirrors it onto `current` when that is the same track. The original state
+// object is returned when trackId is falsy or when no queue entry changes.
+const reduceUpdateLyric = (state, { data: { trackId, lyric } }) => {
+  if (!trackId) {
+    return state
+  }
+
+  const isStale = (item) => item.trackId === trackId && item.lyric !== lyric
+  // Check first so a no-op update hands back the very same state object.
+  if (!state.queue.some(isStale)) {
+    return state
+  }
+
+  const queue = state.queue.map((item) => {
+    if (!isStale(item)) {
+      return item
+    }
+    return {
+      ...item,
+      lyric,
+    }
+  })
+
+  let current = state.current
+  if (current?.trackId === trackId) {
+    current = {
+      ...current,
+      lyric,
+    }
+  }
+
+  return {
+    ...state,
+    queue,
+    current,
+  }
+}
+
 export const playerReducer = (previousState = initialState, payload) => {
   const { type } = payload
   switch (type) {
@@ -245,6 +289,8 @@ export const playerReducer = (previousState = initialState, payload) => {
           previousState.savedPlayIndex >= 0 ? previousState.savedPlayIndex : 0,
       }
     }
+    case PLAYER_UPDATE_LYRIC:
+      return reduceUpdateLyric(previousState, payload)
     default:
       return previousState
   }
diff --git a/ui/src/reducers/playerReducer.test.js b/ui/src/reducers/playerReducer.test.js
index 110ce8c53..43f24ec55 100644
--- a/ui/src/reducers/playerReducer.test.js
+++ b/ui/src/reducers/playerReducer.test.js
@@ -1,11 +1,24 @@
-import { describe, it, expect } from 'vitest'
+import { describe, expect, it, vi } from 'vitest'
 import { playerReducer } from './playerReducer'
 import {
-  PLAYER_SYNC_QUEUE,
   PLAYER_CURRENT,
   PLAYER_REFRESH_QUEUE,
+  PLAYER_SET_TRACK,
+  PLAYER_SYNC_QUEUE,
+  PLAYER_UPDATE_LYRIC,
 } from '../actions'
 
+vi.mock('uuid', () => ({
+  v4: () => 'test-uuid',
+}))
+
+vi.mock('../subsonic', () => ({
+  default: {
+    streamUrl: vi.fn((id) => `/rest/stream?id=${id}`),
+    getCoverArtUrl: vi.fn(() => '/rest/getCoverArt?id=test'),
+  },
+}))
+
 describe('playerReducer', () => {
   describe('pending track selection survives SYNC_QUEUE and premature CURRENT', () => {
     // Simulates the real sequence when clicking a new song while one is playing:
@@ -54,8 +67,6 @@ describe('playerReducer', () => {
     })
 
     it('CURRENT for old track preserves pending playIndex', () => {
-      // After SYNC_QUEUE, queue has new UUIDs. The old track's UUID (zzz)
-      // is at index 2, but playIndex is 0. This is a premature callback.
       const stateAfterSync = {
         ...stateAfterPlayTracks,
         queue: [
@@ -71,7 +82,7 @@ describe('playerReducer', () => {
       const result = playerReducer(stateAfterSync, action)
       expect(result.playIndex).toBe(0)
       expect(result.clear).toBe(true)
-      expect(result.savedPlayIndex).toBe(2) // preserved from before
+      expect(result.savedPlayIndex).toBe(2)
     })
 
     it('CURRENT for correct track consumes pending playIndex', () => {
@@ -83,7 +94,6 @@ describe('playerReducer', () => {
           { trackId: 's3', uuid: 'zzz', name: 'Song 3' },
         ],
       }
-      // Player switched to Song 1 (uuid 'xxx', index 0 == playIndex)
       const action = {
         type: PLAYER_CURRENT,
         data: { uuid: 'xxx', name: 'Song 1', volume: 1 },
@@ -224,4 +234,80 @@ describe('playerReducer', () => {
       expect(result.playIndex).toBe(0)
     })
   })
+
+  it('maps embedded synced lyrics to LRC text', () => {
+    const lyrics = JSON.stringify([
+      {
+        lang: 'eng',
+        synced: true,
+        line: [{ start: 1000, value: 'Line one' }],
+      },
+      {
+        lang: 'eng',
+        synced: false,
+        line: [{ value: 'Unsynced line' }],
+      },
+    ])
+
+    const state = playerReducer(undefined, {
+      type: PLAYER_SET_TRACK,
+      data: {
+        id: 'song-1',
+        title: 'Test Song',
+        artist: 'Test Artist',
+        album: 'Test Album',
+        duration: 60,
+        lyrics,
+      },
+    })
+
+    expect(state.queue).toHaveLength(1)
+    expect(state.queue[0].lyric).toBe('[00:01.00] Line one\n')
+  })
+
+  it('updates queue lyric by track id', () => {
+    const initial = playerReducer(undefined, {
+      type: PLAYER_SET_TRACK,
+      data: {
+        id: 'song-1',
+        title: 'Test Song',
+        artist: 'Test Artist',
+        album: 'Test Album',
+        duration: 60,
+      },
+    })
+
+    const updated = playerReducer(initial, {
+      type: PLAYER_UPDATE_LYRIC,
+      data: {
+        trackId: 'song-1',
+        lyric: '[00:01.00] Updated lyric\n',
+      },
+    })
+
+    expect(updated.queue[0].lyric).toBe('[00:01.00] Updated lyric\n')
+  })
+
+  it('returns same state when lyric update does not match any track', () => {
+    const initial = playerReducer(undefined, {
+      type: PLAYER_SET_TRACK,
+      data: {
+        id: 'song-1',
+        title: 'Test Song',
+        artist: 'Test Artist',
+        album: 'Test Album',
+        duration: 60,
+      },
+    })
+
+    const updated = playerReducer(initial, {
+      type: PLAYER_UPDATE_LYRIC,
+      data: {
+        trackId: 'missing-track',
+        lyric: '[00:01.00] Updated lyric\n',
+      },
+    })
+
+    expect(updated).toBe(initial)
+  })
 })
diff --git a/ui/src/subsonic/index.js b/ui/src/subsonic/index.js
index 3579619aa..47ebabe99 100644
--- a/ui/src/subsonic/index.js
+++ b/ui/src/subsonic/index.js
@@ -1,5 +1,5 @@
-import { baseUrl } from '../utils'
 import { httpClient } from '../dataProvider'
+import { baseUrl } from '../utils'
 
 const url = (command, id, options) => {
   const username = localStorage.getItem('username')
@@ -120,6 +120,10 @@ const getTopSongs = (artist, count = 50) => {
   return httpClient(url('getTopSongs', null, { artist, count }))
 }
 
+// Requests getLyricsBySongId for a song id with the `enhanced: true` option.
+const getLyricsBySongId = (id) =>
+  httpClient(url('getLyricsBySongId', id, { enhanced: true }))
+
 const streamUrl = (id, options) => {
   return baseUrl(
     url('stream', id, {
@@ -149,4 +153,5 @@ export default {
   getArtistInfo,
   getTopSongs,
   getSimilarSongs2,
+  getLyricsBySongId,
 }
diff --git a/ui/src/subsonic/index.test.js b/ui/src/subsonic/index.test.js
index a750694f4..6bd5e08ee 100644
--- a/ui/src/subsonic/index.test.js
+++ b/ui/src/subsonic/index.test.js
@@ -1,7 +1,13 @@
 import { vi } from 'vitest'
-import config from '../config'
+import { httpClient } from '../dataProvider'
 import subsonic from './index'
 
+vi.mock('../dataProvider', () => ({
+  httpClient: vi.fn(() => Promise.resolve({})),
+}))
+
+const COVER_ART_SIZE = 600
+
 describe('getCoverArtUrl', () => {
   beforeEach(() => {
     // Mock window.location
@@ -31,11 +37,7 @@ describe('getCoverArtUrl', () => {
       updatedAt: '2023-01-01T00:00:00Z',
     }
 
-    const url = subsonic.getCoverArtUrl(
-      playlistRecord,
-      config.uiCoverArtSize,
-      true,
-    )
+    const url = subsonic.getCoverArtUrl(playlistRecord, COVER_ART_SIZE, true)
 
     expect(url).toContain('pl-playlist-123')
     expect(url).toContain('size=600')
@@ -49,11 +51,7 @@ describe('getCoverArtUrl', () => {
       sync: true,
     }
 
-    const url = subsonic.getCoverArtUrl(
-      playlistRecord,
-      config.uiCoverArtSize,
-      true,
-    )
+    const url = subsonic.getCoverArtUrl(playlistRecord, COVER_ART_SIZE, true)
 
     expect(url).toContain('pl-playlist-123')
     expect(url).toContain('size=600')
@@ -68,11 +66,7 @@ describe('getCoverArtUrl', () => {
       updatedAt: '2023-01-01T00:00:00Z',
     }
 
-    const url = subsonic.getCoverArtUrl(
-      albumRecord,
-      config.uiCoverArtSize,
-      true,
-    )
+    const url = subsonic.getCoverArtUrl(albumRecord, COVER_ART_SIZE, true)
 
     expect(url).toContain('al-album-123')
     expect(url).toContain('size=600')
@@ -86,7 +80,7 @@ describe('getCoverArtUrl', () => {
       updatedAt: '2023-01-01T00:00:00Z',
     }
 
-    const url = subsonic.getCoverArtUrl(songRecord, config.uiCoverArtSize, true)
+    const url = subsonic.getCoverArtUrl(songRecord, COVER_ART_SIZE, true)
 
     expect(url).toContain('mf-song-123')
     expect(url).toContain('size=600')
@@ -99,11 +93,7 @@ describe('getCoverArtUrl', () => {
       updatedAt: '2023-01-01T00:00:00Z',
     }
 
-    const url = subsonic.getCoverArtUrl(
-      artistRecord,
-      config.uiCoverArtSize,
-      true,
-    )
+    const url = subsonic.getCoverArtUrl(artistRecord, COVER_ART_SIZE, true)
 
     expect(url).toContain('ar-artist-123')
     expect(url).toContain('size=600')
@@ -194,3 +184,30 @@ describe('getAvatarUrl', () => {
     expect(url).toContain('username=john')
   })
 })
+
+describe('getLyricsBySongId', () => {
+  beforeEach(() => {
+    vi.clearAllMocks()
+    const localStorageMock = {
+      getItem: vi.fn((key) => {
+        const values = {
+          username: 'testuser',
+          'subsonic-token': 'testtoken',
+          'subsonic-salt': 'testsalt',
+        }
+        return values[key] || null
+      }),
+    }
+    Object.defineProperty(window, 'localStorage', { value: localStorageMock })
+  })
+
+  it('calls the getLyricsBySongId endpoint with enhanced=true', async () => {
+    await subsonic.getLyricsBySongId('song-1')
+
+    expect(httpClient).toHaveBeenCalledTimes(1)
+    const calledUrl = httpClient.mock.calls[0][0]
+    expect(calledUrl).toContain('/rest/getLyricsBySongId?')
+    expect(calledUrl).toContain('id=song-1')
+    expect(calledUrl).toContain('enhanced=true')
+  })
+})