Merge 656a673eed0c6b251fdae0554a223cf0e5f4f221 into 2b9f32699348d520fc96acbd74be24b12702b02a

2026-05-03 06:51:16 +00:00 · 2026-05-01 02:25:15 +03:00 · 2026-05-01 02:25:15 +03:00 · 80bf5d94e9
commit 80bf5d94e9
parent 2b9f326993 656a673eed
41 changed files with 7806 additions and 99 deletions
--- a/README.md
+++ b/README.md
@ -52,6 +52,7 @@ A share of the revenue helps fund the development of Navidrome at no additional
 - **Multi-platform**, runs on macOS, Linux and Windows. **Docker** images are also provided
 - Ready to use binaries for all major platforms, including **Raspberry Pi**
 - Automatically **monitors your library** for changes, importing new files and reloading new metadata 
+ - Supports lyrics from sidecar **.ttml**, **.elrc**, **.lrc**, **.srt**, **.txt** files and embedded tags (via `lyricspriority`)
 - **Themeable**, modern and responsive **Web interface** based on [Material UI](https://material-ui.com)
 - **Compatible** with all Subsonic/Madsonic/Airsonic [clients](https://www.navidrome.org/docs/overview/#apps)
 - **Transcoding** on the fly. Can be set per user/player. **Opus encoding is supported**
--- a/conf/configuration.go
+++ b/conf/configuration.go
@ -763,7 +763,7 @@ func setViperDefaults() {
 	viper.SetDefault("artistartpriority", "artist.*, album/artist.*, external")
 	viper.SetDefault("artistimagefolder", "")
 	viper.SetDefault("discartpriority", "disc*.*, cd*.*, cover.*, folder.*, front.*, discsubtitle, embedded")
-	viper.SetDefault("lyricspriority", ".lrc,.txt,embedded")
+	viper.SetDefault("lyricspriority", ".ttml,.elrc,.lrc,.srt,.txt,embedded")
 	viper.SetDefault("enablegravatar", false)
 	viper.SetDefault("enablefavourites", true)
 	viper.SetDefault("enablestarrating", true)
--- a/core/lyrics/lyrics.go
+++ b/core/lyrics/lyrics.go
@ -14,6 +14,12 @@ type Lyrics interface {
 	GetLyrics(ctx context.Context, mf *model.MediaFile) (model.LyricList, error)
 }

+// BatchLyrics can resolve lyrics across multiple candidate media files while
+// still honoring the configured source priority globally.
+type BatchLyrics interface {
+	// GetLyricsForMediaFiles returns the lyrics found for the given set of
+	// candidate media files.
+	GetLyricsForMediaFiles(ctx context.Context, mediaFiles []model.MediaFile) (model.LyricList, error)
+}
+
 // PluginLoader discovers and loads lyrics provider plugins.
 type PluginLoader interface {
 	LoadLyricsProvider(name string) (Lyrics, bool)
@ -32,28 +38,53 @@ func NewLyrics(pluginLoader PluginLoader) Lyrics {
 // GetLyrics returns lyrics for the given media file, trying sources in the
 // order specified by conf.Server.LyricsPriority.
 func (l *lyricsService) GetLyrics(ctx context.Context, mf *model.MediaFile) (model.LyricList, error) {
-	var lyricsList model.LyricList
-	var err error
+	return l.getLyricsForCandidates(ctx, []*model.MediaFile{mf})
+}

+// GetLyricsForMediaFiles looks up lyrics for a group of media files. It takes
+// the address of each element of mediaFiles (no copies are made) and hands the
+// resulting pointer list to getLyricsForCandidates, which applies the
+// configured source priority over the whole group.
+func (l *lyricsService) GetLyricsForMediaFiles(ctx context.Context, mediaFiles []model.MediaFile) (model.LyricList, error) {
+	refs := make([]*model.MediaFile, len(mediaFiles))
+	for idx := range mediaFiles {
+		refs[idx] = &mediaFiles[idx]
+	}
+	return l.getLyricsForCandidates(ctx, refs)
+}
+
+// getLyricsForCandidates walks the comma-separated conf.Server.LyricsPriority
+// list in order and, for each non-empty source, tries every non-nil candidate
+// media file before moving on to the next source. The first non-empty lyrics
+// list found is returned. An error from a source is logged and the search
+// continues with the next candidate. Returns (nil, nil) when nothing is found.
+func (l *lyricsService) getLyricsForCandidates(ctx context.Context, mediaFiles []*model.MediaFile) (model.LyricList, error) {
 	for pattern := range strings.SplitSeq(conf.Server.LyricsPriority, ",") {
 		pattern = strings.TrimSpace(pattern)
-		switch {
-		case strings.EqualFold(pattern, "embedded"):
-			lyricsList, err = fromEmbedded(ctx, mf)
-		case strings.HasPrefix(pattern, "."):
-			lyricsList, err = fromExternalFile(ctx, mf, strings.ToLower(pattern))
-		default:
-			lyricsList, err = l.fromPlugin(ctx, mf, pattern)
+		if pattern == "" {
+			continue
 		}

-		if err != nil {
-			log.Error(ctx, "error getting lyrics", "source", pattern, err)
-		}
+		// Exhaust every candidate for this source before falling back to the
+		// next, lower-priority source.
+		for _, mf := range mediaFiles {
+			if mf == nil {
+				continue
+			}

-		if len(lyricsList) > 0 {
-			return lyricsList, nil
+			lyricsList, err := l.getLyricsFromSource(ctx, mf, pattern)
+			if err != nil {
+				log.Error(ctx, "error getting lyrics", "source", pattern, err)
+				continue
+			}
+
+			if len(lyricsList) > 0 {
+				return lyricsList, nil
+			}
 		}
 	}

 	return nil, nil
 }
+
+// getLyricsFromSource fetches lyrics for mf from the single source named by
+// pattern: "embedded" (compared case-insensitively) reads via fromEmbedded, a
+// pattern starting with "." is lower-cased and passed to fromExternalFile as a
+// file suffix, and any other value is passed to fromPlugin.
+func (l *lyricsService) getLyricsFromSource(ctx context.Context, mf *model.MediaFile, pattern string) (model.LyricList, error) {
+	if strings.EqualFold(pattern, "embedded") {
+		return fromEmbedded(ctx, mf)
+	}
+	if strings.HasPrefix(pattern, ".") {
+		suffix := strings.ToLower(pattern)
+		return fromExternalFile(ctx, mf, suffix)
+	}
+	return l.fromPlugin(ctx, mf, pattern)
+}
--- a/core/lyrics/lyrics_test.go
+++ b/core/lyrics/lyrics_test.go
@ -45,6 +45,71 @@ var _ = Describe("sources", func() {
 		},
 	}

+	elrcLyrics := model.LyricList{
+		model.Lyrics{
+			DisplayArtist: "ELRC Artist",
+			DisplayTitle:  "ELRC Song",
+			Lang:          "eng",
+			Line: []model.Line{
+				{
+					Start: gg.P(int64(1000)),
+					End:   gg.P(int64(3000)),
+					Value: "Lead words",
+					Cue: []model.Cue{
+						{
+							Start:     gg.P(int64(1000)),
+							End:       gg.P(int64(1500)),
+							Value:     "Lead ",
+							ByteStart: 0,
+							ByteEnd:   4,
+						},
+						{
+							Start:     gg.P(int64(1500)),
+							End:       gg.P(int64(3000)),
+							Value:     "words",
+							ByteStart: 5,
+							ByteEnd:   9,
+						},
+					},
+				},
+				{
+					Start: gg.P(int64(3000)),
+					Value: "Fallback line",
+				},
+			},
+			Synced: true,
+		},
+	}
+
+	ttmlLyrics := model.LyricList{
+		model.Lyrics{
+			Kind: "main",
+			Lang: "eng",
+			Line: []model.Line{
+				{
+					Start: gg.P(int64(18800)),
+					Value: "We're no strangers to love",
+				},
+				{
+					Start: gg.P(int64(22800)),
+					Value: "You know the rules and so do I",
+				},
+			},
+			Synced: true,
+		},
+		model.Lyrics{
+			Kind: "main",
+			Lang: "por",
+			Line: []model.Line{
+				{
+					Start: gg.P(int64(18800)),
+					Value: "Nao somos estranhos ao amor",
+				},
+			},
+			Synced: true,
+		},
+	}
+
 	unsyncedLyrics := model.LyricList{
 		model.Lyrics{
 			Lang: "xxx",
@ -60,6 +125,25 @@ var _ = Describe("sources", func() {
 		},
 	}

+	srtLyrics := model.LyricList{
+		model.Lyrics{
+			Lang: "xxx",
+			Line: []model.Line{
+				{
+					Start: gg.P(int64(18800)),
+					End:   gg.P(int64(22800)),
+					Value: "We're from subtitles",
+				},
+				{
+					Start: gg.P(int64(22801)),
+					End:   gg.P(int64(26000)),
+					Value: "Another subtitle line",
+				},
+			},
+			Synced: true,
+		},
+	}
+
 	BeforeEach(func() {
 		DeferCleanup(configtest.SetupConfig())

@ -81,7 +165,33 @@ var _ = Describe("sources", func() {
 	},
 		Entry("embedded > lrc > txt", "embedded,.lrc,.txt", embeddedLyrics),
 		Entry("lrc > embedded > txt", ".lrc,embedded,.txt", syncedLyrics),
-		Entry("txt > lrc > embedded", ".txt,.lrc,embedded", unsyncedLyrics))
+		Entry("elrc > lrc > embedded", ".elrc,.lrc,embedded", elrcLyrics),
+		Entry("srt > txt > embedded", ".srt,.txt,embedded", srtLyrics),
+		Entry("txt > lrc > embedded", ".txt,.lrc,embedded", unsyncedLyrics),
+		Entry("ttml > elrc > lrc > srt > embedded", ".ttml,.elrc,.lrc,.srt,embedded", ttmlLyrics))
+
+	It("resolves source priority across duplicate media files", func() {
+		conf.Server.LyricsPriority = ".ttml,embedded"
+		embeddedJSON, err := json.Marshal(embeddedLyrics)
+		Expect(err).To(BeNil())
+
+		svc := lyrics.NewLyrics(nil)
+		batchSvc, ok := svc.(lyrics.BatchLyrics)
+		Expect(ok).To(BeTrue())
+
+		list, err := batchSvc.GetLyricsForMediaFiles(ctx, []model.MediaFile{
+			{
+				Lyrics: string(embeddedJSON),
+				Path:   "tests/fixtures/01 Invisible (RED) Edit Version.mp3",
+			},
+			{
+				Lyrics: "[]",
+				Path:   "tests/fixtures/test.mp3",
+			},
+		})
+		Expect(err).To(BeNil())
+		Expect(list).To(Equal(ttmlLyrics))
+	})

 	Context("Errors", func() {
 		var RegularUserContext = XContext
--- a/core/lyrics/sources.go
+++ b/core/lyrics/sources.go
@ -5,6 +5,7 @@ import (
 	"errors"
 	"os"
 	"path"
+	"strings"

 	"github.com/navidrome/navidrome/log"
 	"github.com/navidrome/navidrome/model"
@ -36,18 +37,38 @@ func fromExternalFile(ctx context.Context, mf *model.MediaFile, suffix string) (
 		return nil, err
 	}

-	lyrics, err := model.ToLyrics("xxx", string(contents))
-	if err != nil {
-		log.Error(ctx, "error parsing lyric external file", "path", externalLyric, err)
-		return nil, err
-	} else if lyrics == nil {
+	var list model.LyricList
+	switch {
+	case strings.EqualFold(suffix, ".ttml"):
+		list, err = parseTTML(contents)
+		if err != nil {
+			log.Error(ctx, "error parsing ttml external file", "path", externalLyric, err)
+			return nil, err
+		}
+	case strings.EqualFold(suffix, ".srt"):
+		list, err = parseSRT(contents)
+		if err != nil {
+			log.Error(ctx, "error parsing srt external file", "path", externalLyric, err)
+			return nil, err
+		}
+	default:
+		lyrics, err := model.ToLyrics("xxx", string(contents))
+		if err != nil {
+			log.Error(ctx, "error parsing lyric external file", "path", externalLyric, err)
+			return nil, err
+		}
+		if lyrics != nil {
+			list = model.LyricList{*lyrics}
+		}
+	}
+
+	if len(list) == 0 {
 		log.Trace(ctx, "empty lyrics from external file", "path", externalLyric)
 		return nil, nil
 	}

 	log.Trace(ctx, "retrieved lyrics from external file", "path", externalLyric)
-
-	return model.LyricList{*lyrics}, nil
+	return list, nil
 }

 // fromPlugin attempts to load lyrics from a plugin with the given name.
--- a/core/lyrics/sources_test.go
+++ b/core/lyrics/sources_test.go
@ -88,6 +88,89 @@ var _ = Describe("sources", func() {
 			}))
 		})

+		It("should return Enhanced LRC lyrics with word-level cues from a file", func() {
+			mf := model.MediaFile{Path: "tests/fixtures/test-enhanced.mp3"}
+			lyrics, err := fromExternalFile(ctx, &mf, ".lrc")
+
+			Expect(err).To(BeNil())
+			Expect(lyrics).To(HaveLen(1))
+			Expect(lyrics[0].DisplayArtist).To(Equal("Test Artist"))
+			Expect(lyrics[0].DisplayTitle).To(Equal("Enhanced Test"))
+			Expect(lyrics[0].Lang).To(Equal("eng"))
+			Expect(lyrics[0].Synced).To(BeTrue())
+			Expect(lyrics[0].Line).To(HaveLen(3))
+
+			// Line 1: has inline markers → Cue array populated
+			Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(1000))))
+			Expect(lyrics[0].Line[0].End).To(Equal(gg.P(int64(3000))))
+			Expect(lyrics[0].Line[0].Value).To(Equal("Some lyrics here"))
+			Expect(lyrics[0].Line[0].Cue).To(HaveLen(3))
+			Expect(*lyrics[0].Line[0].Cue[0].Start).To(Equal(int64(1000)))
+			Expect(lyrics[0].Line[0].Cue[0].Value).To(Equal("Some "))
+			Expect(lyrics[0].Line[0].Cue[0].End).To(Equal(gg.P(int64(1500))))
+			Expect(lyrics[0].Line[0].Cue[0].ByteStart).To(Equal(0))
+			Expect(lyrics[0].Line[0].Cue[0].ByteEnd).To(Equal(4))
+			Expect(*lyrics[0].Line[0].Cue[1].Start).To(Equal(int64(1500)))
+			Expect(lyrics[0].Line[0].Cue[1].Value).To(Equal("lyrics "))
+			Expect(lyrics[0].Line[0].Cue[1].End).To(Equal(gg.P(int64(2000))))
+			Expect(lyrics[0].Line[0].Cue[1].ByteStart).To(Equal(5))
+			Expect(lyrics[0].Line[0].Cue[1].ByteEnd).To(Equal(11))
+			Expect(*lyrics[0].Line[0].Cue[2].Start).To(Equal(int64(2000)))
+			Expect(lyrics[0].Line[0].Cue[2].Value).To(Equal("here"))
+			Expect(lyrics[0].Line[0].Cue[2].End).To(Equal(gg.P(int64(3000))))
+			Expect(lyrics[0].Line[0].Cue[2].ByteStart).To(Equal(12))
+			Expect(lyrics[0].Line[0].Cue[2].ByteEnd).To(Equal(15))
+
+			// Line 2: has inline markers
+			Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(3000))))
+			Expect(lyrics[0].Line[1].End).To(Equal(gg.P(int64(5000))))
+			Expect(lyrics[0].Line[1].Value).To(Equal("More words"))
+			Expect(lyrics[0].Line[1].Cue).To(HaveLen(2))
+			Expect(lyrics[0].Line[1].Cue[0].End).To(Equal(gg.P(int64(3500))))
+			Expect(lyrics[0].Line[1].Cue[1].End).To(Equal(gg.P(int64(5000))))
+			Expect(lyrics[0].Line[1].Cue[0].ByteStart).To(Equal(0))
+			Expect(lyrics[0].Line[1].Cue[0].ByteEnd).To(Equal(4))
+			Expect(lyrics[0].Line[1].Cue[1].ByteStart).To(Equal(5))
+			Expect(lyrics[0].Line[1].Cue[1].ByteEnd).To(Equal(9))
+
+			// Line 3: plain line, no cues
+			Expect(lyrics[0].Line[2].Start).To(Equal(gg.P(int64(5000))))
+			Expect(lyrics[0].Line[2].Value).To(Equal("Plain line without inline markers"))
+			Expect(lyrics[0].Line[2].Cue).To(BeNil())
+		})
+
+		It("should return Enhanced LRC lyrics from an ELRC file", func() {
+			mf := model.MediaFile{Path: "tests/fixtures/test.mp3"}
+			lyrics, err := fromExternalFile(ctx, &mf, ".elrc")
+
+			Expect(err).To(BeNil())
+			Expect(lyrics).To(HaveLen(1))
+			Expect(lyrics[0].DisplayArtist).To(Equal("ELRC Artist"))
+			Expect(lyrics[0].DisplayTitle).To(Equal("ELRC Song"))
+			Expect(lyrics[0].Lang).To(Equal("eng"))
+			Expect(lyrics[0].Synced).To(BeTrue())
+			Expect(lyrics[0].Line).To(HaveLen(2))
+
+			Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(1000))))
+			Expect(lyrics[0].Line[0].End).To(Equal(gg.P(int64(3000))))
+			Expect(lyrics[0].Line[0].Value).To(Equal("Lead words"))
+			Expect(lyrics[0].Line[0].Cue).To(HaveLen(2))
+			Expect(*lyrics[0].Line[0].Cue[0].Start).To(Equal(int64(1000)))
+			Expect(lyrics[0].Line[0].Cue[0].Value).To(Equal("Lead "))
+			Expect(lyrics[0].Line[0].Cue[0].End).To(Equal(gg.P(int64(1500))))
+			Expect(lyrics[0].Line[0].Cue[0].ByteStart).To(Equal(0))
+			Expect(lyrics[0].Line[0].Cue[0].ByteEnd).To(Equal(4))
+			Expect(*lyrics[0].Line[0].Cue[1].Start).To(Equal(int64(1500)))
+			Expect(lyrics[0].Line[0].Cue[1].Value).To(Equal("words"))
+			Expect(lyrics[0].Line[0].Cue[1].End).To(Equal(gg.P(int64(3000))))
+			Expect(lyrics[0].Line[0].Cue[1].ByteStart).To(Equal(5))
+			Expect(lyrics[0].Line[0].Cue[1].ByteEnd).To(Equal(9))
+
+			Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(3000))))
+			Expect(lyrics[0].Line[1].Value).To(Equal("Fallback line"))
+			Expect(lyrics[0].Line[1].Cue).To(BeNil())
+		})
+
 		It("should return unsynchronized lyrics from a file", func() {
 			mf := model.MediaFile{Path: "tests/fixtures/test.mp3"}
 			lyrics, err := fromExternalFile(ctx, &mf, ".txt")
@ -109,6 +192,66 @@ var _ = Describe("sources", func() {
 			}))
 		})

+		It("should return synchronized lyrics from an SRT file", func() {
+			mf := model.MediaFile{Path: "tests/fixtures/test.mp3"}
+			lyrics, err := fromExternalFile(ctx, &mf, ".srt")
+
+			Expect(err).To(BeNil())
+			Expect(lyrics).To(Equal(model.LyricList{
+				model.Lyrics{
+					Lang: "xxx",
+					Line: []model.Line{
+						{
+							Start: gg.P(int64(18800)),
+							End:   gg.P(int64(22800)),
+							Value: "We're from subtitles",
+						},
+						{
+							Start: gg.P(int64(22801)),
+							End:   gg.P(int64(26000)),
+							Value: "Another subtitle line",
+						},
+					},
+					Synced: true,
+				},
+			}))
+		})
+
+		It("should return synchronized multilingual lyrics from a TTML file", func() {
+			mf := model.MediaFile{Path: "tests/fixtures/test.mp3"}
+			lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
+
+			Expect(err).To(BeNil())
+			Expect(lyrics).To(Equal(model.LyricList{
+				{
+					Kind: "main",
+					Lang: "eng",
+					Line: []model.Line{
+						{
+							Start: gg.P(int64(18800)),
+							Value: "We're no strangers to love",
+						},
+						{
+							Start: gg.P(int64(22800)),
+							Value: "You know the rules and so do I",
+						},
+					},
+					Synced: true,
+				},
+				{
+					Kind: "main",
+					Lang: "por",
+					Line: []model.Line{
+						{
+							Start: gg.P(int64(18800)),
+							Value: "Nao somos estranhos ao amor",
+						},
+					},
+					Synced: true,
+				},
+			}))
+		})
+
 		It("should handle LRC files with UTF-8 BOM marker (issue #4631)", func() {
 			// The function looks for <basePath-without-ext><suffix>, so we need to pass
 			// a MediaFile with .mp3 path and look for .lrc suffix
@ -142,5 +285,33 @@ var _ = Describe("sources", func() {
 			Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(22801))))
 			Expect(lyrics[0].Line[1].Value).To(Equal("You know the rules and so do I"))
 		})
+
+		It("should handle TTML files with UTF-8 BOM marker", func() {
+			mf := model.MediaFile{Path: "tests/fixtures/bom-test.mp3"}
+			lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
+
+			Expect(err).To(BeNil())
+			Expect(lyrics).To(HaveLen(1))
+			Expect(lyrics[0].Kind).To(Equal("main"))
+			Expect(lyrics[0].Synced).To(BeTrue())
+			Expect(lyrics[0].Line).To(HaveLen(1))
+			Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(0))))
+			Expect(lyrics[0].Line[0].Value).To(Equal("BOM test line"))
+		})
+
+		It("should handle UTF-16 BE encoded TTML files", func() {
+			mf := model.MediaFile{Path: "tests/fixtures/bom-utf16-test.mp3"}
+			lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
+
+			Expect(err).To(BeNil())
+			Expect(lyrics).To(HaveLen(1))
+			Expect(lyrics[0].Kind).To(Equal("main"))
+			Expect(lyrics[0].Synced).To(BeTrue())
+			Expect(lyrics[0].Line).To(HaveLen(2))
+			Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(18800))))
+			Expect(lyrics[0].Line[0].Value).To(Equal("UTF16 line one"))
+			Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(22801))))
+			Expect(lyrics[0].Line[1].Value).To(Equal("UTF16 line two"))
+		})
 	})
 })
--- a/core/lyrics/srt.go
+++ b/core/lyrics/srt.go
@ -0,0 +1,161 @@
+package lyrics
+
+import (
+	"bytes"
+	"regexp"
+	"strconv"
+	"strings"
+
+	"github.com/navidrome/navidrome/model"
+	"github.com/navidrome/navidrome/utils/str"
+)
+
+var srtTimeRegex = regexp.MustCompile(`^\s*(\d{1,2}):(\d{2}):(\d{2})[,.](\d{1,3})\s*$`)
+
+// parseSRT converts the contents of a SubRip (.srt) file into a LyricList
+// holding a single synced Lyrics entry.
+//
+// A leading UTF-8 byte order mark is removed and CRLF / bare CR line endings
+// are normalized to LF before the text is split into cue blocks. Blocks that
+// parseSRTBlock reports as unusable are skipped; an error from parseSRTBlock
+// aborts the whole parse and is returned as-is. When no usable block remains
+// the result is (nil, nil).
+func parseSRT(contents []byte) (model.LyricList, error) {
+	// strings.TrimSpace does not treat U+FEFF as whitespace, so a BOM left in
+	// place would stay glued to the first block's index or timing line and the
+	// first cue would be dropped or rejected.
+	raw := strings.TrimPrefix(string(contents), "\ufeff")
+	raw = strings.ReplaceAll(raw, "\r\n", "\n")
+	raw = strings.ReplaceAll(raw, "\r", "\n")
+
+	blocks := splitSRTBlocks(raw)
+	lines := make([]model.Line, 0, len(blocks))
+
+	for _, block := range blocks {
+		line, ok, err := parseSRTBlock(block)
+		if err != nil {
+			return nil, err
+		}
+		if ok {
+			lines = append(lines, line)
+		}
+	}
+
+	if len(lines) == 0 {
+		return nil, nil
+	}
+
+	// Lang is fixed to "xxx", the same value fromExternalFile passes to
+	// model.ToLyrics for other sidecar formats.
+	lyrics := model.NormalizeLyrics(model.Lyrics{
+		Lang:   "xxx",
+		Line:   lines,
+		Synced: true,
+	})
+	return model.LyricList{lyrics}, nil
+}
+
+func splitSRTBlocks(raw string) []string {
+	raw = strings.TrimSpace(raw)
+	if raw == "" {
+		return nil
+	}
+
+	parts := strings.Split(raw, "\n\n")
+	blocks := make([]string, 0, len(parts))
+	for _, part := range parts {
+		part = strings.TrimSpace(part)
+		if part != "" {
+			blocks = append(blocks, part)
+		}
+	}
+	return blocks
+}
+
+func parseSRTBlock(block string) (model.Line, bool, error) {
+	scanner := bytes.Split([]byte(block), []byte("\n"))
+	if len(scanner) == 0 {
+		return model.Line{}, false, nil
+	}
+
+	lines := make([]string, 0, len(scanner))
+	for _, line := range scanner {
+		lines = append(lines, strings.TrimSpace(string(line)))
+	}
+
+	if len(lines) == 0 {
+		return model.Line{}, false, nil
+	}
+
+	startIdx := 0
+	if digitsOnly(lines[0]) {
+		startIdx = 1
+	}
+	if startIdx >= len(lines) {
+		return model.Line{}, false, nil
+	}
+
+	timing := strings.Split(lines[startIdx], "-->")
+	if len(timing) != 2 {
+		return model.Line{}, false, nil
+	}
+
+	startMs, err := parseSRTTime(timing[0])
+	if err != nil {
+		return model.Line{}, false, err
+	}
+	endMs, err := parseSRTTime(timing[1])
+	if err != nil {
+		return model.Line{}, false, err
+	}
+
+	textLines := make([]string, 0, len(lines)-startIdx-1)
+	for _, line := range lines[startIdx+1:] {
+		if line == "" {
+			continue
+		}
+		textLines = append(textLines, line)
+	}
+
+	value := str.SanitizeText(strings.Join(textLines, "\n"))
+	if value == "" {
+		return model.Line{}, false, nil
+	}
+
+	return model.Line{
+		Start: &startMs,
+		End:   &endMs,
+		Value: value,
+	}, true, nil
+}
+
+func parseSRTTime(value string) (int64, error) {
+	match := srtTimeRegex.FindStringSubmatch(strings.TrimSpace(value))
+	if match == nil {
+		return 0, strconv.ErrSyntax
+	}
+
+	hours, err := strconv.ParseInt(match[1], 10, 64)
+	if err != nil {
+		return 0, err
+	}
+	minutes, err := strconv.ParseInt(match[2], 10, 64)
+	if err != nil {
+		return 0, err
+	}
+	seconds, err := strconv.ParseInt(match[3], 10, 64)
+	if err != nil {
+		return 0, err
+	}
+	millis, err := strconv.ParseInt(match[4], 10, 64)
+	if err != nil {
+		return 0, err
+	}
+
+	switch len(match[4]) {
+	case 1:
+		millis *= 100
+	case 2:
+		millis *= 10
+	}
+
+	return (((hours*60)+minutes)*60+seconds)*1000 + millis, nil
+}
+
+// digitsOnly reports whether value is non-empty and made up solely of the
+// ASCII digits '0' through '9'.
+func digitsOnly(value string) bool {
+	if len(value) == 0 {
+		return false
+	}
+	notDigit := func(r rune) bool { return r < '0' || r > '9' }
+	return strings.IndexFunc(value, notDigit) == -1
+}
--- a/core/lyrics/ttml.go
+++ b/core/lyrics/ttml.go
--- a/core/lyrics/ttml_test.go
+++ b/core/lyrics/ttml_test.go
@ -0,0 +1,407 @@
+package lyrics
+
+import (
+	"github.com/navidrome/navidrome/model"
+	"github.com/navidrome/navidrome/utils/gg"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("parseTTML", func() {
+	Describe("Multi-language and timing", func() {
+		It("should parse multiple language divs with inherited offsets and frame/tick timing", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttp="http://www.w3.org/ns/ttml#parameter" ttp:frameRate="30" ttp:subFrameRate="2" ttp:tickRate="10">
+  <body>
+    <div xml:lang="eng" begin="1s">
+      <p begin="2s">Line one</p>
+      <p begin="00:00:04:15.1"><span>Line two</span><br/>with break</p>
+    </div>
+    <div xml:lang="por">
+      <p begin="45t">Linha</p>
+    </div>
+  </body>
+</tt>`)
+
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(2))
+
+			By("parsing the English track")
+			eng := list[0]
+			Expect(eng.Lang).To(Equal("eng"))
+			Expect(eng.Synced).To(BeTrue())
+			Expect(eng.Line[0].Start).To(Equal(gg.P(int64(3000))))
+			Expect(eng.Line[0].Value).To(Equal("Line one"))
+			Expect(eng.Line[1].Start).To(Equal(gg.P(int64(4517))))
+			Expect(eng.Line[1].Value).To(Equal("Line two\nwith break"))
+
+			By("parsing the Portuguese track")
+			por := list[1]
+			Expect(por.Lang).To(Equal("por"))
+			Expect(por.Line[0].Start).To(Equal(gg.P(int64(4500))))
+			Expect(por.Line[0].Value).To(Equal("Linha"))
+		})
+	})
+
+	Describe("Unsupported cue handling", func() {
+		It("should skip wallclock cues and keep valid ones", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml">
+  <body xml:lang="eng">
+    <div>
+      <p begin="wallclock(2026-01-01T00:00:00Z)">Skip me</p>
+      <p begin="1s">Keep me</p>
+    </div>
+  </body>
+</tt>`)
+
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(1))
+			Expect(list[0].Line).To(HaveLen(1))
+			Expect(list[0].Line[0].Start).To(Equal(gg.P(int64(1000))))
+			Expect(list[0].Line[0].Value).To(Equal("Keep me"))
+		})
+	})
+
+	Describe("Begin/End/Dur with inheritance", func() {
+		It("should correctly accumulate nested timing from body, div, and p elements", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml">
+  <body xml:lang="eng" begin="10s">
+    <div begin="5s" dur="8s">
+      <p begin="1s" dur="2s">First line</p>
+      <p begin="3s" end="5s">Second line</p>
+    </div>
+  </body>
+</tt>`)
+
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(1))
+			Expect(list[0].Lang).To(Equal("eng"))
+			Expect(list[0].Line).To(HaveLen(2))
+			Expect(list[0].Line[0].Start).To(Equal(gg.P(int64(16000))))
+			Expect(list[0].Line[0].Value).To(Equal("First line"))
+			Expect(list[0].Line[1].Start).To(Equal(gg.P(int64(18000))))
+			Expect(list[0].Line[1].Value).To(Equal("Second line"))
+		})
+	})
+
+	Describe("Non-standard bare second offsets", func() {
+		It("should parse bare decimal numbers as seconds", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml">
+  <body xml:lang="eng" begin="10">
+    <div>
+      <p begin="0.170">First line</p>
+      <p begin="3.710">Second line</p>
+    </div>
+  </body>
+</tt>`)
+
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(1))
+			Expect(list[0].Line).To(HaveLen(2))
+			Expect(list[0].Line[0].Start).To(Equal(gg.P(int64(10170))))
+			Expect(list[0].Line[0].Value).To(Equal("First line"))
+			Expect(list[0].Line[1].Start).To(Equal(gg.P(int64(13710))))
+			Expect(list[0].Line[1].Value).To(Equal("Second line"))
+		})
+	})
+
+	Describe("Word timing tokens", func() {
+		It("should extract timed tokens from spans including background role", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttm="http://www.w3.org/ns/ttml#metadata">
+  <body xml:lang="eng">
+    <div>
+      <p begin="00:01.000" end="00:03.000">
+        <span begin="00:01.000" end="00:01.400">He</span><span begin="00:01.400" end="00:01.800">llo</span>
+        <span ttm:role="x-bg"><span begin="00:02.000" end="00:02.500">echo</span></span>
+      </p>
+    </div>
+  </body>
+</tt>`)
+
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(1))
+			Expect(list[0].Agents).To(Equal([]model.Agent{
+				{ID: "main", Role: "main"},
+				{ID: "__nd_bg__|main", Role: "bg"},
+			}))
+			Expect(list[0].Line).To(HaveLen(1))
+
+			line := list[0].Line[0]
+			Expect(line.Start).To(Equal(gg.P(int64(1000))))
+			Expect(line.Value).To(Equal("Hello\necho"))
+			Expect(line.End).To(Equal(gg.P(int64(3000))))
+			Expect(line.Cue).To(HaveLen(3))
+
+			Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(1000)), End: gg.P(int64(1400)), Value: "He", ByteStart: 0, ByteEnd: 1, AgentID: "main"}))
+			Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(1400)), End: gg.P(int64(1800)), Value: "llo", ByteStart: 2, ByteEnd: 4, AgentID: "main"}))
+			Expect(line.Cue[2]).To(Equal(model.Cue{Start: gg.P(int64(2000)), End: gg.P(int64(2500)), Value: "echo", ByteStart: 6, ByteEnd: 9, AgentID: "__nd_bg__|main"}))
+		})
+
+		It("should parse named TTML agents into main, voice, and group roles", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttm="http://www.w3.org/ns/ttml#metadata">
+  <head>
+    <metadata>
+      <ttm:agent xml:id="v1" type="person"><ttm:name>Chris Martin</ttm:name></ttm:agent>
+      <ttm:agent xml:id="v2" type="person"><ttm:name>Jin</ttm:name></ttm:agent>
+      <ttm:agent xml:id="v1000" type="group"><ttm:name>All</ttm:name></ttm:agent>
+    </metadata>
+  </head>
+  <body xml:lang="eng">
+    <div>
+      <p begin="1s" end="2s" ttm:agent="v1"><span begin="1s" end="1.5s">You</span></p>
+      <p begin="2s" end="3s" ttm:agent="v2"><span begin="2s" end="2.5s">and</span></p>
+      <p begin="3s" end="4s" ttm:agent="v1000"><span begin="3s" end="3.5s">All</span></p>
+    </div>
+  </body>
+</tt>`)
+
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(1))
+			Expect(list[0].Agents).To(Equal([]model.Agent{
+				{ID: "v1", Role: "main", Name: "Chris Martin"},
+				{ID: "v2", Role: "voice", Name: "Jin"},
+				{ID: "v1000", Role: "group", Name: "All"},
+			}))
+			Expect(list[0].Line[0].Cue[0].AgentID).To(Equal("v1"))
+			Expect(list[0].Line[1].Cue[0].AgentID).To(Equal("v2"))
+			Expect(list[0].Line[2].Cue[0].AgentID).To(Equal("v1000"))
+		})
+
+		It("should avoid collisions between derived background agents and explicit TTML agent ids", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttm="http://www.w3.org/ns/ttml#metadata">
+  <head>
+    <metadata>
+      <ttm:agent xml:id="lead" type="person"><ttm:name>Lead</ttm:name></ttm:agent>
+      <ttm:agent xml:id="lead__bg" type="person"><ttm:name>Existing Background Id</ttm:name></ttm:agent>
+    </metadata>
+  </head>
+  <body xml:lang="eng">
+    <div>
+      <p begin="1s" end="2s" ttm:agent="lead">
+        <span begin="1s" end="1.4s">Lead</span>
+        <span ttm:role="x-bg"><span begin="1.5s" end="1.8s">Echo</span></span>
+      </p>
+      <p begin="2s" end="3s" ttm:agent="lead__bg">
+        <span begin="2s" end="2.5s">Named</span>
+      </p>
+    </div>
+  </body>
+</tt>`)
+
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(1))
+			Expect(list[0].Agents).To(Equal([]model.Agent{
+				{ID: "lead", Role: "main", Name: "Lead"},
+				{ID: "__nd_bg__|lead", Role: "bg", Name: "Lead"},
+				{ID: "lead__bg", Role: "voice", Name: "Existing Background Id"},
+			}))
+			Expect(list[0].Line).To(HaveLen(2))
+			Expect(list[0].Line[0].Cue).To(HaveLen(2))
+			Expect(list[0].Line[0].Cue[0].AgentID).To(Equal("lead"))
+			Expect(list[0].Line[0].Cue[1].AgentID).To(Equal("__nd_bg__|lead"))
+			Expect(list[0].Line[1].Cue).To(HaveLen(1))
+			Expect(list[0].Line[1].Cue[0].AgentID).To(Equal("lead__bg"))
+		})
+
+		It("should fill missing cue agent ids with the resolved main agent", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttm="http://www.w3.org/ns/ttml#metadata">
+  <head>
+    <metadata>
+      <ttm:agent xml:id="guest" type="person"><ttm:name>Guest Vocal</ttm:name></ttm:agent>
+    </metadata>
+  </head>
+  <body xml:lang="eng">
+    <div>
+      <p begin="1s" end="3s">
+        <span begin="1s" end="1.4s">Lead</span>
+        <span begin="2s" end="2.4s" ttm:agent="guest">Guest</span>
+      </p>
+    </div>
+  </body>
+</tt>`)
+
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(1))
+			Expect(list[0].Agents).To(Equal([]model.Agent{
+				{ID: "guest", Role: "main", Name: "Guest Vocal"},
+			}))
+			Expect(list[0].Line).To(HaveLen(1))
+			Expect(list[0].Line[0].Cue).To(HaveLen(2))
+			Expect(list[0].Line[0].Cue[0].AgentID).To(Equal("guest"))
+			Expect(list[0].Line[0].Cue[1].AgentID).To(Equal("guest"))
+		})
+	})
+
+	Describe("Ambiguous decimal timing", func() {
+		It("should prefer absolute timing when values fall inside parent window", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml">
+  <body xml:lang="eng">
+    <div begin="37.870" end="45.570">
+      <p begin="43.444" end="45.570">
+        <span begin="43.444" end="43.716">go</span>
+        <span begin="43.716" end="43.887">go</span>
+      </p>
+    </div>
+  </body>
+</tt>`)
+
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(1))
+			Expect(list[0].Line).To(HaveLen(1))
+
+			line := list[0].Line[0]
+			Expect(line.Start).To(Equal(gg.P(int64(43444))))
+			Expect(line.Value).To(Equal("go\ngo"))
+			Expect(line.End).To(Equal(gg.P(int64(45570))))
+			Expect(line.Cue).To(HaveLen(2))
+			Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(43444)), End: gg.P(int64(43716)), Value: "go", ByteStart: 0, ByteEnd: 1}))
+			Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(43716)), End: gg.P(int64(43887)), Value: "go", ByteStart: 3, ByteEnd: 4}))
+		})
+	})
+
+	Describe("Unsynced fallback", func() {
+		It("should return unsynced lyrics when no timing is present", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml">
+  <body>
+    <div>
+      <p>No timing here</p>
+    </div>
+  </body>
+</tt>`)
+
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(1))
+			Expect(list[0].Lang).To(Equal("xxx"))
+			Expect(list[0].Synced).To(BeFalse())
+			Expect(list[0].Line).To(HaveLen(1))
+			Expect(list[0].Line[0].Start).To(BeNil())
+			Expect(list[0].Line[0].Value).To(Equal("No timing here"))
+		})
+	})
+
+	// iTunesMetadata entries are linked to body lines through the `for` attribute,
+	// which refers to a <p> element's itunes:key. Expected behavior pinned here:
+	//   - the body becomes the "main" track, followed by one "translation" and one
+	//     "pronunciation" track;
+	//   - a <text> whose key matches no body line ("MISSING") is dropped;
+	//   - a translation without its own timing takes Start/End from the keyed line;
+	//   - a transliteration with timed spans takes its End from the last span
+	//     (2600), not from the keyed <p> (2700);
+	//   - the language tag "ja-Latn" is reported lower-cased.
+	Describe("Metadata tracks", func() {
+		It("should produce main, translation, and pronunciation tracks from iTunesMetadata", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:itunes="http://music.apple.com/lyric-ttml-internal">
+  <head>
+    <metadata>
+      <iTunesMetadata xmlns="http://music.apple.com/lyric-ttml-internal">
+        <translations>
+          <translation xml:lang="es">
+            <text for="L1">Hola</text>
+            <text for="MISSING">Skip me</text>
+          </translation>
+        </translations>
+        <transliterations>
+          <transliteration xml:lang="ja-Latn">
+            <text for="L2"><span begin="00:02.000" end="00:02.300" xmlns="http://www.w3.org/ns/ttml">ko</span><span begin="00:02.300" end="00:02.600" xmlns="http://www.w3.org/ns/ttml">nni</span></text>
+          </transliteration>
+        </transliterations>
+      </iTunesMetadata>
+    </metadata>
+  </head>
+  <body xml:lang="ja">
+    <div>
+      <p begin="00:01.000" end="00:01.500" itunes:key="L1">こんにちは</p>
+      <p begin="00:02.000" end="00:02.700" itunes:key="L2">こんばんは</p>
+    </div>
+  </body>
+</tt>`)
+
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(list).To(HaveLen(3))
+
+			By("checking the main track")
+			main := list[0]
+			Expect(main.Kind).To(Equal("main"))
+			Expect(main.Lang).To(Equal("ja"))
+			Expect(main.Line).To(HaveLen(2))
+
+			By("checking the translation track")
+			translation := list[1]
+			Expect(translation.Kind).To(Equal("translation"))
+			Expect(translation.Lang).To(Equal("es"))
+			Expect(translation.Line).To(HaveLen(1))
+			Expect(translation.Line[0].Start).To(Equal(gg.P(int64(1000))))
+			Expect(translation.Line[0].Value).To(Equal("Hola"))
+			Expect(translation.Line[0].End).To(Equal(gg.P(int64(1500))))
+
+			By("checking the pronunciation track")
+			pronunciation := list[2]
+			Expect(pronunciation.Kind).To(Equal("pronunciation"))
+			Expect(pronunciation.Lang).To(Equal("ja-latn"))
+			Expect(pronunciation.Line).To(HaveLen(1))
+			Expect(pronunciation.Line[0].Start).To(Equal(gg.P(int64(2000))))
+			Expect(pronunciation.Line[0].Value).To(Equal("konni"))
+			Expect(pronunciation.Line[0].End).To(Equal(gg.P(int64(2600))))
+			Expect(pronunciation.Line[0].Cue).To(HaveLen(2))
+			Expect(pronunciation.Line[0].Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(2000)), End: gg.P(int64(2300)), Value: "ko", ByteStart: 0, ByteEnd: 1}))
+			Expect(pronunciation.Line[0].Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(2300)), End: gg.P(int64(2600)), Value: "nni", ByteStart: 2, ByteEnd: 4}))
+		})
+	})
+
+	// Transliteration spans may use bare decimal seconds ("2.747") instead of a
+	// clock value. The whitespace between the spans is expected to survive in the
+	// line Value ("I woke up"), and each cue's inclusive byte range indexes into
+	// that value.
+	Describe("Pronunciation with bare decimal end times", func() {
+		It("should correctly parse bare decimal times in transliteration spans", func() {
+			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:itunes="http://music.apple.com/lyric-ttml-internal">
+  <head>
+    <metadata>
+      <iTunesMetadata xmlns="http://music.apple.com/lyric-ttml-internal">
+        <transliterations>
+          <transliteration xml:lang="ja-Latn">
+            <text for="L1"><span begin="2.747" end="3.018" xmlns="http://www.w3.org/ns/ttml">I</span> <span begin="3.018" end="3.179" xmlns="http://www.w3.org/ns/ttml">woke</span> <span begin="3.179" end="3.582" xmlns="http://www.w3.org/ns/ttml">up</span></text>
+          </transliteration>
+        </transliterations>
+      </iTunesMetadata>
+    </metadata>
+  </head>
+  <body xml:lang="ja">
+    <div>
+      <p begin="00:02.747" end="00:04.000" itunes:key="L1">起きた</p>
+    </div>
+  </body>
+</tt>`)
+
+			list, err := parseTTML(content)
+			Expect(err).ToNot(HaveOccurred())
+
+			// Locate the pronunciation track by kind rather than by position.
+			var pronunciation *model.Lyrics
+			for i := range list {
+				if list[i].Kind == "pronunciation" {
+					pronunciation = &list[i]
+					break
+				}
+			}
+			Expect(pronunciation).ToNot(BeNil())
+			Expect(pronunciation.Line).To(HaveLen(1))
+
+			line := pronunciation.Line[0]
+			Expect(line.Start).To(Equal(gg.P(int64(2747))))
+			Expect(line.Value).To(Equal("I woke up"))
+			Expect(line.Cue).To(HaveLen(3))
+			Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(2747)), End: gg.P(int64(3018)), Value: "I", ByteStart: 0, ByteEnd: 0}))
+			Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(3018)), End: gg.P(int64(3179)), Value: "woke", ByteStart: 2, ByteEnd: 5}))
+			Expect(line.Cue[2]).To(Equal(model.Cue{Start: gg.P(int64(3179)), End: gg.P(int64(3582)), Value: "up", ByteStart: 7, ByteEnd: 8}))
+		})
+	})
+})
--- a/model/lyrics.go
+++ b/model/lyrics.go
@ -6,23 +6,43 @@ import (
 	"slices"
 	"strconv"
 	"strings"
+	"unicode"

 	"github.com/navidrome/navidrome/log"
 	"github.com/navidrome/navidrome/utils/str"
 )

+// Cue is a timed fragment (typically a word or syllable) of a lyric Line.
+// Start and End are in milliseconds; End may be nil when it is unknown.
+// ByteStart and ByteEnd are inclusive byte offsets of the fragment within the
+// owning Line's Value. AgentID, when set, refers to an Agent.ID of the
+// enclosing Lyrics.
+type Cue struct {
+	Start     *int64 `structs:"start,omitempty"   json:"start,omitempty"`
+	End       *int64 `structs:"end,omitempty"     json:"end,omitempty"`
+	Value     string `structs:"value"             json:"value"`
+	ByteStart int    `structs:"byteStart"         json:"byteStart"`
+	ByteEnd   int    `structs:"byteEnd"           json:"byteEnd"`
+	AgentID   string `structs:"agentId,omitempty" json:"agentId,omitempty"`
+}
+
+// Agent describes a performer that cues can be attributed to via Cue.AgentID.
+// Role is a free-form string; the Subsonic response builder orders agents
+// whose Role is "main" first.
+type Agent struct {
+	ID   string `structs:"id"             json:"id"`
+	Role string `structs:"role"           json:"role"`
+	Name string `structs:"name,omitempty" json:"name,omitempty"`
+}
+
+// Line is a single lyric line. Start and End are in milliseconds and are nil
+// when unknown (unsynced lyrics have no Start). Cue optionally carries
+// word-level timing for fragments of Value.
 type Line struct {
 	Start *int64 `structs:"start,omitempty" json:"start,omitempty"`
+	End   *int64 `structs:"end,omitempty"   json:"end,omitempty"`
 	Value string `structs:"value"           json:"value"`
+	Cue   []Cue  `structs:"cue,omitempty"   json:"cue,omitempty"`
 }

+// Lyrics is one lyric track for a media file. Kind distinguishes tracks of the
+// same song (values seen in this change: "main", "translation",
+// "pronunciation"; an empty Kind is treated as "main" by the Subsonic layer).
+// Agents lists the performers referenced by Cue.AgentID within Line.
 type Lyrics struct {
-	DisplayArtist string `structs:"displayArtist,omitempty" json:"displayArtist,omitempty"`
-	DisplayTitle  string `structs:"displayTitle,omitempty"  json:"displayTitle,omitempty"`
-	Lang          string `structs:"lang"                    json:"lang"`
-	Line          []Line `structs:"line"                    json:"line"`
-	Offset        *int64 `structs:"offset,omitempty"        json:"offset,omitempty"`
-	Synced        bool   `structs:"synced"                  json:"synced"`
+	DisplayArtist string  `structs:"displayArtist,omitempty" json:"displayArtist,omitempty"`
+	DisplayTitle  string  `structs:"displayTitle,omitempty"  json:"displayTitle,omitempty"`
+	Kind          string  `structs:"kind,omitempty"          json:"kind,omitempty"`
+	Lang          string  `structs:"lang"                    json:"lang"`
+	Agents        []Agent `structs:"agents,omitempty"       json:"agents,omitempty"`
+	Line          []Line  `structs:"line"                    json:"line"`
+	Offset        *int64  `structs:"offset,omitempty"        json:"offset,omitempty"`
+	Synced        bool    `structs:"synced"                  json:"synced"`
 }

 // support the standard [mm:ss.mm], as well as [hh:*] and [*.mmm]
@ -33,6 +53,10 @@ var (
 	syncRegex  = regexp.MustCompile(`(^|\n)\s*` + timeRegexString)
 	timeRegex  = regexp.MustCompile(timeRegexString)
 	lrcIdRegex = regexp.MustCompile(`\[(ar|ti|offset|lang):([^]]+)]`)
+
+	// Enhanced LRC: inline word-level timing markers like <00:12.34>
+	enhancedLRCTimeString = `<([0-9]{1,2}:)?([0-9]{1,2}):([0-9]{1,2})(.[0-9]{1,3})?>`
+	enhancedLRCRegex      = regexp.MustCompile(enhancedLRCTimeString)
 )

 func (l Lyrics) IsEmpty() bool {
@ -106,9 +130,11 @@ func ToLyrics(language, text string) (*Lyrics, error) {

 			if validLine {
 				for idx := range timestamps {
+					value, cues := parseEnhancedLine(priorLine)
 					structuredLines = append(structuredLines, Line{
 						Start: &timestamps[idx],
-						Value: strings.TrimSpace(priorLine),
+						Value: value,
+						Cue:   cues,
 					})
 				}
 				timestamps = nil
@ -154,9 +180,11 @@ func ToLyrics(language, text string) (*Lyrics, error) {

 	if validLine {
 		for idx := range timestamps {
+			value, cues := parseEnhancedLine(priorLine)
 			structuredLines = append(structuredLines, Line{
 				Start: &timestamps[idx],
-				Value: strings.TrimSpace(priorLine),
+				Value: value,
+				Cue:   cues,
 			})
 		}
 	}
@ -173,13 +201,118 @@ func ToLyrics(language, text string) (*Lyrics, error) {
 		DisplayArtist: artist,
 		DisplayTitle:  title,
 		Lang:          language,
-		Line:          structuredLines,
+		Line:          NormalizeCueLines(structuredLines),
 		Offset:        offset,
 		Synced:        synced,
 	}
 	return &lyrics, nil
 }

+// parseEnhancedLine extracts word-level timing cues from Enhanced LRC inline markers
+// (e.g. <00:12.34>) and computes UTF-8 byte offsets against the final stripped line value.
+//
+// It returns the line text with every marker removed and surrounding whitespace
+// trimmed, plus one Cue per marker that is followed by text. A cue covers the
+// text up to the next marker; ByteStart and ByteEnd are inclusive byte offsets
+// into the returned value. Cue.End is left unset here.
+//
+// Text that has no usable marker in front of it (anything before the first
+// marker, or text following a marker whose time fails to parse) is kept in the
+// returned value but gets no cue, so lyric text is never dropped. The returned
+// cues are nil when the line has no markers or no marker is followed by text.
+func parseEnhancedLine(text string) (string, []Cue) {
+	matches := enhancedLRCRegex.FindAllStringSubmatchIndex(text, -1)
+	if len(matches) == 0 {
+		return strings.TrimSpace(text), nil
+	}
+
+	type segment struct {
+		start    int64
+		rawStart int
+		rawEnd   int
+	}
+
+	segments := make([]segment, 0, len(matches))
+	var rawValue strings.Builder
+	// Lines such as "Some <00:01.50>lyrics" omit the marker for the first word;
+	// keep that leading text so the value still contains the whole lyric.
+	rawValue.WriteString(text[:matches[0][0]])
+	for i, match := range matches {
+		// Text runs from after this marker to the start of the next marker (or end of string)
+		textEnd := len(text)
+		if i+1 < len(matches) {
+			textEnd = matches[i+1][0]
+		}
+
+		word := text[match[1]:textEnd]
+		if word == "" {
+			continue
+		}
+
+		timeMs, err := parseTime(
+			// Rewrite <...> as [...] so parseTime can handle it with the same logic
+			"["+text[match[0]+1:match[1]-1]+"]",
+			// Adjust match indices to point into our rewritten string (need start/end pairs for each group)
+			[]int{
+				0, match[1] - match[0],
+				adjustGroup(match, 2), adjustGroup(match, 3),
+				adjustGroup(match, 4), adjustGroup(match, 5),
+				adjustGroup(match, 6), adjustGroup(match, 7),
+				adjustGroup(match, 8), adjustGroup(match, 9),
+			},
+		)
+
+		rawStart := rawValue.Len()
+		rawValue.WriteString(word)
+		if err != nil {
+			// Unusable timestamp: the text stays in the value, just without a cue.
+			continue
+		}
+		segments = append(segments, segment{
+			start:    timeMs,
+			rawStart: rawStart,
+			rawEnd:   rawValue.Len(),
+		})
+	}
+
+	if len(segments) == 0 {
+		return strings.TrimSpace(stripEnhancedMarkers(text)), nil
+	}
+
+	// The returned value is whitespace-trimmed, so work out how many bytes are
+	// cut from each side and express every cue range relative to the trimmed text.
+	finalRaw := rawValue.String()
+	leftTrimBytes := len(finalRaw) - len(strings.TrimLeftFunc(finalRaw, unicode.IsSpace))
+	rightTrimBytes := len(finalRaw) - len(strings.TrimRightFunc(finalRaw, unicode.IsSpace))
+	trimmedEnd := len(finalRaw) - rightTrimBytes
+	if trimmedEnd < leftTrimBytes {
+		// All-whitespace value: both trims consumed the whole string.
+		trimmedEnd = leftTrimBytes
+	}
+
+	cues := make([]Cue, 0, len(segments))
+	for _, seg := range segments {
+		start := seg.start
+		byteStart := max(seg.rawStart, leftTrimBytes)
+		byteEnd := min(seg.rawEnd, trimmedEnd)
+		if byteStart >= byteEnd {
+			// Segment lies entirely inside the trimmed-away whitespace.
+			continue
+		}
+
+		cues = append(cues, Cue{
+			Start:     &start,
+			Value:     finalRaw[byteStart:byteEnd],
+			ByteStart: byteStart - leftTrimBytes,
+			// byteEnd is exclusive; the stored offset is inclusive.
+			ByteEnd: byteEnd - leftTrimBytes - 1,
+		})
+	}
+
+	return strings.TrimSpace(finalRaw), cues
+}
+
+// adjustGroup converts the submatch index match[groupIdx], which is an offset into
+// the full line, into an offset into the rewritten "[...]" string passed to parseTime.
+// The rewrite only swaps the '<' and '>' delimiters for '[' and ']', so the rewritten
+// string has the same length as the matched marker and the new offset is simply the
+// distance from the start of the match. An unmatched optional group (-1) stays -1.
+func adjustGroup(match []int, groupIdx int) int {
+	orig := match[groupIdx]
+	if orig == -1 {
+		return -1
+	}
+	// match[0] is the position of '<', which maps to index 0 ('[') of the rewritten string.
+	return orig - match[0]
+}
+
+// stripEnhancedMarkers removes every inline marker matched by enhancedLRCRegex
+// (e.g. <mm:ss.mm>) from text and returns the remaining lyric text untrimmed.
+func stripEnhancedMarkers(text string) string {
+	return enhancedLRCRegex.ReplaceAllString(text, "")
+}
+
 func parseTime(line string, match []int) (int64, error) {
 	var hours, millis int64
 	var err error
@ -227,3 +360,115 @@ func parseTime(line string, match []int) (int64, error) {
 }

 type LyricList []Lyrics
+
+// NormalizeLyrics returns lyrics with its lines passed through NormalizeCueLines
+// and with an empty (non-nil) Agents slice replaced by nil.
+func NormalizeLyrics(lyrics Lyrics) Lyrics {
+	lyrics.Line = NormalizeCueLines(lyrics.Line)
+	if len(lyrics.Agents) == 0 {
+		lyrics.Agents = nil
+	}
+	return lyrics
+}
+
+// NormalizeCueLines returns a new slice in which every line that carries cues
+// has been normalized by normalizeCueLine. The end time offered as a fallback
+// for cues without one is the line's own End, or otherwise the Start of the
+// following line. Empty input is returned as is.
+func NormalizeCueLines(lines []Line) []Line {
+	if len(lines) == 0 {
+		return lines
+	}
+
+	out := make([]Line, 0, len(lines))
+	for i, line := range lines {
+		var fallbackEnd *int64
+		switch {
+		case line.End != nil:
+			end := *line.End
+			fallbackEnd = &end
+		case i+1 < len(lines) && lines[i+1].Start != nil:
+			next := *lines[i+1].Start
+			fallbackEnd = &next
+		}
+		out = append(out, normalizeCueLine(line, fallbackEnd))
+	}
+
+	return out
+}
+
+// NormalizeLineTiming fills in a missing line Start and/or End from the line's
+// cues. Start becomes the earliest cue Start; End becomes the latest cue End,
+// where a cue without an End contributes its Start instead. Values already set
+// on the line are never overwritten, and lines without cues are returned as is.
+func NormalizeLineTiming(line Line) Line {
+	if len(line.Cue) == 0 {
+		return line
+	}
+
+	var (
+		minStart, maxEnd   int64
+		haveStart, haveEnd bool
+	)
+	for _, cue := range line.Cue {
+		if cue.Start != nil && (!haveStart || *cue.Start < minStart) {
+			minStart, haveStart = *cue.Start, true
+		}
+
+		end := cue.End
+		if end == nil {
+			end = cue.Start
+		}
+		if end != nil && (!haveEnd || *end > maxEnd) {
+			maxEnd, haveEnd = *end, true
+		}
+	}
+
+	if line.Start == nil && haveStart {
+		line.Start = &minStart
+	}
+	if line.End == nil && haveEnd {
+		line.End = &maxEnd
+	}
+	return line
+}
+
+// normalizeCueLine fills in missing cue end times for a single line and then
+// derives missing line timing via NormalizeLineTiming.
+//
+// A cue without an End takes the Start of the following cue; when there is no
+// such Start it takes fallbackEnd (if non-nil). If any cue still has no End
+// afterwards, every cue End on the line is cleared so the line never carries a
+// mix of timed and untimed cue ends.
+//
+// The caller's cue slice is never modified: the cues are cloned before any End
+// is written.
+func normalizeCueLine(line Line, fallbackEnd *int64) Line {
+	if len(line.Cue) == 0 {
+		return line
+	}
+
+	// line is passed by value, but line.Cue still shares its backing array with
+	// the caller's slice, so work on a private copy.
+	cues := slices.Clone(line.Cue)
+	complete := true
+	for i := range cues {
+		if cues[i].End != nil {
+			continue
+		}
+
+		switch {
+		case i+1 < len(cues) && cues[i+1].Start != nil:
+			v := *cues[i+1].Start
+			cues[i].End = &v
+		case fallbackEnd != nil:
+			v := *fallbackEnd
+			cues[i].End = &v
+		default:
+			complete = false
+		}
+	}
+
+	if !complete {
+		cues = clearCueEnds(cues)
+	}
+
+	line.Cue = cues
+	return NormalizeLineTiming(line)
+}
+
+// clearCueEnds returns a copy of cues in which every End is nil. The input
+// slice is left untouched.
+func clearCueEnds(cues []Cue) []Cue {
+	out := make([]Cue, len(cues))
+	for i, cue := range cues {
+		cue.End = nil
+		out[i] = cue
+	}
+	return out
+}
--- a/model/lyrics_test.go
+++ b/model/lyrics_test.go
@ -116,4 +116,85 @@ var _ = Describe("ToLyrics", func() {
 			{Start: &e, Value: "Test"},
 		}))
 	})
+
+	// Cue ends are derived during normalization: each cue ends at the next cue's
+	// start, and the last cue of a line ends at the next line's start. The final
+	// line has no following line, so its cues keep a nil End and the line End
+	// falls back to the start of its last cue.
+	It("should parse Enhanced LRC with word-level timing", func() {
+		lyrics, err := ToLyrics("xxx", "[00:01.00]<00:01.00>Some <00:01.50>lyrics <00:02.00>here\n[00:03.00]<00:03.00>More <00:03.50>words")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(lyrics.Synced).To(BeTrue())
+		Expect(lyrics.Line).To(HaveLen(2))
+
+		t1000, t1500, t2000, t3000, t3500 := int64(1000), int64(1500), int64(2000), int64(3000), int64(3500)
+
+		line0 := lyrics.Line[0]
+		Expect(line0.Start).To(Equal(&t1000))
+		Expect(line0.End).To(Equal(&t3000))
+		Expect(line0.Value).To(Equal("Some lyrics here"))
+		Expect(line0.Cue).To(Equal([]Cue{
+			{Start: &t1000, End: &t1500, Value: "Some ", ByteStart: 0, ByteEnd: 4},
+			{Start: &t1500, End: &t2000, Value: "lyrics ", ByteStart: 5, ByteEnd: 11},
+			{Start: &t2000, End: &t3000, Value: "here", ByteStart: 12, ByteEnd: 15},
+		}))
+
+		line1 := lyrics.Line[1]
+		Expect(line1.Start).To(Equal(&t3000))
+		Expect(line1.End).To(Equal(&t3500))
+		Expect(line1.Value).To(Equal("More words"))
+		Expect(line1.Cue).To(Equal([]Cue{
+			{Start: &t3000, Value: "More ", ByteStart: 0, ByteEnd: 4},
+			{Start: &t3500, Value: "words", ByteStart: 5, ByteEnd: 9},
+		}))
+
+		Expect(line1.Cue[1].End).To(BeNil())
+	})
+
+	// Regression guard: plain LRC lines without inline markers must come out
+	// exactly as before, with no End and no Cue set.
+	It("should ignore Enhanced LRC markers and return plain lines when no markers present", func() {
+		a, b := int64(1000), int64(3000)
+		lyrics, err := ToLyrics("xxx", "[00:01.00]Plain line\n[00:03.00]Another plain line")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(lyrics.Line).To(Equal([]Line{
+			{Start: &a, Value: "Plain line"},
+			{Start: &b, Value: "Another plain line"},
+		}))
+	})
+
+	// A plain line between two enhanced lines still supplies the fallback end
+	// (its Start) for the last cue of the preceding enhanced line, while itself
+	// carrying no cues. The trailing enhanced line has no successor, so its cue
+	// ends stay nil.
+	It("should handle mixed Enhanced and plain LRC lines", func() {
+		lyrics, err := ToLyrics("xxx", "[00:01.00]<00:01.00>Some <00:01.50>lyrics\n[00:03.00]Plain line\n[00:05.00]<00:05.00>More <00:05.50>words")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(lyrics.Line).To(HaveLen(3))
+
+		t1000, t1500, t5000, t5500 := int64(1000), int64(1500), int64(5000), int64(5500)
+		t3000 := int64(3000)
+
+		Expect(lyrics.Line[0].Cue).To(Equal([]Cue{
+			{Start: &t1000, End: &t1500, Value: "Some ", ByteStart: 0, ByteEnd: 4},
+			{Start: &t1500, End: &t3000, Value: "lyrics", ByteStart: 5, ByteEnd: 10},
+		}))
+		Expect(lyrics.Line[0].Value).To(Equal("Some lyrics"))
+		Expect(lyrics.Line[0].End).To(Equal(&t3000))
+
+		Expect(lyrics.Line[1].Cue).To(BeNil())
+		Expect(lyrics.Line[1].Value).To(Equal("Plain line"))
+
+		Expect(lyrics.Line[2].Cue).To(Equal([]Cue{
+			{Start: &t5000, Value: "More ", ByteStart: 0, ByteEnd: 4},
+			{Start: &t5500, Value: "words", ByteStart: 5, ByteEnd: 9},
+		}))
+		Expect(lyrics.Line[2].Value).To(Equal("More words"))
+	})
+
+	// Cue values keep the whitespace found between markers (e.g. " me "), and the
+	// inclusive byte ranges tile the line value without gaps. With a single line
+	// there is no fallback end, so every cue End stays nil.
+	It("should preserve byte offsets for Enhanced LRC cues", func() {
+		lyrics, err := ToLyrics("xxx", "[00:00.00]<00:00.00>Oh <00:00.90>love<00:01.30> me <00:01.60>tonight")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(lyrics.Line).To(HaveLen(1))
+
+		t0, t900, t1300, t1600 := int64(0), int64(900), int64(1300), int64(1600)
+		line := lyrics.Line[0]
+		Expect(line.Value).To(Equal("Oh love me tonight"))
+		Expect(line.Cue).To(Equal([]Cue{
+			{Start: &t0, Value: "Oh ", ByteStart: 0, ByteEnd: 2},
+			{Start: &t900, Value: "love", ByteStart: 3, ByteEnd: 6},
+			{Start: &t1300, Value: " me ", ByteStart: 7, ByteEnd: 10},
+			{Start: &t1600, Value: "tonight", ByteStart: 11, ByteEnd: 17},
+		}))
+	})
 })
--- a/server/subsonic/helpers.go
+++ b/server/subsonic/helpers.go
@ -493,14 +493,79 @@ func mapExplicitStatus(explicitStatus string) string {
 	return ""
 }

-func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics) responses.StructuredLyric {
+func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced bool) responses.StructuredLyric {
 	lines := make([]responses.Line, len(lyrics.Line))
+	var cueLines []responses.CueLine
+	agentOrderByID := make(map[string]int, len(lyrics.Agents))
+	agentRoleByID := make(map[string]string, len(lyrics.Agents))
+	responseAgents := make([]responses.Agent, 0, len(lyrics.Agents))
+
+	for i, agent := range lyrics.Agents {
+		agentOrderByID[agent.ID] = i
+		agentRoleByID[agent.ID] = agent.Role
+		responseAgents = append(responseAgents, responses.Agent{
+			ID:   agent.ID,
+			Role: agent.Role,
+			Name: agent.Name,
+		})
+	}

 	for i, line := range lyrics.Line {
 		lines[i] = responses.Line{
 			Start: line.Start,
 			Value: line.Value,
 		}
+		if !enhanced || len(line.Cue) == 0 {
+			continue
+		}
+
+		agentOrder := make([]string, 0, 2)
+		cuesByAgent := make(map[string][]model.Cue)
+		for _, cue := range line.Cue {
+			if cue.Start == nil {
+				continue
+			}
+			agentID := strings.TrimSpace(cue.AgentID)
+			if _, exists := cuesByAgent[agentID]; !exists {
+				agentOrder = append(agentOrder, agentID)
+			}
+			cuesByAgent[agentID] = append(cuesByAgent[agentID], cue)
+		}
+
+		// Order the agents of this line: agents whose role is "main" come first,
+		// then agents declared in lyrics.Agents (in declaration order), then
+		// undeclared agents. Ties report false so that SliceStable keeps the
+		// order of first appearance; comparing the slice indices i and j here
+		// would not be a consistent ordering, because they are current
+		// positions during the sort rather than properties of the elements.
+		sort.SliceStable(agentOrder, func(i, j int) bool {
+			left, right := agentOrder[i], agentOrder[j]
+
+			leftMain := agentRoleByID[left] == "main"
+			rightMain := agentRoleByID[right] == "main"
+			if leftMain != rightMain {
+				return leftMain
+			}
+
+			leftOrder, leftOK := agentOrderByID[left]
+			rightOrder, rightOK := agentOrderByID[right]
+			if leftOK != rightOK {
+				return leftOK
+			}
+			return leftOK && leftOrder < rightOrder
+		})
+
+		for _, agentID := range agentOrder {
+			cueLine := responses.CueLine{
+				Index: int32(i),
+				Start: line.Start,
+				End:   line.End,
+				Value: line.Value,
+				Cue:   buildLyricCues(cuesByAgent[agentID], line.End),
+			}
+			if agentID != "" {
+				cueLine.AgentID = agentID
+			}
+			cueLines = append(cueLines, cueLine)
+		}
 	}

 	structured := responses.StructuredLyric{
@ -508,10 +573,22 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics) responses.St
 		DisplayTitle:  lyrics.DisplayTitle,
 		Lang:          lyrics.Lang,
 		Line:          lines,
+		CueLine:       cueLines,
 		Offset:        lyrics.Offset,
 		Synced:        lyrics.Synced,
 	}

+	if enhanced {
+		kind := strings.TrimSpace(lyrics.Kind)
+		if kind == "" {
+			kind = "main"
+		}
+		structured.Kind = kind
+		if len(cueLines) > 0 && len(responseAgents) > 0 {
+			structured.Agents = responseAgents
+		}
+	}
+
 	if structured.DisplayArtist == "" {
 		structured.DisplayArtist = mf.Artist
 	}
@ -522,11 +599,86 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics) responses.St
 	return structured
 }

-func buildLyricsList(mf *model.MediaFile, lyricsList model.LyricList) *responses.LyricsList {
-	lyricList := make(responses.StructuredLyrics, len(lyricsList))
+// buildLyricCues converts model cues into response cues, skipping cues that
+// have no Start. End times are emitted on an all-or-nothing basis:
+//   - if no input cue has an End, no output cue gets one;
+//   - otherwise a missing End is taken from the next cue's Start, or from
+//     lineEnd, and every End is clamped so that it neither passes the next
+//     cue's Start nor precedes the cue's own Start;
+//   - if any output cue still lacks an End after that, all Ends are cleared.
+//
+// It returns nil for an empty input.
+func buildLyricCues(cues []model.Cue, lineEnd *int64) []responses.LyricCue {
+	if len(cues) == 0 {
+		return nil
+	}

-	for i, lyrics := range lyricsList {
-		lyricList[i] = buildStructuredLyric(mf, lyrics)
+	hasAnyEnd := false
+	for i := range cues {
+		if cues[i].End != nil {
+			hasAnyEnd = true
+			break
+		}
+	}
+
+	normalized := make([]responses.LyricCue, 0, len(cues))
+	for i := range cues {
+		if cues[i].Start == nil {
+			continue
+		}
+
+		cue := responses.LyricCue{
+			Start:     *cues[i].Start,
+			Value:     cues[i].Value,
+			ByteStart: cues[i].ByteStart,
+			ByteEnd:   cues[i].ByteEnd,
+		}
+		if hasAnyEnd {
+			end := cues[i].End
+			if end == nil {
+				// Fill the gap from the next cue's start, else from the line end.
+				if i+1 < len(cues) && cues[i+1].Start != nil {
+					v := *cues[i+1].Start
+					end = &v
+				} else if lineEnd != nil {
+					v := *lineEnd
+					end = &v
+				}
+			}
+			// Do not let a cue run past the start of the cue that follows it.
+			if end != nil && i+1 < len(cues) && cues[i+1].Start != nil && *end > *cues[i+1].Start {
+				v := *cues[i+1].Start
+				end = &v
+			}
+			// Do not let a cue end before it starts.
+			if end != nil && *end < cue.Start {
+				v := cue.Start
+				end = &v
+			}
+			cue.End = end
+		}
+		normalized = append(normalized, cue)
+	}
+
+	// A single unresolved End invalidates end timing for the whole line.
+	if hasAnyEnd {
+		for i := range normalized {
+			if normalized[i].End == nil {
+				for j := range normalized {
+					normalized[j].End = nil
+				}
+				break
+			}
+		}
+	}
+
+	return normalized
+}
+
+func buildLyricsList(mf *model.MediaFile, lyricsList model.LyricList, enhanced bool) *responses.LyricsList {
+	var filtered model.LyricList
+	if enhanced {
+		filtered = lyricsList
+	} else {
+		// Without enhanced, only return "main" kind entries
+		for _, l := range lyricsList {
+			kind := strings.TrimSpace(l.Kind)
+			if kind == "" || kind == "main" {
+				filtered = append(filtered, l)
+			}
+		}
+	}
+
+	lyricList := make(responses.StructuredLyrics, len(filtered))
+	for i, lyrics := range filtered {
+		lyricList[i] = buildStructuredLyric(mf, lyrics, enhanced)
 	}

 	res := &responses.LyricsList{
--- a/server/subsonic/media_retrieval.go
+++ b/server/subsonic/media_retrieval.go
@ -10,6 +10,7 @@ import (

 	"github.com/navidrome/navidrome/conf"
 	"github.com/navidrome/navidrome/consts"
+	lyricssvc "github.com/navidrome/navidrome/core/lyrics"
 	"github.com/navidrome/navidrome/log"
 	"github.com/navidrome/navidrome/model"
 	"github.com/navidrome/navidrome/resources"
@ -19,6 +20,8 @@ import (
 	"github.com/navidrome/navidrome/utils/req"
 )

+// maxLegacyLyricsCandidates caps how many media files matching the requested
+// artist/title the legacy getLyrics endpoint fetches when looking for lyrics.
+const maxLegacyLyricsCandidates = 10
+
 func (api *Router) GetAvatar(w http.ResponseWriter, r *http.Request) (*responses.Subsonic, error) {
 	if !conf.Server.EnableGravatar {
 		return api.getPlaceHolderAvatar(w, r)
@ -98,7 +101,11 @@ func (api *Router) GetLyrics(r *http.Request) (*responses.Subsonic, error) {
 	response := newResponse()
 	lyricsResponse := responses.Lyrics{}
 	response.Lyrics = &lyricsResponse
-	mediaFiles, err := api.ds.MediaFile(r.Context()).GetAll(filter.SongsByArtistTitleWithLyricsFirst(artist, title))
+	opts := filter.SongsByArtistTitleWithLyricsFirst(artist, title)
+	// Search a bounded duplicate window so source-priority fallback can still
+	// reach older matches without turning legacy getLyrics into an unbounded scan.
+	opts.Max = maxLegacyLyricsCandidates
+	mediaFiles, err := api.ds.MediaFile(r.Context()).GetAll(opts)

 	if err != nil {
 		return nil, err
@ -108,9 +115,22 @@ func (api *Router) GetLyrics(r *http.Request) (*responses.Subsonic, error) {
 		return response, nil
 	}

-	structuredLyrics, err := api.lyrics.GetLyrics(r.Context(), &mediaFiles[0])
-	if err != nil {
-		return nil, err
+	var structuredLyrics model.LyricList
+	if batchLyrics, ok := api.lyrics.(lyricssvc.BatchLyrics); ok {
+		structuredLyrics, err = batchLyrics.GetLyricsForMediaFiles(r.Context(), mediaFiles)
+		if err != nil {
+			return nil, err
+		}
+	} else {
+		for i := range mediaFiles {
+			structuredLyrics, err = api.lyrics.GetLyrics(r.Context(), &mediaFiles[i])
+			if err != nil {
+				return nil, err
+			}
+			if len(structuredLyrics) > 0 {
+				break
+			}
+		}
 	}

 	if len(structuredLyrics) == 0 {
@ -124,7 +144,6 @@ func (api *Router) GetLyrics(r *http.Request) (*responses.Subsonic, error) {
 	for _, line := range structuredLyrics[0].Line {
 		lyricsText.WriteString(line.Value + "\n")
 	}
-
 	lyricsResponse.Value = lyricsText.String()

 	return response, nil
@ -146,8 +165,10 @@ func (api *Router) GetLyricsBySongId(r *http.Request) (*responses.Subsonic, erro
 		return nil, err
 	}

+	enhanced, _ := req.Params(r).Bool("enhanced")
+
 	response := newResponse()
-	response.LyricsList = buildLyricsList(mediaFile, structuredLyrics)
+	response.LyricsList = buildLyricsList(mediaFile, structuredLyrics, enhanced)

 	return response, nil
 }
--- a/server/subsonic/media_retrieval_test.go
+++ b/server/subsonic/media_retrieval_test.go
@ -186,6 +186,41 @@ var _ = Describe("MediaRetrievalController", func() {
 			Expect(response.Lyrics.Title).To(Equal("Never Gonna Give You Up"))
 			Expect(response.Lyrics.Value).To(Equal("We're no strangers to love\nYou know the rules and so do I\n"))
 		})
+
+		// Two media files share artist and title. The newer one only has embedded
+		// lyrics; the older one has no stored lyrics but is expected to resolve to
+		// a TTML sidecar (presumably a .ttml file next to tests/fixtures/test.mp3 —
+		// verify against the fixtures). With ".ttml" ahead of "embedded" in the
+		// priority list, the sidecar text must win, and the query must be bounded
+		// by maxLegacyLyricsCandidates.
+		It("should prefer higher-priority sidecar lyrics across duplicate candidates", func() {
+			conf.Server.LyricsPriority = ".ttml,embedded"
+			r := newGetRequest("artist=Rick+Astley", "title=Never+Gonna+Give+You+Up")
+			baseTime := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC)
+			embedded, err := model.ToLyrics("eng", "Newest duplicate embedded lyrics")
+			Expect(err).ToNot(HaveOccurred())
+			embeddedJSON, err := json.Marshal(model.LyricList{*embedded})
+			Expect(err).ToNot(HaveOccurred())
+			mockRepo.SetData(model.MediaFiles{
+				{
+					ID:        "1",
+					Path:      "tests/fixtures/01 Invisible (RED) Edit Version.mp3",
+					Artist:    "Rick Astley",
+					Title:     "Never Gonna Give You Up",
+					Lyrics:    string(embeddedJSON),
+					UpdatedAt: baseTime.Add(2 * time.Hour), // Newer duplicate with embedded lyrics only
+				},
+				{
+					ID:        "2",
+					Path:      "tests/fixtures/test.mp3",
+					Artist:    "Rick Astley",
+					Title:     "Never Gonna Give You Up",
+					Lyrics:    "[]",
+					UpdatedAt: baseTime.Add(1 * time.Hour), // Older, but has TTML sidecar
+				},
+			})
+
+			response, err := router.GetLyrics(r)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(response.Lyrics.Artist).To(Equal("Rick Astley"))
+			Expect(response.Lyrics.Title).To(Equal("Never Gonna Give You Up"))
+			Expect(response.Lyrics.Value).To(Equal("We're no strangers to love\nYou know the rules and so do I\n"))
+			Expect(mockRepo.Options.Max).To(Equal(maxLegacyLyricsCandidates))
+		})
 	})

 	Describe("GetLyricsBySongId", func() {
@ -202,8 +237,10 @@ var _ = Describe("MediaRetrievalController", func() {

 				Expect(realLyric.DisplayArtist).To(Equal(expectedLyric.DisplayArtist))
 				Expect(realLyric.DisplayTitle).To(Equal(expectedLyric.DisplayTitle))
+				Expect(realLyric.Kind).To(Equal(expectedLyric.Kind))
 				Expect(realLyric.Lang).To(Equal(expectedLyric.Lang))
 				Expect(realLyric.Synced).To(Equal(expectedLyric.Synced))
+				Expect(realLyric.Agents).To(Equal(expectedLyric.Agents))

 				if expectedLyric.Offset == nil {
 					Expect(realLyric.Offset).To(BeNil())
@ -222,6 +259,38 @@ var _ = Describe("MediaRetrievalController", func() {
 						Expect(*realLine.Start).To(Equal(*expectedLine.Start))
 					}
 				}
+
+				Expect(realLyric.CueLine).To(HaveLen(len(expectedLyric.CueLine)))
+				for j, realCueLine := range realLyric.CueLine {
+					expectedCueLine := expectedLyric.CueLine[j]
+					Expect(realCueLine.Index).To(Equal(expectedCueLine.Index))
+					Expect(realCueLine.Value).To(Equal(expectedCueLine.Value))
+					Expect(realCueLine.AgentID).To(Equal(expectedCueLine.AgentID))
+					if expectedCueLine.Start == nil {
+						Expect(realCueLine.Start).To(BeNil())
+					} else {
+						Expect(*realCueLine.Start).To(Equal(*expectedCueLine.Start))
+					}
+					if expectedCueLine.End == nil {
+						Expect(realCueLine.End).To(BeNil())
+					} else {
+						Expect(*realCueLine.End).To(Equal(*expectedCueLine.End))
+					}
+
+					Expect(realCueLine.Cue).To(HaveLen(len(expectedCueLine.Cue)))
+					for k, realCue := range realCueLine.Cue {
+						expectedCue := expectedCueLine.Cue[k]
+						Expect(realCue.Value).To(Equal(expectedCue.Value))
+						Expect(realCue.Start).To(Equal(expectedCue.Start))
+						Expect(realCue.ByteStart).To(Equal(expectedCue.ByteStart))
+						Expect(realCue.ByteEnd).To(Equal(expectedCue.ByteEnd))
+						if expectedCue.End == nil {
+							Expect(realCue.End).To(BeNil())
+						} else {
+							Expect(*realCue.End).To(Equal(*expectedCue.End))
+						}
+					}
+				}
 			}
 		}

@ -323,6 +392,427 @@ var _ = Describe("MediaRetrievalController", func() {
 				},
 			})
 		})
+
+		It("should return multilingual TTML sidecar lyrics", func() {
+			// Configure the lyric sources as the .ttml sidecar followed by embedded
+			// tags; the embedded lyrics stored on the media file are an empty list.
+			conf.Server.LyricsPriority = ".ttml,embedded"
+			req := newGetRequest("id=1")
+
+			mockRepo.SetData(model.MediaFiles{
+				{
+					ID:     "1",
+					Path:   "tests/fixtures/test.mp3",
+					Artist: "Rick Astley",
+					Title:  "Never Gonna Give You Up",
+					Lyrics: "[]",
+				},
+			})
+
+			response, err := router.GetLyricsBySongId(req)
+			Expect(err).ToNot(HaveOccurred())
+
+			// English track: two lines are expected, in this order.
+			engSecondStart := int64(22800)
+			english := responses.StructuredLyric{
+				DisplayArtist: "Rick Astley",
+				DisplayTitle:  "Never Gonna Give You Up",
+				Lang:          "eng",
+				Synced:        true,
+				Line: []responses.Line{
+					{Start: &times[0], Value: "We're no strangers to love"},
+					{Start: &engSecondStart, Value: "You know the rules and so do I"},
+				},
+			}
+
+			// Portuguese track: a single line is expected.
+			porStart := int64(18800)
+			portuguese := responses.StructuredLyric{
+				DisplayArtist: "Rick Astley",
+				DisplayTitle:  "Never Gonna Give You Up",
+				Lang:          "por",
+				Synced:        true,
+				Line: []responses.Line{
+					{Start: &porStart, Value: "Nao somos estranhos ao amor"},
+				},
+			}
+
+			compareResponses(response.LyricsList, responses.LyricsList{
+				StructuredLyrics: responses.StructuredLyrics{english, portuguese},
+			})
+		})
+
+		It("should return metadata-linked translation and pronunciation tracks from TTML", func() {
+			conf.Server.LyricsPriority = ".ttml,embedded"
+			req := newGetRequest("id=1&enhanced=true")
+
+			mockRepo.SetData(model.MediaFiles{
+				{
+					ID:     "1",
+					Path:   "tests/fixtures/test-metadata.mp3",
+					Artist: "Rick Astley",
+					Title:  "Never Gonna Give You Up",
+					Lyrics: "[]",
+				},
+			})
+
+			response, err := router.GetLyricsBySongId(req)
+			Expect(err).ToNot(HaveOccurred())
+
+			var (
+				firstLineStart  = int64(1000)
+				secondLineStart = int64(2000)
+				cue1Start       = int64(2000)
+				cue1End         = int64(2300)
+				cue2Start       = int64(2300)
+				cue2End         = int64(2600)
+			)
+
+			// Main track: both lines, with line-level timing only.
+			mainTrack := responses.StructuredLyric{
+				DisplayArtist: "Rick Astley",
+				DisplayTitle:  "Never Gonna Give You Up",
+				Kind:          "main",
+				Lang:          "ja",
+				Synced:        true,
+				Line: []responses.Line{
+					{Start: &firstLineStart, Value: "こんにちは"},
+					{Start: &secondLineStart, Value: "こんばんは"},
+				},
+			}
+
+			// Translation track: one line, starting with the first main line.
+			translation := responses.StructuredLyric{
+				DisplayArtist: "Rick Astley",
+				DisplayTitle:  "Never Gonna Give You Up",
+				Kind:          "translation",
+				Lang:          "es",
+				Synced:        true,
+				Line: []responses.Line{
+					{Start: &firstLineStart, Value: "Hola"},
+				},
+			}
+
+			// Pronunciation track: one line, starting with the second main line,
+			// plus one cue per timed span of the fixture.
+			pronunciation := responses.StructuredLyric{
+				DisplayArtist: "Rick Astley",
+				DisplayTitle:  "Never Gonna Give You Up",
+				Kind:          "pronunciation",
+				Lang:          "ja-latn",
+				Synced:        true,
+				Line: []responses.Line{
+					{Start: &secondLineStart, Value: "konni"},
+				},
+				CueLine: []responses.CueLine{
+					{
+						Index: 0,
+						Start: &secondLineStart,
+						End:   &cue2End,
+						Value: "konni",
+						Cue: []responses.LyricCue{
+							{Start: cue1Start, End: &cue1End, ByteStart: 0, ByteEnd: 1, Value: "ko"},
+							{Start: cue2Start, End: &cue2End, ByteStart: 2, ByteEnd: 4, Value: "nni"},
+						},
+					},
+				},
+			}
+
+			compareResponses(response.LyricsList, responses.LyricsList{
+				StructuredLyrics: responses.StructuredLyrics{mainTrack, translation, pronunciation},
+			})
+		})
+
+		It("should return cue lines for songLyrics v2 clients with enhanced=true", func() {
+			r := newGetRequest("id=1&enhanced=true")
+
+			lineStart := int64(1000)
+			lineEnd := int64(3000)
+			tokenStartA := int64(1000)
+			tokenEndA := int64(1400)
+			tokenStartB := int64(2000)
+			tokenEndB := int64(2500)
+			// The stored lyric has a single line whose two cues are tagged with
+			// different agents ("lead" and "__nd_bg__|lead"); Kind is left unset.
+			lyricsJSON, err := json.Marshal(model.LyricList{
+				{
+					Lang:   "eng",
+					Agents: []model.Agent{{ID: "lead", Role: "main"}, {ID: "__nd_bg__|lead", Role: "bg"}},
+					Synced: true,
+					Line: []model.Line{
+						{
+							Start: &lineStart,
+							End:   &lineEnd,
+							Value: "Hello echo",
+							Cue: []model.Cue{
+								{
+									Start:     &tokenStartA,
+									End:       &tokenEndA,
+									Value:     "Hello",
+									ByteStart: 0,
+									ByteEnd:   4,
+									AgentID:   "lead",
+								},
+								{
+									Start:     &tokenStartB,
+									End:       &tokenEndB,
+									Value:     "echo",
+									ByteStart: 6,
+									ByteEnd:   9,
+									AgentID:   "__nd_bg__|lead",
+								},
+							},
+						},
+					},
+				},
+			})
+			Expect(err).ToNot(HaveOccurred())
+
+			mockRepo.SetData(model.MediaFiles{
+				{
+					ID:     "1",
+					Artist: "Rick Astley",
+					Title:  "Never Gonna Give You Up",
+					Lyrics: string(lyricsJSON),
+				},
+			})
+
+			response, err := router.GetLyricsBySongId(r)
+			Expect(err).ToNot(HaveOccurred())
+			// Expected: Kind is reported as "main", and the line yields one CueLine
+			// per agent, both with Index 0 and each holding only that agent's cue.
+			compareResponses(response.LyricsList, responses.LyricsList{
+				StructuredLyrics: responses.StructuredLyrics{
+					{
+						DisplayArtist: "Rick Astley",
+						DisplayTitle:  "Never Gonna Give You Up",
+						Kind:          "main",
+						Lang:          "eng",
+						Synced:        true,
+						Agents: []responses.Agent{
+							{ID: "lead", Role: "main"},
+							{ID: "__nd_bg__|lead", Role: "bg"},
+						},
+						Line: []responses.Line{
+							{
+								Start: &lineStart,
+								Value: "Hello echo",
+							},
+						},
+						CueLine: []responses.CueLine{
+							{
+								Index:   0,
+								Start:   &lineStart,
+								End:     &lineEnd,
+								Value:   "Hello echo",
+								AgentID: "lead",
+								Cue: []responses.LyricCue{
+									{
+										Start:     tokenStartA,
+										End:       &tokenEndA,
+										ByteStart: 0,
+										ByteEnd:   4,
+										Value:     "Hello",
+									},
+								},
+							},
+							{
+								Index:   0,
+								Start:   &lineStart,
+								End:     &lineEnd,
+								Value:   "Hello echo",
+								AgentID: "__nd_bg__|lead",
+								Cue: []responses.LyricCue{
+									{
+										Start:     tokenStartB,
+										End:       &tokenEndB,
+										ByteStart: 6,
+										ByteEnd:   9,
+										Value:     "echo",
+									},
+								},
+							},
+						},
+					},
+				},
+			})
+		})
+
+		It("should keep enhanced line-level lyrics when no cue data is available", func() {
+			req := newGetRequest("id=1&enhanced=true")
+
+			start, end := int64(1000), int64(3000)
+			const text = "Line without word timing"
+
+			// The stored lyric has line timing only: no Cue entries on the line.
+			payload, err := json.Marshal(model.LyricList{
+				{
+					Kind:   "main",
+					Lang:   "eng",
+					Synced: true,
+					Line: []model.Line{
+						{Start: &start, End: &end, Value: text},
+					},
+				},
+			})
+			Expect(err).ToNot(HaveOccurred())
+
+			mockRepo.SetData(model.MediaFiles{
+				{
+					ID:     "1",
+					Artist: "Rick Astley",
+					Title:  "Never Gonna Give You Up",
+					Lyrics: string(payload),
+				},
+			})
+
+			response, err := router.GetLyricsBySongId(req)
+			Expect(err).ToNot(HaveOccurred())
+
+			// The expected response keeps the plain line and has no CueLine entries.
+			expected := responses.StructuredLyric{
+				DisplayArtist: "Rick Astley",
+				DisplayTitle:  "Never Gonna Give You Up",
+				Kind:          "main",
+				Lang:          "eng",
+				Synced:        true,
+				Line: []responses.Line{
+					{Start: &start, Value: text},
+				},
+			}
+			compareResponses(response.LyricsList, responses.LyricsList{
+				StructuredLyrics: responses.StructuredLyrics{expected},
+			})
+		})
+
+		It("should return required cue byte offsets for ambiguous and multibyte cue lines", func() {
+			req := newGetRequest("id=1&enhanced=true")
+
+			// First line: ASCII text in which "love" appears twice; the "love" cue
+			// addresses bytes 8-11. Its last cue has no explicit end.
+			const enText = "Oh love love me tonight"
+			var (
+				enLineStart = int64(0)
+				enLineEnd   = int64(2400)
+				enCue1Start = int64(0)
+				enCue1End   = int64(300)
+				enCue2Start = int64(900)
+				enCue2End   = int64(1300)
+				enCue3Start = int64(1300)
+				enCue3End   = int64(1600)
+				enCue4Start = int64(1600)
+			)
+
+			// Second line: Hangul text, so the byte offsets advance by more than one
+			// per character.
+			const koText = "눈을 뜬 순간"
+			var (
+				koLineStart = int64(2747)
+				koLineEnd   = int64(6214)
+				koCue1Start = int64(2747)
+				koCue1End   = int64(3018)
+				koCue2Start = int64(3018)
+				koCue2End   = int64(3179)
+				koCue3Start = int64(3582)
+				koCue3End   = int64(4100)
+				koCue4Start = int64(4500)
+				koCue4End   = int64(6214)
+			)
+
+			payload, err := json.Marshal(model.LyricList{
+				{
+					Lang:   "eng",
+					Synced: true,
+					Line: []model.Line{
+						{
+							Start: &enLineStart,
+							End:   &enLineEnd,
+							Value: enText,
+							Cue: []model.Cue{
+								{Start: &enCue1Start, End: &enCue1End, Value: "Oh", ByteStart: 0, ByteEnd: 1},
+								{Start: &enCue2Start, End: &enCue2End, Value: "love", ByteStart: 8, ByteEnd: 11},
+								{Start: &enCue3Start, End: &enCue3End, Value: "me", ByteStart: 13, ByteEnd: 14},
+								{Start: &enCue4Start, Value: "tonight", ByteStart: 16, ByteEnd: 22},
+							},
+						},
+						{
+							Start: &koLineStart,
+							End:   &koLineEnd,
+							Value: koText,
+							Cue: []model.Cue{
+								{Start: &koCue1Start, End: &koCue1End, Value: "눈", ByteStart: 0, ByteEnd: 2},
+								{Start: &koCue2Start, End: &koCue2End, Value: "을", ByteStart: 3, ByteEnd: 5},
+								{Start: &koCue3Start, End: &koCue3End, Value: "뜬", ByteStart: 7, ByteEnd: 9},
+								{Start: &koCue4Start, End: &koCue4End, Value: "순간", ByteStart: 11, ByteEnd: 16},
+							},
+						},
+					},
+				},
+			})
+			Expect(err).ToNot(HaveOccurred())
+
+			mockRepo.SetData(model.MediaFiles{
+				{
+					ID:     "1",
+					Artist: "Rick Astley",
+					Title:  "Never Gonna Give You Up",
+					Lyrics: string(payload),
+				},
+			})
+
+			response, err := router.GetLyricsBySongId(req)
+			Expect(err).ToNot(HaveOccurred())
+
+			// Byte offsets are expected to pass through unchanged; the cue stored
+			// without an end is expected to end at the end of its line.
+			expectedCueLines := []responses.CueLine{
+				{
+					Index: 0,
+					Start: &enLineStart,
+					End:   &enLineEnd,
+					Value: enText,
+					Cue: []responses.LyricCue{
+						{Start: enCue1Start, End: &enCue1End, Value: "Oh", ByteStart: 0, ByteEnd: 1},
+						{Start: enCue2Start, End: &enCue2End, Value: "love", ByteStart: 8, ByteEnd: 11},
+						{Start: enCue3Start, End: &enCue3End, Value: "me", ByteStart: 13, ByteEnd: 14},
+						{Start: enCue4Start, End: &enLineEnd, Value: "tonight", ByteStart: 16, ByteEnd: 22},
+					},
+				},
+				{
+					Index: 1,
+					Start: &koLineStart,
+					End:   &koLineEnd,
+					Value: koText,
+					Cue: []responses.LyricCue{
+						{Start: koCue1Start, End: &koCue1End, Value: "눈", ByteStart: 0, ByteEnd: 2},
+						{Start: koCue2Start, End: &koCue2End, Value: "을", ByteStart: 3, ByteEnd: 5},
+						{Start: koCue3Start, End: &koCue3End, Value: "뜬", ByteStart: 7, ByteEnd: 9},
+						{Start: koCue4Start, End: &koCue4End, Value: "순간", ByteStart: 11, ByteEnd: 16},
+					},
+				},
+			}
+
+			compareResponses(response.LyricsList, responses.LyricsList{
+				StructuredLyrics: responses.StructuredLyrics{
+					{
+						DisplayArtist: "Rick Astley",
+						DisplayTitle:  "Never Gonna Give You Up",
+						Kind:          "main",
+						Lang:          "eng",
+						Synced:        true,
+						Line: []responses.Line{
+							{Start: &enLineStart, Value: enText},
+							{Start: &koLineStart, Value: koText},
+						},
+						CueLine: expectedCueLines,
+					},
+				},
+			})
+		})
 	})
 })

--- a/server/subsonic/opensubsonic.go
+++ b/server/subsonic/opensubsonic.go
@ -11,7 +11,7 @@ func (api *Router) GetOpenSubsonicExtensions(_ *http.Request) (*responses.Subson
 	extensions := responses.OpenSubsonicExtensions{
 		{Name: "transcodeOffset", Versions: []int32{1}},
 		{Name: "formPost", Versions: []int32{1}},
-		{Name: "songLyrics", Versions: []int32{1}},
+		{Name: "songLyrics", Versions: []int32{1, 2}},
 		{Name: "indexBasedQueue", Versions: []int32{1}},
 		{Name: "transcoding", Versions: []int32{1}},
 	}
--- a/server/subsonic/opensubsonic_test.go
+++ b/server/subsonic/opensubsonic_test.go
@ -58,7 +58,7 @@ var _ = Describe("GetOpenSubsonicExtensions", func() {
 				HaveLen(5),
 				ContainElement(responses.OpenSubsonicExtension{Name: "transcodeOffset", Versions: []int32{1}}),
 				ContainElement(responses.OpenSubsonicExtension{Name: "formPost", Versions: []int32{1}}),
-				ContainElement(responses.OpenSubsonicExtension{Name: "songLyrics", Versions: []int32{1}}),
+				ContainElement(responses.OpenSubsonicExtension{Name: "songLyrics", Versions: []int32{1, 2}}),
 				ContainElement(responses.OpenSubsonicExtension{Name: "indexBasedQueue", Versions: []int32{1}}),
 				ContainElement(responses.OpenSubsonicExtension{Name: "transcoding", Versions: []int32{1}}),
 			))
@ -87,7 +87,7 @@ var _ = Describe("GetOpenSubsonicExtensions", func() {
 				HaveLen(6),
 				ContainElement(responses.OpenSubsonicExtension{Name: "transcodeOffset", Versions: []int32{1}}),
 				ContainElement(responses.OpenSubsonicExtension{Name: "formPost", Versions: []int32{1}}),
-				ContainElement(responses.OpenSubsonicExtension{Name: "songLyrics", Versions: []int32{1}}),
+				ContainElement(responses.OpenSubsonicExtension{Name: "songLyrics", Versions: []int32{1, 2}}),
 				ContainElement(responses.OpenSubsonicExtension{Name: "indexBasedQueue", Versions: []int32{1}}),
 				ContainElement(responses.OpenSubsonicExtension{Name: "transcoding", Versions: []int32{1}}),
 				ContainElement(responses.OpenSubsonicExtension{Name: "sonicSimilarity", Versions: []int32{1}}),
--- a/server/subsonic/responses/responses.go
+++ b/server/subsonic/responses/responses.go
@ -543,13 +543,39 @@ type Line struct {
 	Value string `xml:",chardata"            json:"value"`
 }

+// LyricCue is one timed fragment of a CueLine, serialized as a <cue> element
+// in XML or as an entry of the "cue" array in JSON. End is optional and is
+// omitted from the output when nil; Value is the fragment text.
+//
+// NOTE(review): the tests in this change use ByteStart/ByteEnd as inclusive
+// byte offsets into the parent line's Value, and millisecond timestamps for
+// Start/End — confirm against the songLyrics extension spec.
+type LyricCue struct {
+	Start     int64  `xml:"start,attr"           json:"start"`
+	End       *int64 `xml:"end,attr,omitempty"   json:"end,omitempty"`
+	ByteStart int    `xml:"byteStart,attr"       json:"byteStart"`
+	ByteEnd   int    `xml:"byteEnd,attr"         json:"byteEnd"`
+	Value     string `xml:",chardata"            json:"value"`
+}
+
+// Agent is one entry of a structured lyric's agent list. ID and Role are
+// always serialized; Name is omitted when empty.
+//
+// NOTE(review): the tests in this change pair Agent.ID values with
+// CueLine.AgentID and use the roles "main" and "bg" — presumably an agent is
+// the voice a cue line belongs to; confirm.
+type Agent struct {
+	ID   string `xml:"id,attr"                 json:"id"`
+	Role string `xml:"role,attr"               json:"role"`
+	Name string `xml:"name,attr,omitempty"     json:"name,omitempty"`
+}
+
+// CueLine carries the cue-level timing of one lyric line: the line text in
+// Value plus its LyricCue fragments. Start, End, AgentID and Cue are optional
+// and omitted from the output when unset. Unlike Line, Value is serialized as
+// an XML attribute rather than as character data.
+//
+// NOTE(review): Index looks like the position of the matching entry in
+// StructuredLyric.Line; the tests in this change emit several CueLines with
+// the same Index, one per agent — confirm.
+type CueLine struct {
+	Index   int32      `xml:"index,attr"                    json:"index"`
+	Start   *int64     `xml:"start,attr,omitempty"          json:"start,omitempty"`
+	End     *int64     `xml:"end,attr,omitempty"            json:"end,omitempty"`
+	Value   string     `xml:"value,attr"                    json:"value"`
+	AgentID string     `xml:"agentId,attr,omitempty"        json:"agentId,omitempty"`
+	Cue     []LyricCue `xml:"cue,omitempty"                 json:"cue,omitempty"`
+}
+
+// StructuredLyric is one lyric track in the structured lyrics response.
+// Lang, Line and Synced are always serialized; DisplayArtist, DisplayTitle,
+// Kind, Agents, CueLine and Offset are omitted when empty.
+//
+// NOTE(review): the tests in this change use the Kind values "main",
+// "translation" and "pronunciation" — confirm whether that set is closed.
 type StructuredLyric struct {
-	DisplayArtist string `xml:"displayArtist,attr,omitempty" json:"displayArtist,omitempty"`
-	DisplayTitle  string `xml:"displayTitle,attr,omitempty"  json:"displayTitle,omitempty"`
-	Lang          string `xml:"lang,attr"                    json:"lang"`
-	Line          []Line `xml:"line"                         json:"line"`
-	Offset        *int64 `xml:"offset,attr,omitempty"        json:"offset,omitempty"`
-	Synced        bool   `xml:"synced,attr"                  json:"synced"`
+	DisplayArtist string    `xml:"displayArtist,attr,omitempty" json:"displayArtist,omitempty"`
+	DisplayTitle  string    `xml:"displayTitle,attr,omitempty"  json:"displayTitle,omitempty"`
+	Kind          string    `xml:"kind,attr,omitempty"          json:"kind,omitempty"`
+	Lang          string    `xml:"lang,attr"                    json:"lang"`
+	Line          []Line    `xml:"line"                         json:"line"`
+	Agents        []Agent   `xml:"agent,omitempty"              json:"agents,omitempty"`
+	CueLine       []CueLine `xml:"cueLine,omitempty"     json:"cueLine,omitempty"`
+	Offset        *int64    `xml:"offset,attr,omitempty"        json:"offset,omitempty"`
+	Synced        bool      `xml:"synced,attr"                  json:"synced"`
 }

 type StructuredLyrics []StructuredLyric
--- a/tests/fixtures/bom-test.ttml
+++ b/tests/fixtures/bom-test.ttml
@ -0,0 +1,2 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml"><body><div xml:lang="eng"><p begin="00:00:00.00">BOM test line</p></div></body></tt>
--- a/tests/fixtures/bom-utf16-test.ttml
+++ b/tests/fixtures/bom-utf16-test.ttml
--- a/tests/fixtures/test-enhanced.lrc
+++ b/tests/fixtures/test-enhanced.lrc
@ -0,0 +1,6 @@
+[ar:Test Artist]
+[ti:Enhanced Test]
+[lang:eng]
+[00:01.00]<00:01.00>Some <00:01.50>lyrics <00:02.00>here
+[00:03.00]<00:03.00>More <00:03.50>words
+[00:05.00]Plain line without inline markers
--- a/tests/fixtures/test-metadata.ttml
+++ b/tests/fixtures/test-metadata.ttml
@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:itunes="http://music.apple.com/lyric-ttml-internal">
+  <head>
+    <metadata>
+      <iTunesMetadata xmlns="http://music.apple.com/lyric-ttml-internal">
+        <translations>
+          <translation xml:lang="es">
+            <text for="L1">Hola</text>
+          </translation>
+        </translations>
+        <transliterations>
+          <transliteration xml:lang="ja-Latn">
+            <text for="L2"><span begin="00:02.000" end="00:02.300" xmlns="http://www.w3.org/ns/ttml">ko</span><span begin="00:02.300" end="00:02.600" xmlns="http://www.w3.org/ns/ttml">nni</span></text>
+          </transliteration>
+        </transliterations>
+      </iTunesMetadata>
+    </metadata>
+  </head>
+  <body xml:lang="ja">
+    <div>
+      <p begin="00:01.000" end="00:01.500" itunes:key="L1">こんにちは</p>
+      <p begin="00:02.000" end="00:02.700" itunes:key="L2">こんばんは</p>
+    </div>
+  </body>
+</tt>
--- a/tests/fixtures/test.elrc
+++ b/tests/fixtures/test.elrc
@ -0,0 +1,5 @@
+[ar:ELRC Artist]
+[ti:ELRC Song]
+[lang:eng]
+[00:01.00]<00:01.00>Lead <00:01.50>words
+[00:03.00]Fallback line
--- a/tests/fixtures/test.srt
+++ b/tests/fixtures/test.srt
@ -0,0 +1,7 @@
+1
+00:00:18,800 --> 00:00:22,800
+We're from subtitles
+
+2
+00:00:22,801 --> 00:00:26,000
+Another subtitle line
--- a/tests/fixtures/test.ttml
+++ b/tests/fixtures/test.ttml
@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttp="http://www.w3.org/ns/ttml#parameter" ttp:frameRate="30" ttp:subFrameRate="2" ttp:tickRate="10">
+  <body>
+    <div xml:lang="eng">
+      <p begin="00:00:18.80">We're no strangers to love</p>
+      <p begin="00:00:22:24">You know the rules and so do I</p>
+    </div>
+    <div xml:lang="por">
+      <p begin="188t">Nao somos estranhos ao amor</p>
+    </div>
+  </body>
+</tt>
--- a/ui/src/actions/player.js
+++ b/ui/src/actions/player.js
@ -9,6 +9,7 @@ export const PLAYER_SET_VOLUME = 'PLAYER_SET_VOLUME'
 export const PLAYER_SET_MODE = 'PLAYER_SET_MODE'
 export const TRANSCODING_SET_PROFILE = 'TRANSCODING_SET_PROFILE'
 export const PLAYER_REFRESH_QUEUE = 'PLAYER_REFRESH_QUEUE'
+export const PLAYER_UPDATE_LYRIC = 'PLAYER_UPDATE_LYRIC'

 export const setTrack = (data) => ({
  type: PLAYER_SET_TRACK,
@ -114,3 +115,8 @@ export const refreshQueue = (resolvedUrls) => ({
  type: PLAYER_REFRESH_QUEUE,
  data: resolvedUrls,
 })
+
+// Action creator for PLAYER_UPDATE_LYRIC; the payload carries the track id
+// together with the lyric passed in.
+export const updateQueueLyric = (trackId, lyric) => {
+  const data = { trackId, lyric }
+  return { type: PLAYER_UPDATE_LYRIC, data }
+}
--- a/ui/src/audioplayer/KaraokeLyricsOverlay.jsx
+++ b/ui/src/audioplayer/KaraokeLyricsOverlay.jsx
--- a/ui/src/audioplayer/KaraokeLyricsOverlay.test.jsx
+++ b/ui/src/audioplayer/KaraokeLyricsOverlay.test.jsx
@ -0,0 +1,514 @@
+import React from 'react'
+import {
+  cleanup,
+  fireEvent,
+  render,
+  screen,
+  waitFor,
+} from '@testing-library/react'
+import KaraokeLyricsOverlay from './KaraokeLyricsOverlay'
+
+// Line-height label the tests expect before any adjustment.
+const DEFAULT_LINE_HEIGHT_TEXT = '1.30'
+// Line-height label the tests expect after one ArrowRight on the slider.
+const NEXT_LINE_HEIGHT_TEXT = '1.32'
+
+// Stand-in for the audio element passed to the overlay as `audioInstance`;
+// individual tests override `currentTime` by spreading this object.
+const audioInstance = {
+  currentTime: 0,
+  paused: true,
+  seeking: false,
+  playbackRate: 1,
+}
+
+// Builds a minimal synced lyric track: a single line with start 1000 and the
+// given text.
+const buildLyric = (kind, lang, value) => {
+  const line = [{ start: 1000, value }]
+  return { kind, lang, synced: true, line }
+}
+
+// Renders the overlay with a default set of props; any key present in `props`
+// overrides the matching default.
+const renderOverlay = (props = {}) => {
+  const defaults = {
+    visible: true,
+    mainLyric: buildLyric('main', 'ja', 'こんにちは'),
+    translationLyric: buildLyric('translation', 'en', 'Hello'),
+    pronunciationLyric: buildLyric('pronunciation', 'ja-Latn', 'konnichiwa'),
+    showTranslation: false,
+    showPronunciation: true,
+    translationEnabled: true,
+    pronunciationEnabled: true,
+    onToggleTranslation: () => {},
+    onTogglePronunciation: () => {},
+    audioInstance,
+    onClose: () => {},
+  }
+  return render(<KaraokeLyricsOverlay {...defaults} {...props} />)
+}
+
+describe('<KaraokeLyricsOverlay /> behavior', () => {
+  beforeEach(() => {
+    localStorage.clear()
+    window.innerWidth = 1200
+    window.innerHeight = 900
+    vi.spyOn(window, 'requestAnimationFrame').mockImplementation(() => 1)
+    vi.spyOn(window, 'cancelAnimationFrame').mockImplementation(() => {})
+  })
+
+  afterEach(() => {
+    vi.restoreAllMocks()
+    cleanup()
+  })
+
+  it('shows tooltips for translation, pronunciation, and appearance controls', async () => {
+    renderOverlay()
+
+    fireEvent.mouseOver(screen.getByTestId('lyrics-language-badge-tr'))
+    expect(await screen.findByText('Show translation')).toBeInTheDocument()
+
+    fireEvent.mouseOver(screen.getByTestId('lyrics-language-badge-pr'))
+    expect(await screen.findByText('Hide pronunciation')).toBeInTheDocument()
+
+    fireEvent.mouseOver(screen.getByTestId('lyrics-settings-button'))
+    expect(await screen.findByText('Appearance')).toBeInTheDocument()
+  })
+
+  it('renders inline mode without the desktop resize handle', () => {
+    renderOverlay({ inline: true })
+
+    expect(screen.getByTestId('karaoke-lyrics-overlay')).toHaveAttribute(
+      'data-inline',
+      'true',
+    )
+    expect(screen.queryByTestId('lyrics-resize-handle')).not.toBeInTheDocument()
+  })
+
+  it('renders the appearance popup with Main label and default line height for older settings', async () => {
+    localStorage.setItem(
+      'karaoke-lyrics-settings',
+      JSON.stringify({
+        tr: { fontSize: 16, colorKey: 'blue' },
+        main: { fontSize: 26, colorKey: 'white' },
+        pr: { fontSize: 15, colorKey: 'green' },
+      }),
+    )
+
+    renderOverlay()
+
+    fireEvent.click(screen.getByTestId('lyrics-settings-button'))
+
+    expect(await screen.findByText('Appearance')).toBeInTheDocument()
+    expect(screen.getByText('Main', { selector: 'div' })).toBeInTheDocument()
+    expect(screen.queryByText('Default')).not.toBeInTheDocument()
+    expect(screen.getByTestId('lyrics-reset-appearance')).toBeInTheDocument()
+    expect(screen.getByTestId('lyrics-line-height-value')).toHaveTextContent(
+      DEFAULT_LINE_HEIGHT_TEXT,
+    )
+  })
+
+  it('renders the lyric group in main, pronunciation, translation order with layer badges', () => {
+    renderOverlay({
+      showTranslation: true,
+      showPronunciation: true,
+    })
+
+    const mainLine = screen.getByText('こんにちは')
+    const pronunciationLine = screen.getByText('konnichiwa')
+    const translationLine = screen.getByText('Hello')
+
+    expect(
+      mainLine.compareDocumentPosition(pronunciationLine) &
+        Node.DOCUMENT_POSITION_FOLLOWING,
+    ).toBeTruthy()
+    expect(
+      pronunciationLine.compareDocumentPosition(translationLine) &
+        Node.DOCUMENT_POSITION_FOLLOWING,
+    ).toBeTruthy()
+
+    expect(screen.getByTestId('lyrics-language-badge-main')).toHaveTextContent(
+      'Mainja',
+    )
+    expect(screen.getByTestId('lyrics-language-badge-pr')).toHaveTextContent(
+      'PRja-Latn',
+    )
+    expect(screen.getByTestId('lyrics-language-badge-tr')).toHaveTextContent(
+      'TRen',
+    )
+  })
+
+  it('renders line-timed rows as whole-line spans without synthetic token splits', () => {
+    renderOverlay({
+      mainLyric: {
+        kind: 'main',
+        lang: 'en',
+        synced: true,
+        line: [
+          { start: 1000, end: 2400, value: 'Batter up, batter up, batter up' },
+        ],
+      },
+      translationLyric: {
+        kind: 'translation',
+        lang: 'ja',
+        synced: true,
+        line: [
+          {
+            start: 1000,
+            end: 2400,
+            value: 'バッターアップ、バッターアップ、バッターアップ',
+          },
+        ],
+      },
+      pronunciationLyric: {
+        kind: 'pronunciation',
+        lang: 'ja-Latn',
+        synced: true,
+        line: [
+          {
+            start: 1000,
+            end: 2400,
+            value: 'Battaa appu, battaa appu, battaa appu',
+          },
+        ],
+      },
+      showTranslation: true,
+      showPronunciation: true,
+    })
+
+    const mainLine = screen.getByText(
+      'Batter up, batter up, batter up',
+    ).parentElement
+    const pronunciationLine = screen.getByText(
+      'Battaa appu, battaa appu, battaa appu',
+    ).parentElement
+    const translationLine = screen.getByText(
+      'バッターアップ、バッターアップ、バッターアップ',
+    ).parentElement
+
+    expect(mainLine.querySelectorAll('span')).toHaveLength(1)
+    expect(pronunciationLine.querySelectorAll('span')).toHaveLength(1)
+    expect(translationLine.querySelectorAll('span')).toHaveLength(1)
+  })
+
+  it('uses cue byte offsets to segment repeated words in the karaoke line', () => {
+    renderOverlay({
+      mainLyric: {
+        kind: 'main',
+        lang: 'en',
+        synced: true,
+        line: [{ start: 0, end: 2400, value: 'Oh love love me tonight' }],
+        cueLine: [
+          {
+            index: 0,
+            start: 0,
+            end: 2400,
+            value: 'Oh love love me tonight',
+            cue: [
+              { start: 0, end: 300, value: 'Oh', byteStart: 0, byteEnd: 1 },
+              {
+                start: 900,
+                end: 1300,
+                value: 'love',
+                byteStart: 8,
+                byteEnd: 11,
+              },
+              {
+                start: 1300,
+                end: 1600,
+                value: 'me',
+                byteStart: 13,
+                byteEnd: 14,
+              },
+              {
+                start: 1600,
+                end: 2400,
+                value: 'tonight',
+                byteStart: 16,
+                byteEnd: 22,
+              },
+            ],
+          },
+        ],
+      },
+      translationLyric: null,
+      pronunciationLyric: null,
+      showTranslation: false,
+      showPronunciation: false,
+      translationEnabled: false,
+      pronunciationEnabled: false,
+      audioInstance: {
+        ...audioInstance,
+        currentTime: 1.0,
+      },
+    })
+
+    const mainLine = screen.getByText('Oh').parentElement
+    const segments = Array.from(mainLine.querySelectorAll('span')).map(
+      (span) => span.textContent,
+    )
+
+    expect(segments).toEqual([
+      'Oh',
+      ' love ',
+      'love',
+      ' ',
+      'me',
+      ' ',
+      'tonight',
+    ])
+  })
+
+  it('uses cue byte offsets to preserve explicit space cues in multibyte karaoke lines', () => {
+    renderOverlay({
+      mainLyric: {
+        kind: 'main',
+        lang: 'ko',
+        synced: true,
+        line: [{ start: 0, end: 900, value: '눈을 뜬 순간' }],
+        cueLine: [
+          {
+            index: 0,
+            start: 0,
+            end: 900,
+            value: '눈을 뜬 순간',
+            cue: [
+              { start: 0, end: 150, value: '눈을', byteStart: 0, byteEnd: 5 },
+              { start: 150, end: 250, value: ' ', byteStart: 6, byteEnd: 6 },
+              { start: 250, end: 450, value: '뜬', byteStart: 7, byteEnd: 9 },
+              { start: 450, end: 550, value: ' ', byteStart: 10, byteEnd: 10 },
+              { start: 550, end: 900, value: '순간', byteStart: 11, byteEnd: 16 },
+            ],
+          },
+        ],
+      },
+      translationLyric: null,
+      pronunciationLyric: null,
+      showTranslation: false,
+      showPronunciation: false,
+      translationEnabled: false,
+      pronunciationEnabled: false,
+      audioInstance: {
+        ...audioInstance,
+        currentTime: 0.3,
+      },
+    })
+
+    const mainLine = screen.getByText('눈을').parentElement
+    const segments = Array.from(mainLine.querySelectorAll('span')).map(
+      (span) => span.textContent,
+    )
+
+    expect(segments).toEqual(['눈을', ' ', '뜬', ' ', '순간'])
+  })
+
+  it('highlights line-timed pronunciation and translation rows with the active main line', () => {
+    renderOverlay({
+      mainLyric: {
+        kind: 'main',
+        lang: 'en',
+        synced: true,
+        line: [
+          { start: 1000, end: 1800, value: 'Line one' },
+          { start: 2500, end: 3300, value: 'Line two' },
+        ],
+      },
+      translationLyric: {
+        kind: 'translation',
+        lang: 'ja',
+        synced: true,
+        line: [
+          { start: 1000, end: 1800, value: '一行目' },
+          { start: 2500, end: 3300, value: '二行目' },
+        ],
+      },
+      pronunciationLyric: {
+        kind: 'pronunciation',
+        lang: 'ja-Latn',
+        synced: true,
+        line: [
+          { start: 1000, end: 1800, value: 'ichigyoume' },
+          { start: 2500, end: 3300, value: 'nigyoume' },
+        ],
+      },
+      showTranslation: true,
+      showPronunciation: true,
+      audioInstance: {
+        ...audioInstance,
+        currentTime: 1.2,
+      },
+    })
+
+    const activePronunciation = screen.getByText('ichigyoume').parentElement
+    const inactivePronunciation = screen.getByText('nigyoume').parentElement
+    const activeTranslation = screen.getByText('一行目').parentElement
+    const inactiveTranslation = screen.getByText('二行目').parentElement
+
+    expect(parseFloat(activePronunciation.style.opacity)).toBeGreaterThan(
+      parseFloat(inactivePronunciation.style.opacity),
+    )
+    expect(parseFloat(activeTranslation.style.opacity)).toBeGreaterThan(
+      parseFloat(inactiveTranslation.style.opacity),
+    )
+  })
+
+  it('pre-wraps inactive main lines so the active line keeps the same wrap shape', () => {
+    renderOverlay({
+      mainLyric: {
+        kind: 'main',
+        lang: 'en',
+        synced: true,
+        line: [
+          { start: 1000, end: 1800, value: 'First line that is getting focus' },
+          { start: 2500, end: 3300, value: 'Second line waiting below' },
+        ],
+      },
+      translationLyric: null,
+      pronunciationLyric: null,
+      showTranslation: false,
+      showPronunciation: false,
+      translationEnabled: false,
+      pronunciationEnabled: false,
+      audioInstance: {
+        ...audioInstance,
+        currentTime: 1.2,
+      },
+    })
+
+    const activeLine = screen.getByText('First line that is getting focus')
+      .parentElement
+    const inactiveLine = screen.getByText('Second line waiting below')
+      .parentElement
+
+    expect(parseFloat(activeLine.style.fontSize)).toBeGreaterThan(
+      parseFloat(inactiveLine.style.fontSize),
+    )
+    expect(activeLine.style.maxWidth).toBe('100%')
+    expect(inactiveLine.style.maxWidth).toBe('80%')
+  })
+
+  it('centers pronunciation text inside the pill container', () => {
+    renderOverlay({
+      showTranslation: false,
+      showPronunciation: true,
+    })
+
+    const pronunciationLine = screen.getByText('konnichiwa').parentElement
+    const styles = window.getComputedStyle(pronunciationLine)
+
+    expect(styles.display).toBe('inline-flex')
+    expect(styles.justifyContent).toBe('center')
+    expect(styles.alignItems).toBe('center')
+  })
+
+  // Unsynced lyrics (no start/end on any line) must not highlight a single
+  // line: both lines are expected at full opacity with the same color.
+  it('renders untimed text lyrics in manual reading mode without a pinned active line', () => {
+    renderOverlay({
+      mainLyric: {
+        kind: 'main',
+        lang: 'en',
+        synced: false,
+        line: [{ value: 'First plain line' }, { value: 'Second plain line' }],
+      },
+      translationLyric: null,
+      pronunciationLyric: null,
+      showTranslation: false,
+      showPronunciation: false,
+      translationEnabled: false,
+      pronunciationEnabled: false,
+    })
+
+    const firstLine = screen.getByText('First plain line').parentElement
+    const secondLine = screen.getByText('Second plain line').parentElement
+
+    expect(firstLine.style.opacity).toBe('1')
+    expect(secondLine.style.opacity).toBe('1')
+    expect(firstLine.style.color).toBe(secondLine.style.color)
+  })
+
+  // End-to-end settings flow: bump the line-height slider one step, drag the
+  // resize handle, then verify both the rendered styles and the values stored
+  // under the 'karaoke-lyrics-settings' localStorage key.
+  it('persists line height changes, keeps aux line spacing fixed, and stores overlay height', async () => {
+    renderOverlay({
+      mainLyric: buildLyric('main', 'en', 'Hello world'),
+      translationLyric: buildLyric('translation', 'es', 'Hola'),
+      pronunciationLyric: buildLyric('pronunciation', 'en-Latn', 'heh-loh'),
+      showTranslation: true,
+      showPronunciation: true,
+      translationEnabled: true,
+      pronunciationEnabled: true,
+    })
+
+    const overlay = screen.getByTestId('karaoke-lyrics-overlay')
+    const mainLine = screen.getByText('Hello world').parentElement
+    const pronunciationLine = screen.getByText('heh-loh').parentElement
+    expect(mainLine).toHaveStyle(`line-height: ${DEFAULT_LINE_HEIGHT_TEXT}`)
+    expect(pronunciationLine).toHaveStyle('line-height: 1.2')
+
+    fireEvent.click(screen.getByTestId('lyrics-settings-button'))
+
+    // A single ArrowRight key press moves the slider to the next step.
+    const slider = screen.getByRole('slider', { name: 'Line height' })
+    slider.focus()
+    fireEvent.keyDown(slider, { key: 'ArrowRight' })
+
+    await waitFor(() =>
+      expect(screen.getByTestId('lyrics-line-height-value')).toHaveTextContent(
+        NEXT_LINE_HEIGHT_TEXT,
+      ),
+    )
+
+    // Only the main line follows the setting; the pronunciation line stays at 1.2.
+    await waitFor(() =>
+      expect(mainLine).toHaveStyle(`line-height: ${NEXT_LINE_HEIGHT_TEXT}`),
+    )
+    expect(pronunciationLine).toHaveStyle('line-height: 1.2')
+
+    // Dragging the handle up by 40px (400 -> 360) is expected to yield 340px.
+    fireEvent.mouseDown(screen.getByTestId('lyrics-resize-handle'), {
+      clientY: 400,
+    })
+    fireEvent.mouseMove(window, { clientY: 360 })
+    fireEvent.mouseUp(window)
+
+    await waitFor(() => expect(overlay).toHaveStyle('height: 340px'))
+
+    const stored = JSON.parse(localStorage.getItem('karaoke-lyrics-settings'))
+    expect(stored.lineHeight).toBeCloseTo(1.32, 2)
+    expect(stored.overlayHeight).toBe(340)
+  })
+
+  // Seeds non-default settings in localStorage, confirms they are applied on
+  // render, then uses the reset control and expects the defaults (line height
+  // 1.3, overlay height 300) both on screen and in storage.
+  it('resets appearance back to the default spacing and overlay height', async () => {
+    localStorage.setItem(
+      'karaoke-lyrics-settings',
+      JSON.stringify({
+        lineHeight: 1.8,
+        overlayHeight: 420,
+        tr: { fontSize: 16, colorKey: 'yellow' },
+        main: { fontSize: 28, colorKey: 'cyan' },
+        pr: { fontSize: 15, colorKey: 'pink' },
+      }),
+    )
+
+    renderOverlay({
+      mainLyric: buildLyric('main', 'en', 'Hello world'),
+      translationLyric: null,
+      pronunciationLyric: null,
+      showPronunciation: false,
+      translationEnabled: false,
+      pronunciationEnabled: false,
+    })
+
+    // The stored values must be picked up on the first render.
+    const overlay = screen.getByTestId('karaoke-lyrics-overlay')
+    const mainLine = screen.getByText('Hello world').parentElement
+    expect(overlay).toHaveStyle('height: 420px')
+    expect(mainLine).toHaveStyle('line-height: 1.8')
+
+    fireEvent.click(screen.getByTestId('lyrics-settings-button'))
+    fireEvent.click(screen.getByTestId('lyrics-reset-appearance'))
+
+    await waitFor(() =>
+      expect(screen.getByTestId('lyrics-line-height-value')).toHaveTextContent(
+        DEFAULT_LINE_HEIGHT_TEXT,
+      ),
+    )
+    await waitFor(() => expect(overlay).toHaveStyle('height: 300px'))
+    await waitFor(() =>
+      expect(mainLine).toHaveStyle(`line-height: ${DEFAULT_LINE_HEIGHT_TEXT}`),
+    )
+
+    const stored = JSON.parse(localStorage.getItem('karaoke-lyrics-settings'))
+    expect(stored.lineHeight).toBeCloseTo(1.3, 2)
+    expect(stored.overlayHeight).toBe(300)
+  })
+})
--- a/ui/src/audioplayer/MobileKaraokeLyricsPortal.jsx
+++ b/ui/src/audioplayer/MobileKaraokeLyricsPortal.jsx
@ -0,0 +1,65 @@
+import React, { useEffect, useState } from 'react'
+import { createPortal } from 'react-dom'
+
+// CSS selector for the element the mobile lyrics are portalled into.
+export const MOBILE_KARAOKE_LYRICS_HOST_SELECTOR =
+  '.react-jinke-music-player-mobile-cover'
+// Class name toggled on the host element while the portal is active.
+export const MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS = 'nd-mobile-lyrics-active'
+
+// Looks up the portal host element; yields null when there is no `document`
+// or when no element matches the host selector.
+const resolveMobileLyricsHost = () =>
+  typeof document === 'undefined'
+    ? null
+    : document.querySelector(MOBILE_KARAOKE_LYRICS_HOST_SELECTOR)
+
+// Renders `children` into the mobile cover host element via a React portal
+// while `active` is true, and marks that host with the active class.
+// Renders nothing when inactive or when the host element is not in the DOM.
+const MobileKaraokeLyricsPortal = ({ active, children }) => {
+  // Resolve the host eagerly on first render so an already-mounted host is
+  // usable without waiting for the effect below.
+  const [host, setHost] = useState(() =>
+    active ? resolveMobileLyricsHost() : null,
+  )
+
+  // Keeps `host` in sync with the DOM while active: re-queries on every
+  // child-list mutation under document.body so the portal can attach when the
+  // host appears later, and detach when it disappears.
+  useEffect(() => {
+    if (typeof document === 'undefined') {
+      setHost(null)
+      return undefined
+    }
+
+    if (!active) {
+      setHost(null)
+      return undefined
+    }
+
+    const syncHost = () => {
+      setHost(resolveMobileLyricsHost())
+    }
+
+    syncHost()
+
+    const observer = new MutationObserver(syncHost)
+    observer.observe(document.body, {
+      childList: true,
+      subtree: true,
+    })
+
+    return () => observer.disconnect()
+  }, [active])
+
+  // Mirrors `active` onto the host's class list; the cleanup removes the class
+  // when the host changes, `active` changes, or the component unmounts.
+  useEffect(() => {
+    if (!host) {
+      return undefined
+    }
+
+    host.classList.toggle(MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS, active)
+
+    return () => {
+      host.classList.remove(MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS)
+    }
+  }, [active, host])
+
+  if (!active || !host) {
+    return null
+  }
+
+  return createPortal(children, host)
+}
+
+export default MobileKaraokeLyricsPortal
--- a/ui/src/audioplayer/MobileKaraokeLyricsPortal.test.jsx
+++ b/ui/src/audioplayer/MobileKaraokeLyricsPortal.test.jsx
@ -0,0 +1,55 @@
+import React from 'react'
+import { cleanup, render, screen, waitFor } from '@testing-library/react'
+import MobileKaraokeLyricsPortal, {
+  MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS,
+} from './MobileKaraokeLyricsPortal'
+
+const HOST_CLASS = 'react-jinke-music-player-mobile-cover'
+
+// Behavior tests for the portal: attaching to an existing host, detaching when
+// deactivated, and attaching to a host that is added after mount.
+describe('<MobileKaraokeLyricsPortal />', () => {
+  afterEach(() => {
+    cleanup()
+    // Remove any host elements the tests appended directly to the body.
+    document.body.innerHTML = ''
+  })
+
+  it('renders lyrics into the mobile cover host and toggles the active class', () => {
+    // The host exists before the portal mounts.
+    const host = document.createElement('div')
+    host.className = HOST_CLASS
+    document.body.appendChild(host)
+
+    const { rerender } = render(
+      <MobileKaraokeLyricsPortal active={true}>
+        <div data-testid="mobile-inline-lyrics">Lyrics</div>
+      </MobileKaraokeLyricsPortal>,
+    )
+
+    expect(host).toContainElement(screen.getByTestId('mobile-inline-lyrics'))
+    expect(host).toHaveClass(MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS)
+
+    // Deactivating must unmount the children and drop the marker class.
+    rerender(
+      <MobileKaraokeLyricsPortal active={false}>
+        <div data-testid="mobile-inline-lyrics">Lyrics</div>
+      </MobileKaraokeLyricsPortal>,
+    )
+
+    expect(screen.queryByTestId('mobile-inline-lyrics')).not.toBeInTheDocument()
+    expect(host).not.toHaveClass(MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS)
+  })
+
+  it('attaches when the mobile cover host appears after mount', async () => {
+    render(
+      <MobileKaraokeLyricsPortal active={true}>
+        <div data-testid="mobile-inline-lyrics">Lyrics</div>
+      </MobileKaraokeLyricsPortal>,
+    )
+
+    // The host is added only after the portal has mounted, so attachment
+    // happens asynchronously and is awaited below.
+    const host = document.createElement('div')
+    host.className = HOST_CLASS
+    document.body.appendChild(host)
+
+    await waitFor(() =>
+      expect(host).toContainElement(screen.getByTestId('mobile-inline-lyrics')),
+    )
+    expect(host).toHaveClass(MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS)
+  })
+})
--- a/ui/src/audioplayer/Player.jsx
+++ b/ui/src/audioplayer/Player.jsx
@ -22,6 +22,7 @@ import {
  refreshQueue,
  setPlayMode,
  setTranscodingProfile,
+  updateQueueLyric,
  setVolume,
  syncQueue,
 } from '../actions'
@ -33,6 +34,30 @@ import { keyMap } from '../hotkeys'
 import keyHandlers from './keyHandlers'
 import { calculateGain } from '../utils/calculateReplayGain'
 import { detectBrowserProfile, decisionService } from '../transcode'
+import {
+  getPreferredLyricLanguage,
+  hasStructuredLyricContent,
+  selectLyricLayers,
+  structuredLyricToLrc,
+} from './lyrics'
+import {
+  resolveLyricsOverlayState,
+  togglePronunciationPreference,
+} from './lyricsOverlayState'
+import KaraokeLyricsOverlay from './KaraokeLyricsOverlay'
+import MobileKaraokeLyricsPortal from './MobileKaraokeLyricsPortal'
+
+// Shared "no lyrics" value: every layer (main, translation, pronunciation) is
+// null. Used as the initial and reset state for the selected lyric layers.
+const emptyLyricLayers = {
+  main: null,
+  translation: null,
+  pronunciation: null,
+}
+
+// Produces a fresh layers object that always carries the three layer keys,
+// replacing any missing or falsy layer with null.
+const normalizeLyricLayers = (layers) => {
+  const { main, translation, pronunciation } = layers || {}
+  return {
+    main: main || null,
+    translation: translation || null,
+    pronunciation: pronunciation || null,
+  }
+}

 const Player = () => {
  const theme = useCurrentTheme()
@ -120,6 +145,83 @@ const Player = () => {
  const gainInfo = useSelector((state) => state.replayGain)
  const [context, setContext] = useState(null)
  const [gainNode, setGainNode] = useState(null)
+  const lyricCacheRef = useRef(new Map())
+  const lyricRequestIdRef = useRef(0)
+  const playerRef = useRef(null)
+  const [karaokeVisiblePreference, setKaraokeVisiblePreference] =
+    useState(false)
+  const [selectedLyricLayers, setSelectedLyricLayers] =
+    useState(emptyLyricLayers)
+  const [translationPreference, setTranslationPreference] = useState(false)
+  const [pronunciationPreference, setPronunciationPreference] = useState(null)
+  const currentTrackId = playerState.current?.trackId
+  const currentTrackIsRadio = playerState.current?.isRadio
+  const selectedStructuredLyric = selectedLyricLayers.main
+  const hasKaraokeLyric = hasStructuredLyricContent(selectedStructuredLyric)
+  const hasTranslationLyric = hasStructuredLyricContent(
+    selectedLyricLayers.translation,
+  )
+  const hasPronunciationLyric = hasStructuredLyricContent(
+    selectedLyricLayers.pronunciation,
+  )
+  const { karaokeVisible, showTranslation, showPronunciation } =
+    resolveLyricsOverlayState({
+      karaokeVisiblePreference,
+      translationPreference,
+      pronunciationPreference,
+      hasKaraokeLyric,
+      hasTranslationLyric,
+      hasPronunciationLyric,
+    })
+  const useInlineMobileLyrics = karaokeVisible && !isDesktop
+
+  // Pushes a lyric string into the mounted player instance (via `playerRef`)
+  // so the entry matching `trackId` in its `audioLists` state carries the new
+  // lyric, and the player's top-level `lyric` matches the current item.
+  // No-op when `trackId` is empty or the ref does not expose `setState`.
+  const applyLyricToRuntimePlayer = useCallback((trackId, lyric) => {
+    if (!trackId) {
+      return
+    }
+
+    const player = playerRef.current
+    if (!player || typeof player.setState !== 'function') {
+      return
+    }
+
+    player.setState((prevState) => {
+      const prevLists = Array.isArray(prevState.audioLists)
+        ? prevState.audioLists
+        : []
+      // Tracks whether any list entry actually received a different lyric.
+      let changed = false
+      const audioLists = prevLists.map((item) => {
+        if (item.trackId !== trackId) {
+          return item
+        }
+        if (item.lyric === lyric) {
+          return item
+        }
+        changed = true
+        return {
+          ...item,
+          lyric,
+        }
+      })
+
+      // The current item is identified by matching `musicSrc`; if it has no
+      // string lyric, the previous top-level lyric is kept.
+      const currentItem = audioLists.find(
+        (item) => item.musicSrc === prevState.musicSrc,
+      )
+      const currentLyric =
+        typeof currentItem?.lyric === 'string'
+          ? currentItem.lyric
+          : prevState.lyric
+
+      // Returning null from the updater leaves the player state untouched.
+      if (!changed && currentLyric === prevState.lyric) {
+        return null
+      }
+
+      return {
+        audioLists,
+        lyric: currentLyric,
+      }
+    })
+  }, [])

  useEffect(() => {
    if (
@ -166,6 +268,88 @@ const Player = () => {
    return () => window.removeEventListener('beforeunload', handleBeforeUnload)
  }, [playerState, audioInstance])

+  // On track change, reset the selected lyric layers: cleared when there is no
+  // track or the track is a radio stream, otherwise restored from the
+  // in-memory cache when an object entry exists for the track.
+  useEffect(() => {
+    if (!currentTrackId || currentTrackIsRadio) {
+      setSelectedLyricLayers(emptyLyricLayers)
+      return
+    }
+
+    const cached = lyricCacheRef.current.get(currentTrackId)
+    let layers = emptyLyricLayers
+    // String cache entries carry no layer data, so they fall back to empty.
+    if (cached && typeof cached !== 'string') {
+      if (cached.layers) {
+        layers = normalizeLyricLayers(cached.layers)
+      } else if (cached.structuredLyric) {
+        layers = normalizeLyricLayers({
+          main: cached.structuredLyric,
+        })
+      }
+    }
+    setSelectedLyricLayers(layers)
+  }, [currentTrackId, currentTrackIsRadio])
+
+  // Loads lyrics for the current track. Cached entries are applied
+  // immediately; otherwise the structured lyrics are fetched, reduced to
+  // layers, cached, and pushed to both the redux queue and the live player.
+  useEffect(() => {
+    // Each run takes a new request id; responses whose id no longer matches
+    // the ref belong to a previous track and are ignored.
+    lyricRequestIdRef.current += 1
+    const requestId = lyricRequestIdRef.current
+
+    if (!currentTrackId || currentTrackIsRadio) {
+      return
+    }
+
+    const cached = lyricCacheRef.current.get(currentTrackId)
+    if (cached !== undefined) {
+      // A cache entry may be a plain LRC string or an object holding `lrc`
+      // plus either `layers` or a single `structuredLyric`.
+      const cachedLyric =
+        typeof cached === 'string' ? cached : cached?.lrc || ''
+      const cachedLayers =
+        typeof cached === 'string'
+          ? emptyLyricLayers
+          : cached?.layers
+            ? normalizeLyricLayers(cached.layers)
+            : normalizeLyricLayers({ main: cached?.structuredLyric })
+
+      setSelectedLyricLayers(cachedLayers)
+      if (cachedLyric) {
+        dispatch(updateQueueLyric(currentTrackId, cachedLyric))
+        applyLyricToRuntimePlayer(currentTrackId, cachedLyric)
+      }
+      return
+    }
+
+    subsonic
+      .getLyricsBySongId(currentTrackId)
+      .then((resp) => {
+        if (lyricRequestIdRef.current !== requestId) {
+          return
+        }
+
+        const structuredLyrics =
+          resp?.json?.['subsonic-response']?.lyricsList?.structuredLyrics || []
+        const layers = selectLyricLayers(
+          structuredLyrics,
+          getPreferredLyricLanguage(),
+        )
+        const lyric = layers.main ? structuredLyricToLrc(layers.main) : ''
+        // Successful responses are cached even when no lyric was found, so
+        // the request is not repeated for this track.
+        lyricCacheRef.current.set(currentTrackId, {
+          lrc: lyric,
+          layers,
+        })
+        setSelectedLyricLayers(layers)
+
+        if (lyric !== '') {
+          dispatch(updateQueueLyric(currentTrackId, lyric))
+          applyLyricToRuntimePlayer(currentTrackId, lyric)
+        }
+      })
+      .catch(() => {
+        if (lyricRequestIdRef.current !== requestId) {
+          return
+        }
+        setSelectedLyricLayers(emptyLyricLayers)
+        // Do not cache network/request failures as empty lyrics, so we can retry.
+        lyricCacheRef.current.delete(currentTrackId)
+      })
+  }, [dispatch, currentTrackId, currentTrackIsRadio, applyLyricToRuntimePlayer])
+
  const defaultOptions = useMemo(
    () => ({
      theme: playerTheme,
@ -177,7 +361,7 @@ const Player = () => {
      clearPriorAudioLists: false,
      showDestroy: true,
      showDownload: false,
-      showLyric: true,
+      showLyric: false,
      showReload: false,
      toggleMode: !isDesktop,
      glassBg: false,
@ -215,12 +399,26 @@ const Player = () => {
        (playerState.clear || playerState.playIndex === 0),
      clearPriorAudioLists: playerState.clear,
      extendsContent: (
-        <PlayerToolbar id={current.trackId} isRadio={current.isRadio} />
+        <PlayerToolbar
+          id={current.trackId}
+          isRadio={current.isRadio}
+          onToggleLyrics={() =>
+            setKaraokeVisiblePreference((visible) => !visible)
+          }
+          lyricsActive={karaokeVisible}
+          lyricsDisabled={!hasKaraokeLyric}
+        />
      ),
      defaultVolume: isMobilePlayer ? 1 : playerState.volume,
      showMediaSession: !current.isRadio,
    }
-  }, [playerState, defaultOptions, isMobilePlayer])
+  }, [
+    playerState,
+    defaultOptions,
+    isMobilePlayer,
+    karaokeVisible,
+    hasKaraokeLyric,
+  ])

  const onAudioListsChange = useCallback(
    (_, audioLists, audioInfo) => dispatch(syncQueue(audioInfo, audioLists)),
@ -340,10 +538,13 @@ const Player = () => {
  )

  const onCoverClick = useCallback((mode, audioLists, audioInfo) => {
+    if (!isDesktop && karaokeVisible) {
+      return
+    }
    if (mode === 'full' && audioInfo?.song?.albumId) {
      window.location.href = `#/album/${audioInfo.song.albumId}/show`
    }
-  }, [])
+  }, [isDesktop, karaokeVisible])

  const onAudioError = useCallback(
    (error, currentPlayId, audioLists, audioInfo) => {
@ -392,6 +593,7 @@ const Player = () => {
  return (
    <ThemeProvider theme={createMuiTheme(theme)}>
      <ReactJkMusicPlayer
+        ref={playerRef}
        {...options}
        className={classes.player}
        onAudioListsChange={onAudioListsChange}
@ -407,6 +609,55 @@ const Player = () => {
        onBeforeDestroy={onBeforeDestroy}
        getAudioInstance={setAudioInstance}
      />
+      {isDesktop && (
+        <KaraokeLyricsOverlay
+          visible={karaokeVisible}
+          mainLyric={selectedLyricLayers.main}
+          translationLyric={selectedLyricLayers.translation}
+          pronunciationLyric={selectedLyricLayers.pronunciation}
+          showTranslation={showTranslation}
+          showPronunciation={showPronunciation}
+          translationEnabled={hasTranslationLyric}
+          pronunciationEnabled={hasPronunciationLyric}
+          onToggleTranslation={() =>
+            setTranslationPreference((previous) =>
+              hasTranslationLyric ? !previous : false,
+            )
+          }
+          onTogglePronunciation={() =>
+            setPronunciationPreference((previous) =>
+              togglePronunciationPreference(previous, hasPronunciationLyric),
+            )
+          }
+          audioInstance={audioInstance}
+          onClose={() => setKaraokeVisiblePreference(false)}
+        />
+      )}
+      <MobileKaraokeLyricsPortal active={useInlineMobileLyrics}>
+        <KaraokeLyricsOverlay
+          visible={useInlineMobileLyrics}
+          inline={true}
+          mainLyric={selectedLyricLayers.main}
+          translationLyric={selectedLyricLayers.translation}
+          pronunciationLyric={selectedLyricLayers.pronunciation}
+          showTranslation={showTranslation}
+          showPronunciation={showPronunciation}
+          translationEnabled={hasTranslationLyric}
+          pronunciationEnabled={hasPronunciationLyric}
+          onToggleTranslation={() =>
+            setTranslationPreference((previous) =>
+              hasTranslationLyric ? !previous : false,
+            )
+          }
+          onTogglePronunciation={() =>
+            setPronunciationPreference((previous) =>
+              togglePronunciationPreference(previous, hasPronunciationLyric),
+            )
+          }
+          audioInstance={audioInstance}
+          onClose={() => setKaraokeVisiblePreference(false)}
+        />
+      </MobileKaraokeLyricsPortal>
      <GlobalHotKeys handlers={handlers} keyMap={keyMap} allowChanges />
    </ThemeProvider>
  )
--- a/ui/src/audioplayer/Player.lyricsState.test.jsx
+++ b/ui/src/audioplayer/Player.lyricsState.test.jsx
@ -0,0 +1,77 @@
+import {
+  resolveLyricsOverlayState,
+  togglePronunciationPreference,
+} from './lyricsOverlayState'
+
+// Exercises the pure overlay-state helpers directly, simulating a sequence of
+// tracks by calling them with different "has lyric" flags.
+describe('Player lyrics state helpers', () => {
+  it('keeps the lyrics window preference across track changes in the session', () => {
+    const visibleOnCurrentTrack = resolveLyricsOverlayState({
+      karaokeVisiblePreference: true,
+      translationPreference: false,
+      pronunciationPreference: null,
+      hasKaraokeLyric: true,
+      hasTranslationLyric: true,
+      hasPronunciationLyric: true,
+    })
+    expect(visibleOnCurrentTrack.karaokeVisible).toBe(true)
+
+    // Same preference, but the track has no lyrics: the overlay is hidden.
+    const hiddenForTrackWithoutLyrics = resolveLyricsOverlayState({
+      karaokeVisiblePreference: true,
+      translationPreference: false,
+      pronunciationPreference: null,
+      hasKaraokeLyric: false,
+      hasTranslationLyric: false,
+      hasPronunciationLyric: false,
+    })
+    expect(hiddenForTrackWithoutLyrics.karaokeVisible).toBe(false)
+
+    // The untouched preference shows the overlay again once lyrics exist.
+    const restoredOnNextLyricsTrack = resolveLyricsOverlayState({
+      karaokeVisiblePreference: true,
+      translationPreference: false,
+      pronunciationPreference: null,
+      hasKaraokeLyric: true,
+      hasTranslationLyric: false,
+      hasPronunciationLyric: false,
+    })
+    expect(restoredOnNextLyricsTrack.karaokeVisible).toBe(true)
+  })
+
+  it('restores translation and pronunciation preferences after tracks without those layers', () => {
+    // With a null pronunciation preference and an available layer, the
+    // pronunciation layer is expected to be shown; translation is not.
+    const initialState = resolveLyricsOverlayState({
+      karaokeVisiblePreference: false,
+      translationPreference: false,
+      pronunciationPreference: null,
+      hasKaraokeLyric: true,
+      hasTranslationLyric: true,
+      hasPronunciationLyric: true,
+    })
+    expect(initialState.showTranslation).toBe(false)
+    expect(initialState.showPronunciation).toBe(true)
+
+    // Toggling from the null preference is expected to produce `false`.
+    const translationPreference = true
+    const pronunciationPreference = togglePronunciationPreference(null, true)
+    expect(pronunciationPreference).toBe(false)
+
+    const hiddenOnTrackWithoutAuxLayers = resolveLyricsOverlayState({
+      karaokeVisiblePreference: false,
+      translationPreference,
+      pronunciationPreference,
+      hasKaraokeLyric: true,
+      hasTranslationLyric: false,
+      hasPronunciationLyric: false,
+    })
+    expect(hiddenOnTrackWithoutAuxLayers.showTranslation).toBe(false)
+    expect(hiddenOnTrackWithoutAuxLayers.showPronunciation).toBe(false)
+
+    // Once the layers are available again, the stored preferences apply.
+    const restoredOnTrackWithAuxLayers = resolveLyricsOverlayState({
+      karaokeVisiblePreference: false,
+      translationPreference,
+      pronunciationPreference,
+      hasKaraokeLyric: true,
+      hasTranslationLyric: true,
+      hasPronunciationLyric: true,
+    })
+    expect(restoredOnTrackWithAuxLayers.showTranslation).toBe(true)
+    expect(restoredOnTrackWithAuxLayers.showPronunciation).toBe(false)
+  })
+})
--- a/ui/src/audioplayer/PlayerToolbar.jsx
+++ b/ui/src/audioplayer/PlayerToolbar.jsx
@ -4,7 +4,9 @@ import { useGetOne } from 'react-admin'
 import { GlobalHotKeys } from 'react-hotkeys'
 import IconButton from '@material-ui/core/IconButton'
 import { useMediaQuery } from '@material-ui/core'
+import Tooltip from '@material-ui/core/Tooltip'
 import { RiSaveLine } from 'react-icons/ri'
+import { RiFileMusicLine } from 'react-icons/ri'
 import { LoveButton, useToggleLove } from '../common'
 import { openSaveQueueDialog } from '../actions'
 import { keyMap } from '../hotkeys'
@ -55,7 +57,13 @@ const useStyles = makeStyles((theme) => ({
  },
 }))

-const PlayerToolbar = ({ id, isRadio }) => {
+const PlayerToolbar = ({
+  id,
+  isRadio,
+  onToggleLyrics,
+  lyricsActive = false,
+  lyricsDisabled = false,
+}) => {
  const dispatch = useDispatch()
  const { data, loading } = useGetOne('song', id, { enabled: !!id && !isRadio })
  const [toggleLove, toggling] = useToggleLove('song', data)
@ -99,6 +107,25 @@ const PlayerToolbar = ({ id, isRadio }) => {
    />
  )

+  // Lyrics toggle button. It is disabled when no `onToggleLyrics` handler is
+  // supplied or when `lyricsDisabled` is set, and switches to the primary
+  // color while `lyricsActive` is true.
+  // NOTE(review): the <span> wrapper presumably keeps the Tooltip working
+  // while the button is disabled - confirm against the Material-UI docs.
+  const toggleLyricsButton = (
+    <Tooltip title="Toggle lyrics">
+      <span>
+        <IconButton
+          size={isDesktop ? 'small' : undefined}
+          onClick={onToggleLyrics}
+          disabled={!onToggleLyrics || lyricsDisabled}
+          data-testid="toggle-lyrics-button"
+          className={buttonClass}
+          color={lyricsActive ? 'primary' : 'default'}
+        >
+          <RiFileMusicLine
+            className={!isDesktop ? classes.mobileIcon : undefined}
+          />
+        </IconButton>
+      </span>
+    </Tooltip>
+  )
+
  return (
    <>
      <GlobalHotKeys keyMap={keyMap} handlers={handlers} allowChanges />
@ -106,11 +133,13 @@ const PlayerToolbar = ({ id, isRadio }) => {
        <li className={`${listItemClass} item`}>
          {saveQueueButton}
          {loveButton}
+          {toggleLyricsButton}
        </li>
      ) : (
        <>
          <li className={`${listItemClass} item`}>{saveQueueButton}</li>
          <li className={`${listItemClass} item`}>{loveButton}</li>
+          <li className={`${listItemClass} item`}>{toggleLyricsButton}</li>
        </>
      )}
    </>
--- a/ui/src/audioplayer/PlayerToolbar.test.jsx
+++ b/ui/src/audioplayer/PlayerToolbar.test.jsx
@ -71,6 +71,7 @@ describe('<PlayerToolbar />', () => {
      // Verify both buttons are rendered
      expect(screen.getByTestId('save-queue-button')).toBeInTheDocument()
      expect(screen.getByTestId('love-button')).toBeInTheDocument()
+      expect(screen.getByTestId('toggle-lyrics-button')).toBeInTheDocument()

      // Verify desktop classes are applied
      expect(listItems[0].className).toContain('toolbar')
@ -102,6 +103,14 @@ describe('<PlayerToolbar />', () => {
        type: 'OPEN_SAVE_QUEUE_DIALOG',
      })
    })
+
+    it('triggers lyric toggle callback when lyrics button is clicked', () => {
+      const onToggleLyrics = vi.fn()
+      render(<PlayerToolbar id="song-1" onToggleLyrics={onToggleLyrics} />)
+
+      fireEvent.click(screen.getByTestId('toggle-lyrics-button'))
+      expect(onToggleLyrics).toHaveBeenCalledTimes(1)
+    })
  })

  describe('Mobile layout', () => {
@ -114,11 +123,12 @@ describe('<PlayerToolbar />', () => {

      // Each button should be in its own list item
      const listItems = screen.getAllByRole('listitem')
-      expect(listItems).toHaveLength(2)
+      expect(listItems).toHaveLength(3)

      // Verify both buttons are rendered
      expect(screen.getByTestId('save-queue-button')).toBeInTheDocument()
      expect(screen.getByTestId('love-button')).toBeInTheDocument()
+      expect(screen.getByTestId('toggle-lyrics-button')).toBeInTheDocument()

      // Verify mobile classes are applied
      expect(listItems[0].className).toContain('mobileListItem')
@ -140,6 +150,13 @@ describe('<PlayerToolbar />', () => {
      const loveButton = screen.getByTestId('love-button')
      expect(loveButton).toBeDisabled()
    })
+
+    it('disables lyrics button when lyrics are unavailable', () => {
+      render(<PlayerToolbar id="song-1" lyricsDisabled={true} />)
+
+      const lyricsButton = screen.getByTestId('toggle-lyrics-button')
+      expect(lyricsButton).toBeDisabled()
+    })
  })

  describe('Common behavior', () => {
--- a/ui/src/audioplayer/lyrics.js
+++ b/ui/src/audioplayer/lyrics.js
@ -0,0 +1,725 @@
+// Normalizes a language tag for comparison: lower-cases it and converts every
+// underscore separator to a hyphen (e.g. `zh_Hant_TW` -> `zh-hant-tw`).
+// A missing or empty tag yields an empty string.
+const normalizeLanguageTag = (language) =>
+  // A string pattern would replace only the first underscore, so use a
+  // global regex to cover tags with more than one separator.
+  (language || '').toLowerCase().replace(/_/g, '-')
+
+// Timing tolerance in milliseconds (about three frames at 60fps, ~16.7 ms
+// each); keeps line/token switching stable near tight boundaries.
+const KARAOKE_SWITCH_EPSILON_MS = 50
+// Lyric layer kinds recognized by normalizeLyricKind.
+const LYRIC_KIND_MAIN = 'main'
+const LYRIC_KIND_TRANSLATION = 'translation'
+const LYRIC_KIND_PRONUNCIATION = 'pronunciation'
+
+// Renders a value as text, prefixing a single-character result with '0'.
+const padTime = (value) => {
+  const text = value.toString()
+  if (text.length !== 1) {
+    return text
+  }
+  return '0' + text
+}
+
+// Converts a raw timing value to a finite number. Returns null when the value
+// is absent (null, undefined, or empty string) or does not coerce to a
+// finite number.
+const toTime = (value) => {
+  const isMissing = value == null || value === ''
+  const parsed = isMissing ? NaN : Number(value)
+  return Number.isFinite(parsed) ? parsed : null
+}
+
+// Converts a raw value to a non-negative integer byte offset. Returns null
+// when the value is absent (null, undefined, empty string), fractional,
+// negative, or not numeric.
+const toByteOffset = (value) => {
+  if (value === null || value === undefined || value === '') {
+    return null
+  }
+  const offset = Number(value)
+  return Number.isInteger(offset) && offset >= 0 ? offset : null
+}
+
+// Sort comparator for times that may be null/undefined: present values are
+// ordered ascending and missing values sort after every present value.
+const compareNullableTime = (a, b) => {
+  const aMissing = a == null
+  const bMissing = b == null
+  if (aMissing || bMissing) {
+    // 0 when both are missing, 1 when only `a` is, -1 when only `b` is.
+    return Number(aMissing) - Number(bMissing)
+  }
+  return a - b
+}
+
+// Returns a new array of tokens ordered by start time, then end time, with
+// the original position as the final tie-breaker. Missing times sort last
+// (see compareNullableTime). The input array is not mutated.
+const sortTokensByStart = (tokens) =>
+  tokens
+    // Tag each copy with its original position for the tie-break below.
+    .map((token, order) => ({ ...token, order }))
+    .sort((a, b) => {
+      const byStart = compareNullableTime(a.start, b.start)
+      if (byStart !== 0) {
+        return byStart
+      }
+      const byEnd = compareNullableTime(a.end, b.end)
+      if (byEnd !== 0) {
+        return byEnd
+      }
+      return a.order - b.order
+    })
+    // Strip the temporary `order` field from the result.
+    .map(({ order, ...token }) => token)
+
+// Reports whether two language tags are compatible: identical, or one is the
+// other followed by a `-` subtag (so `en` matches `en-us` in either order).
+// Always false when either tag is empty or missing.
+const languageMatch = (candidate, preferred) => {
+  const bothPresent = Boolean(candidate) && Boolean(preferred)
+  if (!bothPresent) {
+    return false
+  }
+  const extendsTag = (tag, base) => tag.startsWith(`${base}-`)
+  return (
+    candidate === preferred ||
+    extendsTag(candidate, preferred) ||
+    extendsTag(preferred, candidate)
+  )
+}
+
+// Reports whether a lyric is flagged as synced and has at least one line with
+// a usable start time. Start values are read through `toTime`, so a missing
+// start (`null` or `''`) is not mistaken for a timestamp of 0, matching how
+// the rest of this module interprets timing fields.
+const hasTimedLines = (lyric) =>
+  Boolean(
+    lyric &&
+    lyric.synced &&
+    Array.isArray(lyric.line) &&
+    lyric.line.some((line) => toTime(line?.start) != null),
+  )
+
+// Narrows a list of lyrics to the timed ones when any exist; otherwise the
+// original list is returned unchanged.
+const preferTimedLyrics = (lyrics) => {
+  const timedOnly = lyrics.filter(hasTimedLines)
+  if (timedOnly.length === 0) {
+    return lyrics
+  }
+  return timedOnly
+}
+
+// Normalizes a raw cue token into `{ start, end, value }`, adding `byteStart`
+// and `byteEnd` only when they are valid byte offsets. Returns null for a
+// missing token or one whose `value` is not a non-empty string.
+const normalizeToken = (token) => {
+  if (!token || typeof token.value !== 'string' || token.value === '') {
+    return null
+  }
+
+  const normalized = {
+    start: toTime(token.start),
+    end: toTime(token.end),
+    value: token.value,
+  }
+
+  const byteStart = toByteOffset(token.byteStart)
+  if (byteStart != null) {
+    normalized.byteStart = byteStart
+  }
+
+  const byteEnd = toByteOffset(token.byteEnd)
+  if (byteEnd != null) {
+    normalized.byteEnd = byteEnd
+  }
+
+  return normalized
+}
+
+// Number of bytes (1-4) needed to encode a code point in UTF-8.
+const utf8BytesForCodePoint = (codePoint) => {
+  // Largest code point representable with 1, 2 and 3 bytes respectively.
+  const upperBounds = [0x7f, 0x7ff, 0xffff]
+  const slot = upperBounds.findIndex((bound) => codePoint <= bound)
+  return slot === -1 ? 4 : slot + 1
+}
+
+// Maps a UTF-8 byte offset within `text` to the corresponding JavaScript
+// string index (UTF-16 code units).
+// Returns 0 when `text` is not a non-empty string or the offset is missing,
+// invalid, or not positive; returns `text.length` when the offset lies at or
+// beyond the end of the encoded text. An offset that falls inside a
+// multi-byte character resolves to the index just after that character.
+export const utf8ByteOffsetToCodeUnitIndex = (text, targetByteOffset) => {
+  if (typeof text !== 'string' || text.length === 0) {
+    return 0
+  }
+
+  const target = toByteOffset(targetByteOffset)
+  if (target == null || target <= 0) {
+    return 0
+  }
+
+  // Walk the string one code point at a time, tracking the UTF-8 byte offset
+  // at which each code point starts.
+  let byteOffset = 0
+  let index = 0
+  while (index < text.length) {
+    if (byteOffset >= target) {
+      return index
+    }
+    const codePoint = text.codePointAt(index)
+    byteOffset += utf8BytesForCodePoint(codePoint)
+    // Code points above U+FFFF occupy two UTF-16 code units.
+    index += codePoint > 0xffff ? 2 : 1
+  }
+
+  return text.length
+}
+
+// Converts a UTF-8 byte range within `text` into a string index range plus
+// the matching substring: `{ start, end, text }`, where `end` is exclusive.
+// `byteEnd` is treated as inclusive (the end index is resolved from
+// `byteEnd + 1`).
+// Returns null when `text` is not a string, either offset is invalid,
+// `byteEnd < byteStart`, or the resolved range is empty.
+export const utf8ByteRangeToCodeUnitRange = (text, byteStart, byteEnd) => {
+  if (typeof text !== 'string') {
+    return null
+  }
+
+  const start = toByteOffset(byteStart)
+  const end = toByteOffset(byteEnd)
+  if (start == null || end == null || end < start) {
+    return null
+  }
+
+  const startIndex = utf8ByteOffsetToCodeUnitIndex(text, start)
+  const endIndex = utf8ByteOffsetToCodeUnitIndex(text, end + 1)
+  // Reject empty ranges, e.g. when the start offset is past the end of text.
+  if (
+    startIndex >= endIndex ||
+    startIndex > text.length ||
+    endIndex > text.length
+  ) {
+    return null
+  }
+
+  return {
+    start: startIndex,
+    end: endIndex,
+    text: text.slice(startIndex, endIndex),
+  }
+}
+
+// Indexes a structured lyric's `agents` by id into a Map of
+// `{ id, role, name }` records. Entries without a non-empty string id are
+// skipped, the first entry wins on duplicate ids, and non-string `role` or
+// `name` values become empty strings.
+const buildAgentLookup = (structuredLyric) => {
+  const asText = (value) => (typeof value === 'string' ? value : '')
+  const byId = new Map()
+  if (!Array.isArray(structuredLyric?.agents)) {
+    return byId
+  }
+
+  structuredLyric.agents.forEach((agent) => {
+    const id = asText(agent?.id)
+    if (id && !byId.has(id)) {
+      byId.set(id, {
+        id,
+        role: asText(agent?.role),
+        name: asText(agent?.name),
+      })
+    }
+  })
+  return byId
+}
+
+// Role label to expose for an agent: its `role`, except that a missing role
+// or the 'main' role maps to an empty string.
+const deriveUiRole = (agent) => {
+  const role = agent?.role
+  return role && role !== 'main' ? role : ''
+}
+
+// Normalizes one raw cue line into a flat record with numeric timing, the
+// resolved agent details, and its tokens sorted by start time.
+// `fallbackIndex` is used when the cue line has no numeric `index`;
+// `agentLookup` is the Map produced by buildAgentLookup.
+const normalizeCueLine = (cueLine, fallbackIndex, agentLookup) => {
+  // NOTE(review): Number(null) and Number('') are 0, so a null/empty index
+  // resolves to 0 here rather than to `fallbackIndex` - confirm intended.
+  const index = Number.isFinite(Number(cueLine?.index))
+    ? Number(cueLine.index)
+    : fallbackIndex
+  const agentId = typeof cueLine?.agentId === 'string' ? cueLine.agentId : ''
+  const agent = agentId ? agentLookup.get(agentId) || null : null
+  // Role carried on the cue line itself, used when no agent is resolved.
+  const fallbackRole = typeof cueLine?.role === 'string' ? cueLine.role : ''
+  const tokens = sortTokensByStart(
+    Array.isArray(cueLine?.cue)
+      ? cueLine.cue.map(normalizeToken).filter(Boolean)
+      : [],
+  )
+
+  return {
+    index,
+    start: toTime(cueLine?.start),
+    end: toTime(cueLine?.end),
+    value: typeof cueLine?.value === 'string' ? cueLine.value : '',
+    // `role` hides the 'main' agent role (see deriveUiRole); `agentRole`
+    // keeps the agent's role as-is.
+    role: agent ? deriveUiRole(agent) : fallbackRole,
+    agentId,
+    agentRole: agent?.role || fallbackRole,
+    agentName: agent?.name || '',
+    tokens,
+  }
+}
+
+// Maps a raw `kind` value (case-insensitive, surrounding whitespace ignored)
+// to one of the LYRIC_KIND_* constants; anything that is not a translation or
+// pronunciation kind is treated as the main lyric.
+const normalizeLyricKind = (kind) => {
+  const key = (kind || '').toLowerCase().trim()
+  if (key === LYRIC_KIND_TRANSLATION || key === LYRIC_KIND_PRONUNCIATION) {
+    return key
+  }
+  return LYRIC_KIND_MAIN
+}
+
+// Chooses one lyric from a list by language. Preference order: a match for
+// the full preferred tag, then for its base language, then for English, and
+// finally the first lyric in the list. Returns null for a missing or empty
+// list.
+const pickLyricByLanguage = (lyrics, preferredLanguage) => {
+  if (!Array.isArray(lyrics) || lyrics.length === 0) {
+    return null
+  }
+
+  const preferred = normalizeLanguageTag(preferredLanguage)
+  const preferredBase = preferred.split('-')[0]
+
+  for (const wanted of [preferred, preferredBase, 'en']) {
+    const match = lyrics.find((lyric) =>
+      languageMatch(normalizeLanguageTag(lyric.lang), wanted),
+    )
+    if (match) {
+      return match
+    }
+  }
+  return lyrics[0]
+}
+
+const lineTimeWindow = (lines, index) => {
+  const line = lines[index]
+  if (!line) {
+    return { start: null, end: null }
+  }
+
+  const start = toTime(line.start)
+  const end = toTime(line.end) ?? toTime(lines[index + 1]?.start)
+  return { start, end }
+}
+
+export const hasCueTiming = (structuredLyric) =>
+  Boolean(
+    structuredLyric &&
+    Array.isArray(structuredLyric.cueLine) &&
+    structuredLyric.cueLine.some(
+      (cueLine) =>
+        Array.isArray(cueLine?.cue) &&
+        cueLine.cue.some((cue) => Number.isFinite(Number(cue?.start))),
+    ),
+  )
+
+export const hasStructuredLyricContent = (structuredLyric) =>
+  Boolean(
+    structuredLyric &&
+    ((Array.isArray(structuredLyric.line) &&
+      structuredLyric.line.some(
+        (line) => typeof line?.value === 'string' && line.value.trim() !== '',
+      )) ||
+      hasCueTiming(structuredLyric)),
+  )
+
+export const getPreferredLyricLanguage = () => {
+  if (typeof window !== 'undefined' && window.localStorage) {
+    const stored = window.localStorage.getItem('locale')
+    if (stored) {
+      return stored
+    }
+  }
+  if (typeof navigator !== 'undefined' && navigator.language) {
+    return navigator.language
+  }
+  return 'en'
+}
+
+export const selectLyricLayers = (structuredLyrics, preferredLanguage) => {
+  if (!Array.isArray(structuredLyrics)) {
+    return {
+      main: null,
+      translation: null,
+      pronunciation: null,
+    }
+  }
+
+  const available = structuredLyrics.filter(hasStructuredLyricContent)
+  if (available.length === 0) {
+    return {
+      main: null,
+      translation: null,
+      pronunciation: null,
+    }
+  }
+
+  const grouped = {
+    [LYRIC_KIND_MAIN]: [],
+    [LYRIC_KIND_TRANSLATION]: [],
+    [LYRIC_KIND_PRONUNCIATION]: [],
+  }
+
+  for (const lyric of available) {
+    grouped[normalizeLyricKind(lyric?.kind)].push(lyric)
+  }
+
+  const mainCandidates = grouped[LYRIC_KIND_MAIN].length
+    ? grouped[LYRIC_KIND_MAIN]
+    : available
+
+  return {
+    main: pickLyricByLanguage(
+      preferTimedLyrics(mainCandidates),
+      preferredLanguage,
+    ),
+    translation: pickLyricByLanguage(
+      preferTimedLyrics(grouped[LYRIC_KIND_TRANSLATION]),
+      preferredLanguage,
+    ),
+    pronunciation: pickLyricByLanguage(
+      preferTimedLyrics(grouped[LYRIC_KIND_PRONUNCIATION]),
+      preferredLanguage,
+    ),
+  }
+}
+
+export const pickStructuredLyric = (structuredLyrics, preferredLanguage) =>
+  selectLyricLayers(structuredLyrics, preferredLanguage).main
+
+export const structuredLyricToLrc = (structuredLyric) => {
+  if (!structuredLyric || !Array.isArray(structuredLyric.line)) {
+    return ''
+  }
+
+  let lyricText = ''
+  for (const line of structuredLyric.line) {
+    const start = Number(line.start)
+    if (!Number.isFinite(start) || start < 0) {
+      continue
+    }
+
+    let time = Math.floor(start / 10)
+    const ms = time % 100
+    time = Math.floor(time / 100)
+    const sec = time % 60
+    time = Math.floor(time / 60)
+    const min = time % 60
+
+    lyricText += `[${padTime(min)}:${padTime(sec)}.${padTime(ms)}] ${line.value || ''}\n`
+  }
+  return lyricText
+}
+
+export const structuredLyricsToLrc = (structuredLyrics, preferredLanguage) => {
+  const selected = pickStructuredLyric(structuredLyrics, preferredLanguage)
+  if (!selected) {
+    return ''
+  }
+  return structuredLyricToLrc(selected)
+}
+
+const buildBaseKaraokeLines = (baseLines) =>
+  baseLines.map((line, index) => ({
+    index,
+    start: toTime(line.start),
+    end: toTime(line.end),
+    value: typeof line.value === 'string' ? line.value : '',
+    tokens: [],
+  }))
+
+export const buildKaraokeLinesFromCueLines = (
+  rawCueLines,
+  baseLines,
+  agentLookup,
+) => {
+  const normalizedCueLines = rawCueLines.map((cueLine, fallbackIndex) => {
+    const normalized = normalizeCueLine(cueLine, fallbackIndex, agentLookup)
+    return {
+      ...normalized,
+      tokens: normalized.tokens.map((token) => ({
+        ...token,
+        role: normalized.role,
+        agentId: normalized.agentId,
+        agentName: normalized.agentName,
+        agentRole: normalized.agentRole,
+      })),
+    }
+  })
+
+  const byIndex = new Map()
+  for (const cueLine of normalizedCueLines) {
+    if (!byIndex.has(cueLine.index)) {
+      byIndex.set(cueLine.index, [])
+    }
+    byIndex.get(cueLine.index).push(cueLine)
+  }
+
+  return Array.from(byIndex.entries()).map(([index, group]) => {
+    const first = group[0]
+    const baseLine = baseLines[index] || {}
+    const tokens = sortTokensByStart(group.flatMap((cueLine) => cueLine.tokens))
+    const fallbackStart =
+      tokens.find((token) => token.start != null)?.start ?? null
+    const fallbackEnd =
+      [...tokens].reverse().find((token) => token.end != null)?.end ?? null
+    const value =
+      first.value ||
+      (typeof baseLine.value === 'string' ? baseLine.value : '') ||
+      tokens.map((token) => token.value).join('')
+
+    return {
+      index,
+      start: first.start ?? toTime(baseLine.start) ?? fallbackStart,
+      end: first.end ?? toTime(baseLine.end) ?? fallbackEnd,
+      value,
+      agentId: first.agentId,
+      agentName: first.agentName,
+      agentRole: first.agentRole,
+      tokens,
+    }
+  })
+}
+
+export const buildKaraokeLines = (structuredLyric) => {
+  if (!structuredLyric) {
+    return []
+  }
+
+  const agentLookup = buildAgentLookup(structuredLyric)
+  const baseLines = Array.isArray(structuredLyric.line)
+    ? structuredLyric.line
+    : []
+  const rawCueLines = Array.isArray(structuredLyric.cueLine)
+    ? structuredLyric.cueLine
+    : []
+
+  const lines =
+    rawCueLines.length > 0
+      ? buildKaraokeLinesFromCueLines(rawCueLines, baseLines, agentLookup)
+      : buildBaseKaraokeLines(baseLines)
+
+  const normalized = lines
+    .filter((line) => line.value || line.tokens.length > 0)
+    .sort((a, b) => {
+      if (a.start == null && b.start == null) {
+        return a.index - b.index
+      }
+      if (a.start == null) {
+        return 1
+      }
+      if (b.start == null) {
+        return -1
+      }
+      if (a.start !== b.start) {
+        return a.start - b.start
+      }
+      return a.index - b.index
+    })
+
+  for (let i = 0; i < normalized.length; i += 1) {
+    if (normalized[i].end == null) {
+      const nextStart = normalized[i + 1]?.start
+      if (nextStart != null) {
+        normalized[i].end = nextStart
+      }
+    }
+  }
+
+  return normalized
+}
+
+// Resolves the { start, end } highlight window of one token in a karaoke line.
+//
+// line:            karaoke line with `start`, `end` and a `tokens` array.
+// tokenIndex:      position of the token inside line.tokens.
+// lineEndFallback: used as the line end when the line has none of its own.
+//
+// Explicit token times are preferred. Missing values fall back to
+// neighbouring tokens, then to an even split of the line window, then to the
+// line bounds. Returns { start: null, end: null } when the token is missing.
+export const resolveKaraokeTokenWindow = (
+  line,
+  tokenIndex,
+  lineEndFallback = null,
+) => {
+  const tokens = Array.isArray(line?.tokens) ? line.tokens : []
+  const token = tokens[tokenIndex]
+  if (!token) {
+    return { start: null, end: null }
+  }
+
+  const prevToken = tokenIndex > 0 ? tokens[tokenIndex - 1] : null
+  const nextToken =
+    tokenIndex + 1 < tokens.length ? tokens[tokenIndex + 1] : null
+
+  const lineStart = toTime(line?.start)
+  const lineEnd = toTime(line?.end) ?? toTime(lineEndFallback)
+  const tokenCount = tokens.length
+  // A usable line window needs both bounds and a positive duration.
+  const hasLineWindow =
+    lineStart != null &&
+    lineEnd != null &&
+    Number.isFinite(lineStart) &&
+    Number.isFinite(lineEnd) &&
+    lineEnd > lineStart
+  // Estimated bounds: the line window divided evenly between all tokens.
+  const estimatedStart =
+    hasLineWindow && tokenCount > 0
+      ? lineStart + ((lineEnd - lineStart) * tokenIndex) / tokenCount
+      : null
+  const estimatedEnd =
+    hasLineWindow && tokenCount > 0
+      ? lineStart + ((lineEnd - lineStart) * (tokenIndex + 1)) / tokenCount
+      : null
+
+  // Count how many tokens carry explicit times and how many distinct values
+  // those times take, to detect timings that were collapsed onto a few values.
+  let explicitStartCount = 0
+  let explicitEndCount = 0
+  const uniqueStarts = new Set()
+  const uniqueEnds = new Set()
+
+  for (let i = 0; i < tokenCount; i += 1) {
+    const explicitStart = toTime(tokens[i]?.start)
+    if (explicitStart != null) {
+      explicitStartCount += 1
+      uniqueStarts.add(explicitStart)
+    }
+
+    const explicitEnd = toTime(tokens[i]?.end)
+    if (explicitEnd != null) {
+      explicitEndCount += 1
+      uniqueEnds.add(explicitEnd)
+    }
+  }
+
+  // "Collapsed" means several explicit values exist but the number of distinct
+  // ones is at most max(1, tokenCount / 4).
+  const collapsedStarts =
+    explicitStartCount > 1 && uniqueStarts.size <= Math.max(1, tokenCount / 4)
+  const collapsedEnds =
+    explicitEndCount > 1 && uniqueEnds.size <= Math.max(1, tokenCount / 4)
+  const shouldForceEstimated =
+    hasLineWindow && tokenCount > 1 && (collapsedStarts || collapsedEnds)
+
+  // Collapsed timings are ignored entirely in favour of the even split.
+  if (shouldForceEstimated) {
+    return {
+      start: estimatedStart,
+      end: estimatedEnd,
+    }
+  }
+  const prevEnd = toTime(prevToken?.end) ?? toTime(prevToken?.start)
+
+  // Start: explicit value, else where the previous token ended (or started),
+  // else the estimate, else the line start.
+  let start = toTime(token.start)
+  if (start == null) {
+    start = prevEnd ?? estimatedStart ?? lineStart
+  }
+
+  // End: explicit value, else the next token's start, else the estimate,
+  // else the line end.
+  let end = toTime(token.end)
+  if (end == null) {
+    const nextDirectStart = toTime(nextToken?.start)
+    const nextEstimatedStart =
+      hasLineWindow && tokenIndex + 1 < tokenCount
+        ? lineStart + ((lineEnd - lineStart) * (tokenIndex + 1)) / tokenCount
+        : null
+    end = nextDirectStart ?? nextEstimatedStart ?? estimatedEnd ?? lineEnd
+  }
+
+  // A lone token with a missing or near-zero window (end within 1 of start)
+  // spans the whole line instead.
+  if (
+    tokenCount === 1 &&
+    hasLineWindow &&
+    (start == null || end == null || end <= start + 1)
+  ) {
+    start = lineStart
+    end = lineEnd
+  }
+
+  // Never return a window that ends before it starts.
+  if (start != null && end != null && end < start) {
+    end = start
+  }
+
+  return { start, end }
+}
+
+// Finds the line and token that are active at `currentTimeMs`.
+//
+// lines:         karaoke lines in playback order.
+// currentTimeMs: playback position; a value that is not a finite number is
+//                treated as 0.
+//
+// Returns { lineIndex, tokenIndex }. Both are -1 when there are no lines;
+// tokenIndex is -1 when no token of the active line has started yet.
+// Comparisons allow a tolerance of KARAOKE_SWITCH_EPSILON_MS.
+export const getActiveKaraokeState = (lines, currentTimeMs) => {
+  if (!Array.isArray(lines) || lines.length === 0) {
+    return { lineIndex: -1, tokenIndex: -1 }
+  }
+
+  const current = Number.isFinite(Number(currentTimeMs))
+    ? Number(currentTimeMs)
+    : 0
+  // Pass 1: advance to the last line that has started (or has no start time),
+  // stopping at the first line that starts later than the current time.
+  let lineIndex = 0
+  for (let i = 0; i < lines.length; i += 1) {
+    const lineStart = toTime(lines[i]?.start)
+    if (lineStart == null || lineStart <= current + KARAOKE_SWITCH_EPSILON_MS) {
+      lineIndex = i
+      continue
+    }
+    break
+  }
+
+  // Pass 2: walk backwards from that line to the nearest one whose window
+  // still contains the current time. A line with no known end counts as
+  // containing it. If nothing qualifies, the result of pass 1 is kept.
+  for (let i = lineIndex; i >= 0; i -= 1) {
+    const lineStart = toTime(lines[i]?.start)
+    const lineEnd = toTime(lines[i]?.end) ?? toTime(lines[i + 1]?.start)
+    if (lineStart != null && current + KARAOKE_SWITCH_EPSILON_MS < lineStart) {
+      continue
+    }
+    if (lineEnd == null || current <= lineEnd + KARAOKE_SWITCH_EPSILON_MS) {
+      lineIndex = i
+      break
+    }
+  }
+
+  const activeLine = lines[lineIndex] || null
+  const tokens = Array.isArray(activeLine?.tokens) ? activeLine.tokens : []
+  // Token scan: keep the last token that has started; stop as soon as the
+  // current time falls inside a token's window or a token starts later.
+  let tokenIndex = -1
+  for (let i = 0; i < tokens.length; i += 1) {
+    const { start: tokenStart, end: tokenEnd } = resolveKaraokeTokenWindow(
+      activeLine,
+      i,
+      lines[lineIndex + 1]?.start,
+    )
+    if (
+      tokenStart == null ||
+      tokenStart <= current + KARAOKE_SWITCH_EPSILON_MS
+    ) {
+      tokenIndex = i
+      if (tokenEnd != null && current <= tokenEnd + KARAOKE_SWITCH_EPSILON_MS) {
+        break
+      }
+      continue
+    }
+    break
+  }
+
+  return { lineIndex, tokenIndex }
+}
+
+export const hasUsableKaraokeTiming = (lines) =>
+  Array.isArray(lines) &&
+  lines.some(
+    (line) =>
+      toTime(line?.start) != null ||
+      (Array.isArray(line?.tokens) &&
+        line.tokens.some(
+          (token) => toTime(token?.start) != null || toTime(token?.end) != null,
+        )),
+  )
+
+export const findLayerLineIndexForMain = (mainLines, layerLines, mainIndex) => {
+  if (
+    !Array.isArray(mainLines) ||
+    !Array.isArray(layerLines) ||
+    mainLines.length === 0 ||
+    layerLines.length === 0 ||
+    mainIndex < 0 ||
+    mainIndex >= mainLines.length
+  ) {
+    return -1
+  }
+
+  const { start: mainStart, end: mainEnd } = lineTimeWindow(
+    mainLines,
+    mainIndex,
+  )
+
+  if (mainStart == null) {
+    return -1
+  }
+  const mainWindowEnd = mainEnd ?? mainStart
+  const mainWindowDuration = Math.max(0, mainWindowEnd - mainStart)
+  const maxDelta = Math.max(550, Math.min(1400, mainWindowDuration + 420))
+
+  let bestIdx = -1
+  let bestScore = Number.POSITIVE_INFINITY
+
+  for (let i = 0; i < layerLines.length; i += 1) {
+    const { start, end } = lineTimeWindow(layerLines, i)
+
+    if (start != null && end != null) {
+      const overlap = Math.min(end, mainEnd ?? end) - Math.max(start, mainStart)
+      if (overlap >= 0) {
+        const score = Math.abs(start - mainStart) + Math.abs(i - mainIndex) * 30
+        if (score < bestScore) {
+          bestScore = score
+          bestIdx = i
+        }
+        continue
+      }
+    }
+
+    if (start != null) {
+      if (Math.abs(start - mainStart) > maxDelta) {
+        continue
+      }
+      const score = Math.abs(start - mainStart) + Math.abs(i - mainIndex) * 45
+      if (score < bestScore) {
+        bestScore = score
+        bestIdx = i
+      }
+    }
+  }
+
+  return bestIdx
+}
+
+export const resolveLayerLineForMain = (mainLines, layerLines, mainIndex) => {
+  const index = findLayerLineIndexForMain(mainLines, layerLines, mainIndex)
+  return {
+    index,
+    line: index >= 0 ? layerLines[index] : null,
+  }
+}
+
+export const buildHighlightedMainLine = (line) => line
+
+export const buildHighlightedAuxLine = (_referenceLine, auxiliaryLine) =>
+  auxiliaryLine ?? null
--- a/ui/src/audioplayer/lyrics.test.js
+++ b/ui/src/audioplayer/lyrics.test.js
@ -0,0 +1,786 @@
+import {
+  buildHighlightedAuxLine,
+  buildHighlightedMainLine,
+  buildKaraokeLines,
+  buildKaraokeLinesFromCueLines,
+  findLayerLineIndexForMain,
+  getActiveKaraokeState,
+  getPreferredLyricLanguage,
+  hasUsableKaraokeTiming,
+  hasStructuredLyricContent,
+  pickStructuredLyric,
+  resolveKaraokeTokenWindow,
+  resolveLayerLineForMain,
+  selectLyricLayers,
+  structuredLyricsToLrc,
+  structuredLyricToLrc,
+  utf8ByteOffsetToCodeUnitIndex,
+  utf8ByteRangeToCodeUnitRange,
+} from './lyrics'
+
+describe('lyrics helpers', () => {
+  // Start every test with empty localStorage so a stored 'locale' from one
+  // test cannot influence another.
+  beforeEach(() => {
+    localStorage.clear()
+  })
+
+  // An exact language match wins even when another track is listed first.
+  it('prefers a lyric track that matches the locale', () => {
+    const selected = pickStructuredLyric(
+      [
+        {
+          lang: 'eng',
+          synced: true,
+          line: [{ start: 1000, value: 'English line' }],
+        },
+        {
+          lang: 'pt-BR',
+          synced: true,
+          line: [{ start: 1000, value: 'Linha em portugues' }],
+        },
+      ],
+      'pt-BR',
+    )
+
+    expect(selected.lang).toBe('pt-BR')
+  })
+
+  // No track matches 'pt-BR' or 'pt', so the English track ('eng') is chosen
+  // over the German one.
+  it('falls back to english when preferred locale is not available', () => {
+    const selected = pickStructuredLyric(
+      [
+        {
+          lang: 'eng',
+          synced: true,
+          line: [{ start: 1000, value: 'English line' }],
+        },
+        {
+          lang: 'deu',
+          synced: true,
+          line: [{ start: 1000, value: 'Deutsche Zeile' }],
+        },
+      ],
+      'pt-BR',
+    )
+
+    expect(selected.lang).toBe('eng')
+  })
+
+  // Neither the preferred language nor English is available, so the first
+  // listed track is returned.
+  it('falls back to first synced track when english is missing', () => {
+    const selected = pickStructuredLyric(
+      [
+        {
+          lang: 'jpn',
+          synced: true,
+          line: [{ start: 1000, value: 'Nihongo' }],
+        },
+        {
+          lang: 'deu',
+          synced: true,
+          line: [{ start: 1000, value: 'Deutsch' }],
+        },
+      ],
+      'pt-BR',
+    )
+
+    expect(selected.lang).toBe('jpn')
+  })
+
+  // Each track lands in the layer named by its `kind`; with one candidate per
+  // layer, that candidate is selected regardless of the preferred language.
+  it('selects translation and pronunciation layers by kind', () => {
+    const layers = selectLyricLayers(
+      [
+        {
+          kind: 'main',
+          lang: 'ja',
+          synced: true,
+          line: [{ start: 1000, value: 'こんにちは' }],
+        },
+        {
+          kind: 'translation',
+          lang: 'es',
+          synced: true,
+          line: [{ start: 1000, value: 'Hola' }],
+        },
+        {
+          kind: 'pronunciation',
+          lang: 'ja-Latn',
+          synced: true,
+          line: [{ start: 1000, value: 'konnichiwa' }],
+        },
+      ],
+      'es-MX',
+    )
+
+    expect(layers.main.lang).toBe('ja')
+    expect(layers.translation.lang).toBe('es')
+    expect(layers.pronunciation.lang).toBe('ja-Latn')
+  })
+
+  // A track without `kind` is classified as main; the other layers stay null.
+  it('treats missing kind as main for backward compatibility', () => {
+    const layers = selectLyricLayers(
+      [
+        {
+          lang: 'eng',
+          synced: true,
+          line: [{ start: 1000, value: 'Main' }],
+        },
+      ],
+      'eng',
+    )
+
+    expect(layers.main.lang).toBe('eng')
+    expect(layers.translation).toBeNull()
+    expect(layers.pronunciation).toBeNull()
+  })
+
+  // An untimed track with text is still usable as the main layer when it is
+  // the only candidate.
+  it('falls back to unsynced lyric content when no timed track exists', () => {
+    const layers = selectLyricLayers(
+      [
+        {
+          lang: 'eng',
+          synced: false,
+          line: [{ value: 'Plain embedded lyric' }],
+        },
+      ],
+      'eng',
+    )
+
+    expect(layers.main).toEqual({
+      lang: 'eng',
+      synced: false,
+      line: [{ value: 'Plain embedded lyric' }],
+    })
+  })
+
+  // Same language, one untimed and one timed track: the timed track wins even
+  // though the untimed one is listed first.
+  it('still prefers timed lyrics when both timed and untimed tracks exist', () => {
+    const layers = selectLyricLayers(
+      [
+        {
+          lang: 'eng',
+          synced: false,
+          line: [{ value: 'Plain lyric' }],
+        },
+        {
+          lang: 'eng',
+          synced: true,
+          line: [{ start: 1000, value: 'Timed lyric' }],
+        },
+      ],
+      'eng',
+    )
+
+    expect(layers.main).toEqual({
+      lang: 'eng',
+      synced: true,
+      line: [{ start: 1000, value: 'Timed lyric' }],
+    })
+  })
+
+  // Layer lines are slightly offset from the main lines but overlap them, so
+  // each main line maps to the layer line at the same position.
+  it('matches layer line by timing for the active main line', () => {
+    const mainLines = [
+      { index: 0, start: 1000, end: 1800, value: 'Line A', tokens: [] },
+      { index: 1, start: 2000, end: 2800, value: 'Line B', tokens: [] },
+    ]
+    const layerLines = [
+      { index: 0, start: 900, end: 1750, value: 'A2', tokens: [] },
+      { index: 1, start: 2050, end: 2900, value: 'B2', tokens: [] },
+    ]
+
+    expect(findLayerLineIndexForMain(mainLines, layerLines, 1)).toBe(1)
+    expect(resolveLayerLineForMain(mainLines, layerLines, 0).line.value).toBe(
+      'A2',
+    )
+  })
+
+  // The layer array is in a different order than the main lines; matching is
+  // driven by time overlap, so main line B resolves to array position 2 (B2)
+  // and main line C resolves to array position 0 (C2).
+  it('matches metadata layers by nearest timing even when indexes differ', () => {
+    const mainLines = [
+      { index: 0, start: 1000, end: 1800, value: 'Line A', tokens: [] },
+      { index: 1, start: 2000, end: 2800, value: 'Line B', tokens: [] },
+      { index: 2, start: 3000, end: 3800, value: 'Line C', tokens: [] },
+    ]
+    const layerLines = [
+      { index: 2, start: 3020, end: 3820, value: 'C2', tokens: [] },
+      { index: 0, start: 980, end: 1760, value: 'A2', tokens: [] },
+      { index: 1, start: 2010, end: 2810, value: 'B2', tokens: [] },
+    ]
+
+    expect(findLayerLineIndexForMain(mainLines, layerLines, 1)).toBe(2)
+    expect(resolveLayerLineForMain(mainLines, layerLines, 2).line.value).toBe(
+      'C2',
+    )
+  })
+
+  // The main line has per-token timing but the translation does not; the
+  // translation must come back as the same object with no synthetic tokens.
+  // NOTE(review): a third argument (2600) is passed here; confirm the helper
+  // is expected to ignore it.
+  it('keeps translation lines line-level when they do not have real cue timing', () => {
+    const mainLine = {
+      index: 0,
+      start: 1000,
+      end: 2200,
+      value: '불을 질러라',
+      tokens: [
+        { start: 1000, end: 1300, value: '불을 ' },
+        { start: 1300, end: 1650, value: '질' },
+        { start: 1650, end: 2200, value: '러라' },
+      ],
+    }
+    const translationLine = {
+      index: 0,
+      start: 1000,
+      end: 2200,
+      value: 'Set it on fire',
+      tokens: [],
+    }
+
+    const highlighted = buildHighlightedAuxLine(mainLine, translationLine, 2600)
+
+    expect(highlighted).toBe(translationLine)
+    expect(highlighted.tokens).toEqual([])
+  })
+
+  // Neither line has tokens; the pronunciation line is returned untouched.
+  it('keeps pronunciation lines line-level when they do not have real cue timing', () => {
+    const mainLine = {
+      index: 0,
+      start: 1000,
+      end: 2200,
+      value: 'You もっと強く 素早く 吹き飛ばせ',
+      tokens: [],
+    }
+    const pronunciationLine = {
+      index: 0,
+      start: 1000,
+      end: 2200,
+      value: 'You motto tsuyoku subayaku fukitobase',
+      tokens: [],
+    }
+
+    const highlighted = buildHighlightedAuxLine(
+      mainLine,
+      pronunciationLine,
+      2600,
+    )
+
+    expect(highlighted).toBe(pronunciationLine)
+    expect(highlighted.tokens).toEqual([])
+  })
+
+  // A main line without tokens is returned as the same object.
+  it('keeps main lines line-level when they do not have real cue timing', () => {
+    const line = {
+      index: 0,
+      start: 1000,
+      end: 2200,
+      value: 'Youもっと強く 素早く 吹き飛ばせ',
+      tokens: [],
+    }
+
+    const highlighted = buildHighlightedMainLine(line, 2600)
+
+    expect(highlighted).toBe(line)
+    expect(highlighted.tokens).toEqual([])
+  })
+
+  // Same expectation as above when the lines have no end time (end: null).
+  it('keeps auxiliary lines line-level when end time is missing and they lack cues', () => {
+    const mainLine = {
+      index: 0,
+      start: 1000,
+      end: null,
+      value: 'Hello there',
+      tokens: [],
+    }
+    const translationLine = {
+      index: 0,
+      start: 1000,
+      end: null,
+      value: 'Bonjour toi',
+      tokens: [],
+    }
+
+    const highlighted = buildHighlightedAuxLine(mainLine, translationLine, 2400)
+
+    expect(highlighted).toBe(translationLine)
+    expect(highlighted.tokens).toEqual([])
+  })
+
+  // A main line with end: null and no tokens is returned as the same object.
+  it('keeps main lines line-level when end time is missing and they lack cues', () => {
+    const line = {
+      index: 0,
+      start: 1000,
+      end: null,
+      value: 'One more time',
+      tokens: [],
+    }
+
+    const highlighted = buildHighlightedMainLine(line, 2400)
+
+    expect(highlighted).toBe(line)
+    expect(highlighted.tokens).toEqual([])
+  })
+
+  // The only layer line starts about 58 seconds after the main line and does
+  // not overlap it, so no match is reported.
+  it('returns no layer match when the nearest line is too far in time', () => {
+    const mainLines = [
+      { index: 0, start: 1000, end: 1800, value: 'Line A', tokens: [] },
+      { index: 1, start: 2000, end: 2800, value: 'Line B', tokens: [] },
+    ]
+    const layerLines = [
+      { index: 0, start: 60000, end: 60800, value: 'Far line', tokens: [] },
+    ]
+
+    expect(findLayerLineIndexForMain(mainLines, layerLines, 1)).toBe(-1)
+    expect(resolveLayerLineForMain(mainLines, layerLines, 1).line).toBeNull()
+  })
+
+  // Millisecond starts become [mm:ss.xx] stamps; 22801 ms is truncated to
+  // hundredths, giving .80 rather than .801.
+  it('converts a structured lyric track to LRC', () => {
+    const lrc = structuredLyricToLrc({
+      lang: 'eng',
+      synced: true,
+      line: [
+        { start: 18800, value: "We're no strangers to love" },
+        { start: 22801, value: 'You know the rules and so do I' },
+      ],
+    })
+
+    expect(lrc).toBe(
+      "[00:18.80] We're no strangers to love\n[00:22.80] You know the rules and so do I\n",
+    )
+  })
+
+  // The only line has no start time, so nothing can be written as LRC.
+  it('returns empty text when no synced lyrics are available', () => {
+    const lrc = structuredLyricsToLrc(
+      [{ lang: 'eng', synced: false, line: [{ value: 'Unsynced line' }] }],
+      'eng',
+    )
+
+    expect(lrc).toBe('')
+  })
+
+  // A 'locale' value stored in localStorage takes precedence.
+  it('reads preferred language from localStorage first', () => {
+    localStorage.setItem('locale', 'pt-BR')
+    expect(getPreferredLyricLanguage()).toBe('pt-BR')
+  })
+
+  // Two cue lines share index 0 (lead and backing agent). They merge into one
+  // karaoke line whose line-level agent fields come from the first cue line,
+  // while each token keeps the agent of the cue line it came from. The 'main'
+  // role maps to an empty UI role; 'bg' is kept as is.
+  it('builds karaoke lines from agent-based cueLine payload', () => {
+    const lines = buildKaraokeLines({
+      lang: 'eng',
+      synced: true,
+      line: [{ start: 1000, end: 3000, value: 'Hello world' }],
+      agents: [
+        { id: 'lead', role: 'main', name: 'Lead Vocal' },
+        { id: 'backing', role: 'bg' },
+      ],
+      cueLine: [
+        {
+          index: 0,
+          start: 1000,
+          end: 3000,
+          value: 'Hello world',
+          agentId: 'lead',
+          cue: [{ start: 1000, end: 1500, value: 'Hello' }],
+        },
+        {
+          index: 0,
+          start: 1000,
+          end: 3000,
+          value: 'Hello world',
+          agentId: 'backing',
+          cue: [{ start: 2000, end: 2500, value: 'world' }],
+        },
+      ],
+    })
+
+    expect(lines).toEqual([
+      {
+        agentId: 'lead',
+        agentName: 'Lead Vocal',
+        agentRole: 'main',
+        index: 0,
+        start: 1000,
+        end: 3000,
+        value: 'Hello world',
+        tokens: [
+          {
+            start: 1000,
+            end: 1500,
+            value: 'Hello',
+            role: '',
+            agentId: 'lead',
+            agentName: 'Lead Vocal',
+            agentRole: 'main',
+          },
+          {
+            start: 2000,
+            end: 2500,
+            value: 'world',
+            role: 'bg',
+            agentId: 'backing',
+            agentName: '',
+            agentRole: 'bg',
+          },
+        ],
+      },
+    ])
+  })
+
+  // Same scenario as the previous test, but calling the grouping helper
+  // directly with a prebuilt agent lookup Map.
+  it('builds grouped karaoke lines directly from cue lines', () => {
+    const agentLookup = new Map([
+      ['lead', { id: 'lead', role: 'main', name: 'Lead Vocal' }],
+      ['backing', { id: 'backing', role: 'bg', name: '' }],
+    ])
+
+    const lines = buildKaraokeLinesFromCueLines(
+      [
+        {
+          index: 0,
+          start: 1000,
+          end: 3000,
+          value: 'Hello world',
+          agentId: 'lead',
+          cue: [{ start: 1000, end: 1500, value: 'Hello' }],
+        },
+        {
+          index: 0,
+          start: 1000,
+          end: 3000,
+          value: 'Hello world',
+          agentId: 'backing',
+          cue: [{ start: 2000, end: 2500, value: 'world' }],
+        },
+      ],
+      [{ start: 1000, end: 3000, value: 'Hello world' }],
+      agentLookup,
+    )
+
+    expect(lines).toEqual([
+      {
+        agentId: 'lead',
+        agentName: 'Lead Vocal',
+        agentRole: 'main',
+        index: 0,
+        start: 1000,
+        end: 3000,
+        value: 'Hello world',
+        tokens: [
+          {
+            start: 1000,
+            end: 1500,
+            value: 'Hello',
+            role: '',
+            agentId: 'lead',
+            agentName: 'Lead Vocal',
+            agentRole: 'main',
+          },
+          {
+            start: 2000,
+            end: 2500,
+            value: 'world',
+            role: 'bg',
+            agentId: 'backing',
+            agentName: '',
+            agentRole: 'bg',
+          },
+        ],
+      },
+    ])
+  })
+
+  // Tokens must expose the same byteStart/byteEnd values as the input cues.
+  // The cues here skip the first 'love' in the line, so offsets are not
+  // contiguous.
+  it('preserves cue byte offsets on karaoke tokens', () => {
+    const lines = buildKaraokeLines({
+      lang: 'eng',
+      synced: true,
+      line: [{ start: 0, end: 2400, value: 'Oh love love me tonight' }],
+      cueLine: [
+        {
+          index: 0,
+          start: 0,
+          end: 2400,
+          value: 'Oh love love me tonight',
+          cue: [
+            { start: 0, end: 300, value: 'Oh', byteStart: 0, byteEnd: 1 },
+            { start: 900, end: 1300, value: 'love', byteStart: 8, byteEnd: 11 },
+            { start: 1300, end: 1600, value: 'me', byteStart: 13, byteEnd: 14 },
+            {
+              start: 1600,
+              end: 2400,
+              value: 'tonight',
+              byteStart: 16,
+              byteEnd: 22,
+            },
+          ],
+        },
+      ],
+    })
+
+    expect(
+      lines[0].tokens.map((token) => [
+        token.value,
+        token.byteStart,
+        token.byteEnd,
+      ]),
+    ).toEqual([
+      ['Oh', 0, 1],
+      ['love', 8, 11],
+      ['me', 13, 14],
+      ['tonight', 16, 22],
+    ])
+  })
+
+  // Cues whose value is a single space must survive normalization so that the
+  // byte ranges of the tokens still cover the whole line.
+  it('preserves whitespace-only cues for exact byte-range rendering', () => {
+    const lines = buildKaraokeLines({
+      lang: 'kor',
+      synced: true,
+      line: [{ start: 0, end: 900, value: '눈을 뜬 순간' }],
+      cueLine: [
+        {
+          index: 0,
+          start: 0,
+          end: 900,
+          value: '눈을 뜬 순간',
+          cue: [
+            { start: 0, end: 150, value: '눈을', byteStart: 0, byteEnd: 5 },
+            { start: 150, end: 250, value: ' ', byteStart: 6, byteEnd: 6 },
+            { start: 250, end: 450, value: '뜬', byteStart: 7, byteEnd: 9 },
+            { start: 450, end: 550, value: ' ', byteStart: 10, byteEnd: 10 },
+            { start: 550, end: 900, value: '순간', byteStart: 11, byteEnd: 16 },
+          ],
+        },
+      ],
+    })
+
+    expect(
+      lines[0].tokens.map((token) => [
+        token.value,
+        token.byteStart,
+        token.byteEnd,
+      ]),
+    ).toEqual([
+      ['눈을', 0, 5],
+      [' ', 6, 6],
+      ['뜬', 7, 9],
+      [' ', 10, 10],
+      ['순간', 11, 16],
+    ])
+  })
+
+  // Each Hangul syllable in this text takes 3 bytes in UTF-8 but a single
+  // UTF-16 code unit, so byte offset 3 maps to string index 1 and the byte
+  // range 11..16 maps to the last two characters.
+  it('maps UTF-8 byte offsets to string ranges for multibyte lyrics', () => {
+    const text = '눈을 뜬 순간'
+
+    expect(utf8ByteOffsetToCodeUnitIndex(text, 0)).toBe(0)
+    expect(utf8ByteOffsetToCodeUnitIndex(text, 3)).toBe(1)
+    expect(utf8ByteOffsetToCodeUnitIndex(text, 7)).toBe(3)
+    expect(utf8ByteRangeToCodeUnitRange(text, 11, 16)).toEqual({
+      start: 5,
+      end: 7,
+      text: '순간',
+    })
+  })
+
+  // Without an `agents` list or `agentId`, the cue line's own `role` string is
+  // used for its tokens and the agent fields stay empty.
+  it('falls back to legacy cueLine role values when agents are absent', () => {
+    const lines = buildKaraokeLines({
+      lang: 'eng',
+      synced: true,
+      line: [{ start: 1000, end: 3000, value: 'Hello world' }],
+      cueLine: [
+        {
+          index: 0,
+          start: 1000,
+          end: 3000,
+          value: 'Hello world',
+          role: 'bg',
+          cue: [{ start: 1000, end: 1500, value: 'Hello' }],
+        },
+      ],
+    })
+
+    expect(lines[0].tokens[0].role).toBe('bg')
+    expect(lines[0].tokens[0].agentId).toBe('')
+    expect(lines[0].tokens[0].agentName).toBe('')
+  })
+
+  // Cues arrive out of order ('world' before 'Hello'); the built tokens must
+  // be ordered by start time.
+  it('sorts token timing by start to keep playback stable', () => {
+    const lines = buildKaraokeLines({
+      lang: 'eng',
+      synced: true,
+      line: [{ start: 1000, end: 3000, value: 'Hello world' }],
+      cueLine: [
+        {
+          index: 0,
+          start: 1000,
+          end: 3000,
+          value: 'Hello world',
+          role: '',
+          cue: [
+            { start: 2000, end: 2500, value: 'world' },
+            { start: 1000, end: 1500, value: 'Hello' },
+          ],
+        },
+      ],
+    })
+
+    expect(lines[0].tokens.map((token) => token.value)).toEqual([
+      'Hello',
+      'world',
+    ])
+  })
+
+  // A cue line with one cue covering the whole line stays a single token, and
+  // its resolved window equals the line window (1000..2000).
+  it('keeps a single full-line token unchanged instead of expanding it synthetically', () => {
+    const lines = buildKaraokeLines({
+      lang: 'ko-Latn',
+      synced: true,
+      line: [{ start: 1000, end: 2000, value: 'Da-la-lun, dun' }],
+      cueLine: [
+        {
+          index: 0,
+          start: 1000,
+          end: 2000,
+          value: 'Da-la-lun, dun',
+          role: '',
+          cue: [{ start: 1000, end: 2000, value: 'Da-la-lun, dun' }],
+        },
+      ],
+    })
+
+    expect(lines).toHaveLength(1)
+    expect(lines[0].tokens).toHaveLength(1)
+    expect(lines[0].tokens[0].value).toBe('Da-la-lun, dun')
+
+    const firstWindow = resolveKaraokeTokenWindow(lines[0], 0)
+
+    expect(firstWindow.start).toBeCloseTo(1000)
+    expect(firstWindow.end).toBeCloseTo(2000)
+  })
+
+  // At 2200 ms the first line (1000..3000) is active and the playback position
+  // falls inside its second token (2000..2500).
+  it('detects active line and token for karaoke timing', () => {
+    const state = getActiveKaraokeState(
+      [
+        {
+          index: 0,
+          start: 1000,
+          end: 3000,
+          value: 'Hello world',
+          tokens: [
+            { start: 1000, end: 1500, value: 'Hello', role: '' },
+            { start: 2000, end: 2500, value: 'world', role: '' },
+          ],
+        },
+        {
+          index: 1,
+          start: 3500,
+          end: 5000,
+          value: 'Second line',
+          tokens: [],
+        },
+      ],
+      2200,
+    )
+
+    expect(state).toEqual({ lineIndex: 0, tokenIndex: 1 })
+  })
+
+  // Tokens have starts but no ends: the first token ends where the second
+  // starts, and the last token ends at the line end.
+  it('resolves token window fallback boundaries from neighboring tokens', () => {
+    const line = {
+      start: 1000,
+      end: 3000,
+      value: 'Hello world',
+      tokens: [
+        { start: 1200, value: 'Hello', role: '' },
+        { start: 1800, value: 'world', role: '' },
+      ],
+    }
+
+    expect(resolveKaraokeTokenWindow(line, 0)).toEqual({
+      start: 1200,
+      end: 1800,
+    })
+    expect(resolveKaraokeTokenWindow(line, 1)).toEqual({
+      start: 1800,
+      end: 3000,
+    })
+  })
+
+  // No token has any timing, so the 1000 ms line window is split into three
+  // equal consecutive slices.
+  it('infers sequential token windows when token timings are missing', () => {
+    const line = {
+      start: 1000,
+      end: 2000,
+      value: 'A B C',
+      tokens: [
+        { value: 'A', role: '' },
+        { value: 'B', role: '' },
+        { value: 'C', role: '' },
+      ],
+    }
+
+    const first = resolveKaraokeTokenWindow(line, 0)
+    const second = resolveKaraokeTokenWindow(line, 1)
+    const third = resolveKaraokeTokenWindow(line, 2)
+
+    expect(first.start).toBeCloseTo(1000)
+    expect(first.end).toBeCloseTo(1333.3333333333333)
+
+    expect(second.start).toBeCloseTo(1333.3333333333333)
+    expect(second.end).toBeCloseTo(1666.6666666666667)
+
+    expect(third.start).toBeCloseTo(1666.6666666666667)
+    expect(third.end).toBeCloseTo(2000)
+  })
+
+  // Every token carries the same start and end as the line ("collapsed"
+  // timing); the explicit values are ignored and the even split is used.
+  it('falls back to sequential windows when token timings are collapsed', () => {
+    const line = {
+      start: 1000,
+      end: 2000,
+      value: 'A B C',
+      tokens: [
+        { start: 1000, end: 2000, value: 'A', role: '' },
+        { start: 1000, end: 2000, value: 'B', role: '' },
+        { start: 1000, end: 2000, value: 'C', role: '' },
+      ],
+    }
+
+    const first = resolveKaraokeTokenWindow(line, 0)
+    const second = resolveKaraokeTokenWindow(line, 1)
+    const third = resolveKaraokeTokenWindow(line, 2)
+
+    expect(first.start).toBeCloseTo(1000)
+    expect(first.end).toBeCloseTo(1333.3333333333333)
+    expect(second.start).toBeCloseTo(1333.3333333333333)
+    expect(second.end).toBeCloseTo(1666.6666666666667)
+    expect(third.start).toBeCloseTo(1666.6666666666667)
+    expect(third.end).toBeCloseTo(2000)
+  })
+
+  // 1108 ms lies in the 10 ms gap between token A (ends 1100) and token B
+  // (starts 1110); the expected result keeps token A selected.
+  it('keeps token selection stable near tight token boundaries', () => {
+    const state = getActiveKaraokeState(
+      [
+        {
+          index: 0,
+          start: 1000,
+          end: 2000,
+          value: 'A B',
+          tokens: [
+            { start: 1000, end: 1100, value: 'A', role: '' },
+            { start: 1110, end: 1300, value: 'B', role: '' },
+          ],
+        },
+      ],
+      1108,
+    )
+
+    expect(state).toEqual({ lineIndex: 0, tokenIndex: 0 })
+  })
+
+  // A track with no `line` array still counts as having content when one of
+  // its cues has a start time.
+  it('reports structured lyric content when token timing exists', () => {
+    expect(
+      hasStructuredLyricContent({
+        cueLine: [{ cue: [{ start: 100, value: 'a' }] }],
+      }),
+    ).toBe(true)
+  })
+
+  // Lines without any start time and without tokens have no usable timing; a
+  // single line with a start time is enough to report usable timing.
+  it('detects when built karaoke lines have no usable timing', () => {
+    expect(
+      hasUsableKaraokeTiming([
+        { index: 0, value: 'First line', tokens: [] },
+        { index: 1, value: 'Second line', tokens: [] },
+      ]),
+    ).toBe(false)
+
+    expect(
+      hasUsableKaraokeTiming([
+        { index: 0, start: 1000, value: 'Timed line', tokens: [] },
+      ]),
+    ).toBe(true)
+  })
+})
--- a/ui/src/audioplayer/lyricsOverlayState.js
+++ b/ui/src/audioplayer/lyricsOverlayState.js
@ -0,0 +1,27 @@
+// Combines the stored overlay preferences with what the current lyric offers.
+// Each flag is `preference && availability`, so a feature is only reported as
+// shown when the loaded lyric actually provides it.
+export const resolveLyricsOverlayState = ({
+  karaokeVisiblePreference,
+  translationPreference,
+  pronunciationPreference,
+  hasKaraokeLyric,
+  hasTranslationLyric,
+  hasPronunciationLyric,
+}) => {
+  // A null/undefined pronunciation preference falls back to availability.
+  const wantsPronunciation =
+    pronunciationPreference == null
+      ? hasPronunciationLyric
+      : pronunciationPreference
+
+  return {
+    karaokeVisible: karaokeVisiblePreference && hasKaraokeLyric,
+    showTranslation: translationPreference && hasTranslationLyric,
+    showPronunciation: wantsPronunciation && hasPronunciationLyric,
+  }
+}
+
+// Returns the next pronunciation preference after a toggle. Without a
+// pronunciation lyric the result is always false; otherwise the effective
+// current value (the stored preference, or availability when none is stored)
+// is inverted.
+export const togglePronunciationPreference = (
+  previousPreference,
+  hasPronunciationLyric,
+) => {
+  if (hasPronunciationLyric) {
+    const effectivePreference =
+      previousPreference == null ? hasPronunciationLyric : previousPreference
+    return !effectivePreference
+  }
+  return false
+}
--- a/ui/src/audioplayer/styles.js
+++ b/ui/src/audioplayer/styles.js
@ -62,12 +62,30 @@ const useStyle = makeStyles(
          // Fix cover display when image is not square
          aspectRatio: '1/1',
          display: 'flex',
+          position: 'relative',
+        },
+      '& .react-jinke-music-player-mobile .react-jinke-music-player-mobile-cover.nd-mobile-lyrics-active':
+        {
+          width: '100%',
+          maxWidth: 'none',
+          height: 'clamp(280px, 42vh, 460px)',
+          aspectRatio: 'auto',
+          borderRadius: 12,
+          border: 'none',
+          boxShadow: 'none',
+          background: 'transparent',
+          cursor: 'default',
        },
      '& .react-jinke-music-player-mobile .react-jinke-music-player-mobile-cover img.cover':
        {
          animationDuration: (props) => !props.enableCoverAnimation && '0s',
          objectFit: 'contain', // Fix cover display when image is not square
        },
+      '& .react-jinke-music-player-mobile .react-jinke-music-player-mobile-cover.nd-mobile-lyrics-active img.cover':
+        {
+          opacity: 0,
+          pointerEvents: 'none',
+        },
      // Hide old singer display
      '& .react-jinke-music-player-mobile .react-jinke-music-player-mobile-singer':
        {
--- a/ui/src/reducers/playerReducer.js
+++ b/ui/src/reducers/playerReducer.js
@ -7,6 +7,7 @@ import {
  PLAYER_CURRENT,
  PLAYER_PLAY_NEXT,
  PLAYER_PLAY_TRACKS,
+  PLAYER_UPDATE_LYRIC,
  PLAYER_SET_TRACK,
  PLAYER_SET_VOLUME,
  PLAYER_SYNC_QUEUE,
@ -60,21 +61,25 @@ const mapToAudioLists = (item) => {
  let lyricText = ''

  if (lyrics) {
-    const structured = JSON.parse(lyrics)
-    for (const structuredLyric of structured) {
-      if (structuredLyric.synced) {
-        for (const line of structuredLyric.line) {
-          let time = Math.floor(line.start / 10)
-          const ms = time % 100
-          time = Math.floor(time / 100)
-          const sec = time % 60
-          time = Math.floor(time / 60)
-          const min = time % 60
+    try {
+      const structured = JSON.parse(lyrics)
+      for (const structuredLyric of structured) {
+        if (structuredLyric.synced) {
+          for (const line of structuredLyric.line) {
+            let time = Math.floor(line.start / 10)
+            const ms = time % 100
+            time = Math.floor(time / 100)
+            const sec = time % 60
+            time = Math.floor(time / 60)
+            const min = time % 60

-          ms.toString()
-          lyricText += `[${pad(min)}:${pad(sec)}.${pad(ms)}] ${line.value}\n`
+            ms.toString()
+            lyricText += `[${pad(min)}:${pad(sec)}.${pad(ms)}] ${line.value}\n`
+          }
        }
      }
+    } catch {
+      lyricText = ''
    }
  }

@ -208,6 +213,45 @@ const reduceMode = (state, { data: { mode } }) => {
  }
 }

+// Sets `lyric` on every queue entry whose trackId matches and, when
+// `state.current` refers to the same track, on it as well. The incoming state
+// object is returned as-is when trackId is falsy or no queue entry changes.
+const reduceUpdateLyric = (state, { data: { trackId, lyric } }) => {
+  if (!trackId) {
+    return state
+  }
+
+  const isStale = (item) => item.trackId === trackId && item.lyric !== lyric
+
+  // Keep the same reference when nothing in the queue would change.
+  if (!state.queue.some(isStale)) {
+    return state
+  }
+
+  const queue = state.queue.map((item) =>
+    isStale(item) ? { ...item, lyric } : item,
+  )
+
+  let current = state.current
+  if (current?.trackId === trackId) {
+    current = { ...current, lyric }
+  }
+
+  return { ...state, queue, current }
+}
+
 export const playerReducer = (previousState = initialState, payload) => {
  const { type } = payload
  switch (type) {
@ -245,6 +289,8 @@ export const playerReducer = (previousState = initialState, payload) => {
          previousState.savedPlayIndex >= 0 ? previousState.savedPlayIndex : 0,
      }
    }
+    case PLAYER_UPDATE_LYRIC:
+      return reduceUpdateLyric(previousState, payload)
    default:
      return previousState
  }
--- a/ui/src/reducers/playerReducer.test.js
+++ b/ui/src/reducers/playerReducer.test.js
@ -1,11 +1,24 @@
-import { describe, it, expect } from 'vitest'
+import { describe, expect, it, vi } from 'vitest'
 import { playerReducer } from './playerReducer'
 import {
-  PLAYER_SYNC_QUEUE,
  PLAYER_CURRENT,
  PLAYER_REFRESH_QUEUE,
+  PLAYER_SET_TRACK,
+  PLAYER_SYNC_QUEUE,
+  PLAYER_UPDATE_LYRIC,
 } from '../actions'

+vi.mock('uuid', () => ({
+  v4: () => 'test-uuid',
+}))
+
+vi.mock('../subsonic', () => ({
+  default: {
+    streamUrl: vi.fn((id) => `/rest/stream?id=${id}`),
+    getCoverArtUrl: vi.fn(() => '/rest/getCoverArt?id=test'),
+  },
+}))
+
 describe('playerReducer', () => {
  describe('pending track selection survives SYNC_QUEUE and premature CURRENT', () => {
    // Simulates the real sequence when clicking a new song while one is playing:
@ -54,8 +67,6 @@ describe('playerReducer', () => {
    })

    it('CURRENT for old track preserves pending playIndex', () => {
-      // After SYNC_QUEUE, queue has new UUIDs. The old track's UUID (zzz)
-      // is at index 2, but playIndex is 0. This is a premature callback.
      const stateAfterSync = {
        ...stateAfterPlayTracks,
        queue: [
@ -71,7 +82,7 @@ describe('playerReducer', () => {
      const result = playerReducer(stateAfterSync, action)
      expect(result.playIndex).toBe(0)
      expect(result.clear).toBe(true)
-      expect(result.savedPlayIndex).toBe(2) // preserved from before
+      expect(result.savedPlayIndex).toBe(2)
    })

    it('CURRENT for correct track consumes pending playIndex', () => {
@ -83,7 +94,6 @@ describe('playerReducer', () => {
          { trackId: 's3', uuid: 'zzz', name: 'Song 3' },
        ],
      }
-      // Player switched to Song 1 (uuid 'xxx', index 0 == playIndex)
      const action = {
        type: PLAYER_CURRENT,
        data: { uuid: 'xxx', name: 'Song 1', volume: 1 },
@ -224,4 +234,80 @@ describe('playerReducer', () => {
      expect(result.playIndex).toBe(0)
    })
  })
+
+  it('maps embedded synced lyrics to LRC text', () => {
+    const syncedEntry = {
+      lang: 'eng',
+      synced: true,
+      line: [{ start: 1000, value: 'Line one' }],
+    }
+    const unsyncedEntry = {
+      lang: 'eng',
+      synced: false,
+      line: [{ value: 'Unsynced line' }],
+    }
+    const track = {
+      id: 'song-1',
+      title: 'Test Song',
+      artist: 'Test Artist',
+      album: 'Test Album',
+      duration: 60,
+      lyrics: JSON.stringify([syncedEntry, unsyncedEntry]),
+    }
+
+    const { queue } = playerReducer(undefined, {
+      type: PLAYER_SET_TRACK,
+      data: track,
+    })
+
+    // Only the synced entry's line is expected in the resulting LRC text.
+    expect(queue).toHaveLength(1)
+    expect(queue[0].lyric).toBe('[00:01.00] Line one\n')
+  })
+
+  it('updates queue lyric by track id', () => {
+    const newLyric = '[00:01.00] Updated lyric\n'
+    const track = {
+      id: 'song-1',
+      title: 'Test Song',
+      artist: 'Test Artist',
+      album: 'Test Album',
+      duration: 60,
+    }
+
+    // Build the starting state through the reducer, then dispatch the update.
+    const initial = playerReducer(undefined, {
+      type: PLAYER_SET_TRACK,
+      data: track,
+    })
+    const updated = playerReducer(initial, {
+      type: PLAYER_UPDATE_LYRIC,
+      data: { trackId: 'song-1', lyric: newLyric },
+    })
+
+    expect(updated.queue[0].lyric).toBe(newLyric)
+  })
+
+  it('returns same state when lyric update does not match any track', () => {
+    const track = {
+      id: 'song-1',
+      title: 'Test Song',
+      artist: 'Test Artist',
+      album: 'Test Album',
+      duration: 60,
+    }
+    const initial = playerReducer(undefined, {
+      type: PLAYER_SET_TRACK,
+      data: track,
+    })
+
+    const updated = playerReducer(initial, {
+      type: PLAYER_UPDATE_LYRIC,
+      data: { trackId: 'missing-track', lyric: '[00:01.00] Updated lyric\n' },
+    })
+
+    // `toBe` checks identity: the very same state object must come back.
+    expect(updated).toBe(initial)
+  })
 })
--- a/ui/src/subsonic/index.js
+++ b/ui/src/subsonic/index.js
@ -1,5 +1,5 @@
-import { baseUrl } from '../utils'
 import { httpClient } from '../dataProvider'
+import { baseUrl } from '../utils'

 const url = (command, id, options) => {
  const username = localStorage.getItem('username')
@ -120,6 +120,10 @@ const getTopSongs = (artist, count = 50) => {
  return httpClient(url('getTopSongs', null, { artist, count }))
 }

+// Requests the `getLyricsBySongId` command for the given song id, passing
+// the `enhanced: true` option, and returns whatever httpClient returns.
+const getLyricsBySongId = (id) => {
+  const requestUrl = url('getLyricsBySongId', id, { enhanced: true })
+  return httpClient(requestUrl)
+}
+
 const streamUrl = (id, options) => {
  return baseUrl(
    url('stream', id, {
@ -149,4 +153,5 @@ export default {
  getArtistInfo,
  getTopSongs,
  getSimilarSongs2,
+  getLyricsBySongId,
 }
--- a/ui/src/subsonic/index.test.js
+++ b/ui/src/subsonic/index.test.js
@ -1,7 +1,13 @@
 import { vi } from 'vitest'
-import config from '../config'
+import { httpClient } from '../dataProvider'
 import subsonic from './index'

+vi.mock('../dataProvider', () => ({
+  httpClient: vi.fn(() => Promise.resolve({})),
+}))
+
+const COVER_ART_SIZE = 600
+
 describe('getCoverArtUrl', () => {
  beforeEach(() => {
    // Mock window.location
@ -31,11 +37,7 @@ describe('getCoverArtUrl', () => {
      updatedAt: '2023-01-01T00:00:00Z',
    }

-    const url = subsonic.getCoverArtUrl(
-      playlistRecord,
-      config.uiCoverArtSize,
-      true,
-    )
+    const url = subsonic.getCoverArtUrl(playlistRecord, COVER_ART_SIZE, true)

    expect(url).toContain('pl-playlist-123')
    expect(url).toContain('size=600')
@ -49,11 +51,7 @@ describe('getCoverArtUrl', () => {
      sync: true,
    }

-    const url = subsonic.getCoverArtUrl(
-      playlistRecord,
-      config.uiCoverArtSize,
-      true,
-    )
+    const url = subsonic.getCoverArtUrl(playlistRecord, COVER_ART_SIZE, true)

    expect(url).toContain('pl-playlist-123')
    expect(url).toContain('size=600')
@ -68,11 +66,7 @@ describe('getCoverArtUrl', () => {
      updatedAt: '2023-01-01T00:00:00Z',
    }

-    const url = subsonic.getCoverArtUrl(
-      albumRecord,
-      config.uiCoverArtSize,
-      true,
-    )
+    const url = subsonic.getCoverArtUrl(albumRecord, COVER_ART_SIZE, true)

    expect(url).toContain('al-album-123')
    expect(url).toContain('size=600')
@ -86,7 +80,7 @@ describe('getCoverArtUrl', () => {
      updatedAt: '2023-01-01T00:00:00Z',
    }

-    const url = subsonic.getCoverArtUrl(songRecord, config.uiCoverArtSize, true)
+    const url = subsonic.getCoverArtUrl(songRecord, COVER_ART_SIZE, true)

    expect(url).toContain('mf-song-123')
    expect(url).toContain('size=600')
@ -99,11 +93,7 @@ describe('getCoverArtUrl', () => {
      updatedAt: '2023-01-01T00:00:00Z',
    }

-    const url = subsonic.getCoverArtUrl(
-      artistRecord,
-      config.uiCoverArtSize,
-      true,
-    )
+    const url = subsonic.getCoverArtUrl(artistRecord, COVER_ART_SIZE, true)

    expect(url).toContain('ar-artist-123')
    expect(url).toContain('size=600')
@ -194,3 +184,30 @@ describe('getAvatarUrl', () => {
    expect(url).toContain('username=john')
  })
 })
+
+describe('getLyricsBySongId', () => {
+  // Values served by the stubbed localStorage.getItem below.
+  const storedValues = {
+    username: 'testuser',
+    'subsonic-token': 'testtoken',
+    'subsonic-salt': 'testsalt',
+  }
+
+  beforeEach(() => {
+    vi.clearAllMocks()
+    // Swap window.localStorage for a stub; falsy lookups come back as null.
+    Object.defineProperty(window, 'localStorage', {
+      value: { getItem: vi.fn((key) => storedValues[key] || null) },
+    })
+  })
+
+  it('calls the getLyricsBySongId endpoint with enhanced=true', async () => {
+    await subsonic.getLyricsBySongId('song-1')
+
+    expect(httpClient).toHaveBeenCalledTimes(1)
+    // First argument of the first (and only) recorded call.
+    const [[calledUrl]] = httpClient.mock.calls
+    expect(calledUrl).toContain('/rest/getLyricsBySongId?')
+    expect(calledUrl).toContain('id=song-1')
+    expect(calledUrl).toContain('enhanced=true')
+  })
+})