Merge 656a673eed0c6b251fdae0554a223cf0e5f4f221 into 2b9f32699348d520fc96acbd74be24b12702b02a

2026-05-03 06:51:16 +00:00 · 2026-05-01 02:25:15 +03:00 · 2026-05-01 02:25:15 +03:00 · 80bf5d94e9
commit 80bf5d94e9
parent 2b9f326993 656a673eed
41 changed files with 7806 additions and 99 deletions
--- a/README.md
+++ b/README.md
@ -52,6 +52,7 @@ A share of the revenue helps fund the development of Navidrome at no additional
 - **Multi-platform**, runs on macOS, Linux and Windows. **Docker** images are also provided
 - Ready to use binaries for all major platforms, including **Raspberry Pi**
 - Automatically **monitors your library** for changes, importing new files and reloading new metadata 
 - Supports lyrics from sidecar **.ttml**, **.elrc**, **.lrc**, **.srt**, **.txt** files and embedded tags (via `lyricspriority`)
 - **Themeable**, modern and responsive **Web interface** based on [Material UI](https://material-ui.com)
 - **Compatible** with all Subsonic/Madsonic/Airsonic [clients](https://www.navidrome.org/docs/overview/#apps)
 - **Transcoding** on the fly. Can be set per user/player. **Opus encoding is supported**
--- a/conf/configuration.go
+++ b/conf/configuration.go
@ -763,7 +763,7 @@ func setViperDefaults() {
 	viper.SetDefault("artistartpriority", "artist.*, album/artist.*, external")
 	viper.SetDefault("artistimagefolder", "")
 	viper.SetDefault("discartpriority", "disc*.*, cd*.*, cover.*, folder.*, front.*, discsubtitle, embedded")
-	viper.SetDefault("lyricspriority", ".lrc,.txt,embedded")
+	viper.SetDefault("lyricspriority", ".ttml,.elrc,.lrc,.srt,.txt,embedded")
 	viper.SetDefault("enablegravatar", false)
 	viper.SetDefault("enablefavourites", true)
 	viper.SetDefault("enablestarrating", true)
--- a/core/lyrics/lyrics.go
+++ b/core/lyrics/lyrics.go
@ -14,6 +14,12 @@ type Lyrics interface {
 	GetLyrics(ctx context.Context, mf *model.MediaFile) (model.LyricList, error)
 }
 // BatchLyrics can resolve lyrics across multiple candidate media files while
 // still honoring the configured source priority globally.
 //
 // In the lyricsService implementation each configured source is tried against
 // every candidate before the next source is considered, so a higher-priority
 // source found on any candidate wins over a lower-priority one on another.
 type BatchLyrics interface {
 	// GetLyricsForMediaFiles returns the first non-empty lyrics found for any
 	// of mediaFiles. The lyricsService implementation returns a nil list and
 	// a nil error when no source yields lyrics.
 	GetLyricsForMediaFiles(ctx context.Context, mediaFiles []model.MediaFile) (model.LyricList, error)
 }
 // PluginLoader discovers and loads lyrics provider plugins.
 type PluginLoader interface {
 	LoadLyricsProvider(name string) (Lyrics, bool)
@ -32,28 +38,53 @@ func NewLyrics(pluginLoader PluginLoader) Lyrics {
 // GetLyrics returns lyrics for the given media file, trying sources in the
 // order specified by conf.Server.LyricsPriority.
 func (l *lyricsService) GetLyrics(ctx context.Context, mf *model.MediaFile) (model.LyricList, error) {
-	var lyricsList model.LyricList
+	return l.getLyricsForCandidates(ctx, []*model.MediaFile{mf})
-	var err error
+}
 // GetLyricsForMediaFiles looks up lyrics for a set of media files that are
 // treated as interchangeable candidates (e.g. duplicates). The configured
 // source priority is applied across the whole set rather than per file.
 func (l *lyricsService) GetLyricsForMediaFiles(ctx context.Context, mediaFiles []model.MediaFile) (model.LyricList, error) {
 	// Point at the caller's slice elements instead of copying each MediaFile.
 	refs := make([]*model.MediaFile, len(mediaFiles))
 	for idx := range mediaFiles {
 		refs[idx] = &mediaFiles[idx]
 	}
 	return l.getLyricsForCandidates(ctx, refs)
 }
 func (l *lyricsService) getLyricsForCandidates(ctx context.Context, mediaFiles []*model.MediaFile) (model.LyricList, error) {
 	for pattern := range strings.SplitSeq(conf.Server.LyricsPriority, ",") {
 		pattern = strings.TrimSpace(pattern)
-		switch {
+		if pattern == "" {
-		case strings.EqualFold(pattern, "embedded"):
+			continue
 			lyricsList, err = fromEmbedded(ctx, mf)
 		case strings.HasPrefix(pattern, "."):
 			lyricsList, err = fromExternalFile(ctx, mf, strings.ToLower(pattern))
 		default:
 			lyricsList, err = l.fromPlugin(ctx, mf, pattern)
 		}
-		if err != nil {
+		for _, mf := range mediaFiles {
-			log.Error(ctx, "error getting lyrics", "source", pattern, err)
+			if mf == nil {
-		}
+				continue
 			}
-		if len(lyricsList) > 0 {
+			lyricsList, err := l.getLyricsFromSource(ctx, mf, pattern)
-			return lyricsList, nil
+			if err != nil {
 				log.Error(ctx, "error getting lyrics", "source", pattern, err)
 				continue
 			}
 			if len(lyricsList) > 0 {
 				return lyricsList, nil
 			}
 		}
 	}
 	return nil, nil
 }
 // getLyricsFromSource dispatches a single lyricspriority entry to its loader:
 // "embedded" (case-insensitive) reads the media file's embedded lyrics, an
 // entry starting with "." is treated as a sidecar file suffix (lower-cased),
 // and anything else is taken as a plugin name.
 func (l *lyricsService) getLyricsFromSource(ctx context.Context, mf *model.MediaFile, pattern string) (model.LyricList, error) {
 	if strings.EqualFold(pattern, "embedded") {
 		return fromEmbedded(ctx, mf)
 	}
 	if strings.HasPrefix(pattern, ".") {
 		suffix := strings.ToLower(pattern)
 		return fromExternalFile(ctx, mf, suffix)
 	}
 	return l.fromPlugin(ctx, mf, pattern)
 }
--- a/core/lyrics/lyrics_test.go
+++ b/core/lyrics/lyrics_test.go
@ -45,6 +45,71 @@ var _ = Describe("sources", func() {
 		},
 	}
 	elrcLyrics := model.LyricList{
 		model.Lyrics{
 			DisplayArtist: "ELRC Artist",
 			DisplayTitle:  "ELRC Song",
 			Lang:          "eng",
 			Line: []model.Line{
 				{
 					Start: gg.P(int64(1000)),
 					End:   gg.P(int64(3000)),
 					Value: "Lead words",
 					Cue: []model.Cue{
 						{
 							Start:     gg.P(int64(1000)),
 							End:       gg.P(int64(1500)),
 							Value:     "Lead ",
 							ByteStart: 0,
 							ByteEnd:   4,
 						},
 						{
 							Start:     gg.P(int64(1500)),
 							End:       gg.P(int64(3000)),
 							Value:     "words",
 							ByteStart: 5,
 							ByteEnd:   9,
 						},
 					},
 				},
 				{
 					Start: gg.P(int64(3000)),
 					Value: "Fallback line",
 				},
 			},
 			Synced: true,
 		},
 	}
 	ttmlLyrics := model.LyricList{
 		model.Lyrics{
 			Kind: "main",
 			Lang: "eng",
 			Line: []model.Line{
 				{
 					Start: gg.P(int64(18800)),
 					Value: "We're no strangers to love",
 				},
 				{
 					Start: gg.P(int64(22800)),
 					Value: "You know the rules and so do I",
 				},
 			},
 			Synced: true,
 		},
 		model.Lyrics{
 			Kind: "main",
 			Lang: "por",
 			Line: []model.Line{
 				{
 					Start: gg.P(int64(18800)),
 					Value: "Nao somos estranhos ao amor",
 				},
 			},
 			Synced: true,
 		},
 	}
 	unsyncedLyrics := model.LyricList{
 		model.Lyrics{
 			Lang: "xxx",
@ -60,6 +125,25 @@ var _ = Describe("sources", func() {
 		},
 	}
 	srtLyrics := model.LyricList{
 		model.Lyrics{
 			Lang: "xxx",
 			Line: []model.Line{
 				{
 					Start: gg.P(int64(18800)),
 					End:   gg.P(int64(22800)),
 					Value: "We're from subtitles",
 				},
 				{
 					Start: gg.P(int64(22801)),
 					End:   gg.P(int64(26000)),
 					Value: "Another subtitle line",
 				},
 			},
 			Synced: true,
 		},
 	}
 	BeforeEach(func() {
 		DeferCleanup(configtest.SetupConfig())
@ -81,7 +165,33 @@ var _ = Describe("sources", func() {
 	},
 		Entry("embedded > lrc > txt", "embedded,.lrc,.txt", embeddedLyrics),
 		Entry("lrc > embedded > txt", ".lrc,embedded,.txt", syncedLyrics),
-		Entry("txt > lrc > embedded", ".txt,.lrc,embedded", unsyncedLyrics))
+		Entry("elrc > lrc > embedded", ".elrc,.lrc,embedded", elrcLyrics),
 		Entry("srt > txt > embedded", ".srt,.txt,embedded", srtLyrics),
 		Entry("txt > lrc > embedded", ".txt,.lrc,embedded", unsyncedLyrics),
 		Entry("ttml > elrc > lrc > srt > embedded", ".ttml,.elrc,.lrc,.srt,embedded", ttmlLyrics))
 	It("resolves source priority across duplicate media files", func() {
 		conf.Server.LyricsPriority = ".ttml,embedded"
 		embeddedJSON, err := json.Marshal(embeddedLyrics)
 		Expect(err).To(BeNil())
 		svc := lyrics.NewLyrics(nil)
 		batchSvc, ok := svc.(lyrics.BatchLyrics)
 		Expect(ok).To(BeTrue())
 		list, err := batchSvc.GetLyricsForMediaFiles(ctx, []model.MediaFile{
 			{
 				Lyrics: string(embeddedJSON),
 				Path:   "tests/fixtures/01 Invisible (RED) Edit Version.mp3",
 			},
 			{
 				Lyrics: "[]",
 				Path:   "tests/fixtures/test.mp3",
 			},
 		})
 		Expect(err).To(BeNil())
 		Expect(list).To(Equal(ttmlLyrics))
 	})
 	Context("Errors", func() {
 		var RegularUserContext = XContext
--- a/core/lyrics/sources.go
+++ b/core/lyrics/sources.go
@ -5,6 +5,7 @@ import (
 	"errors"
 	"os"
 	"path"
 	"strings"
 	"github.com/navidrome/navidrome/log"
 	"github.com/navidrome/navidrome/model"
@ -36,18 +37,38 @@ func fromExternalFile(ctx context.Context, mf *model.MediaFile, suffix string) (
 		return nil, err
 	}
-	lyrics, err := model.ToLyrics("xxx", string(contents))
+	var list model.LyricList
-	if err != nil {
+	switch {
-		log.Error(ctx, "error parsing lyric external file", "path", externalLyric, err)
+	case strings.EqualFold(suffix, ".ttml"):
-		return nil, err
+		list, err = parseTTML(contents)
-	} else if lyrics == nil {
+		if err != nil {
 			log.Error(ctx, "error parsing ttml external file", "path", externalLyric, err)
 			return nil, err
 		}
 	case strings.EqualFold(suffix, ".srt"):
 		list, err = parseSRT(contents)
 		if err != nil {
 			log.Error(ctx, "error parsing srt external file", "path", externalLyric, err)
 			return nil, err
 		}
 	default:
 		lyrics, err := model.ToLyrics("xxx", string(contents))
 		if err != nil {
 			log.Error(ctx, "error parsing lyric external file", "path", externalLyric, err)
 			return nil, err
 		}
 		if lyrics != nil {
 			list = model.LyricList{*lyrics}
 		}
 	}
 	if len(list) == 0 {
 		log.Trace(ctx, "empty lyrics from external file", "path", externalLyric)
 		return nil, nil
 	}
 	log.Trace(ctx, "retrieved lyrics from external file", "path", externalLyric)
-
+	return list, nil
 	return model.LyricList{*lyrics}, nil
 }
 // fromPlugin attempts to load lyrics from a plugin with the given name.
--- a/core/lyrics/sources_test.go
+++ b/core/lyrics/sources_test.go
@ -88,6 +88,89 @@ var _ = Describe("sources", func() {
 			}))
 		})
 		It("should return Enhanced LRC lyrics with word-level cues from a file", func() {
 			mf := model.MediaFile{Path: "tests/fixtures/test-enhanced.mp3"}
 			lyrics, err := fromExternalFile(ctx, &mf, ".lrc")
 			Expect(err).To(BeNil())
 			Expect(lyrics).To(HaveLen(1))
 			Expect(lyrics[0].DisplayArtist).To(Equal("Test Artist"))
 			Expect(lyrics[0].DisplayTitle).To(Equal("Enhanced Test"))
 			Expect(lyrics[0].Lang).To(Equal("eng"))
 			Expect(lyrics[0].Synced).To(BeTrue())
 			Expect(lyrics[0].Line).To(HaveLen(3))
 			// Line 1: has inline markers → Cue array populated
 			Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(1000))))
 			Expect(lyrics[0].Line[0].End).To(Equal(gg.P(int64(3000))))
 			Expect(lyrics[0].Line[0].Value).To(Equal("Some lyrics here"))
 			Expect(lyrics[0].Line[0].Cue).To(HaveLen(3))
 			Expect(*lyrics[0].Line[0].Cue[0].Start).To(Equal(int64(1000)))
 			Expect(lyrics[0].Line[0].Cue[0].Value).To(Equal("Some "))
 			Expect(lyrics[0].Line[0].Cue[0].End).To(Equal(gg.P(int64(1500))))
 			Expect(lyrics[0].Line[0].Cue[0].ByteStart).To(Equal(0))
 			Expect(lyrics[0].Line[0].Cue[0].ByteEnd).To(Equal(4))
 			Expect(*lyrics[0].Line[0].Cue[1].Start).To(Equal(int64(1500)))
 			Expect(lyrics[0].Line[0].Cue[1].Value).To(Equal("lyrics "))
 			Expect(lyrics[0].Line[0].Cue[1].End).To(Equal(gg.P(int64(2000))))
 			Expect(lyrics[0].Line[0].Cue[1].ByteStart).To(Equal(5))
 			Expect(lyrics[0].Line[0].Cue[1].ByteEnd).To(Equal(11))
 			Expect(*lyrics[0].Line[0].Cue[2].Start).To(Equal(int64(2000)))
 			Expect(lyrics[0].Line[0].Cue[2].Value).To(Equal("here"))
 			Expect(lyrics[0].Line[0].Cue[2].End).To(Equal(gg.P(int64(3000))))
 			Expect(lyrics[0].Line[0].Cue[2].ByteStart).To(Equal(12))
 			Expect(lyrics[0].Line[0].Cue[2].ByteEnd).To(Equal(15))
 			// Line 2: has inline markers
 			Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(3000))))
 			Expect(lyrics[0].Line[1].End).To(Equal(gg.P(int64(5000))))
 			Expect(lyrics[0].Line[1].Value).To(Equal("More words"))
 			Expect(lyrics[0].Line[1].Cue).To(HaveLen(2))
 			Expect(lyrics[0].Line[1].Cue[0].End).To(Equal(gg.P(int64(3500))))
 			Expect(lyrics[0].Line[1].Cue[1].End).To(Equal(gg.P(int64(5000))))
 			Expect(lyrics[0].Line[1].Cue[0].ByteStart).To(Equal(0))
 			Expect(lyrics[0].Line[1].Cue[0].ByteEnd).To(Equal(4))
 			Expect(lyrics[0].Line[1].Cue[1].ByteStart).To(Equal(5))
 			Expect(lyrics[0].Line[1].Cue[1].ByteEnd).To(Equal(9))
 			// Line 3: plain line, no cues
 			Expect(lyrics[0].Line[2].Start).To(Equal(gg.P(int64(5000))))
 			Expect(lyrics[0].Line[2].Value).To(Equal("Plain line without inline markers"))
 			Expect(lyrics[0].Line[2].Cue).To(BeNil())
 		})
 		It("should return Enhanced LRC lyrics from an ELRC file", func() {
 			mf := model.MediaFile{Path: "tests/fixtures/test.mp3"}
 			lyrics, err := fromExternalFile(ctx, &mf, ".elrc")
 			Expect(err).To(BeNil())
 			Expect(lyrics).To(HaveLen(1))
 			Expect(lyrics[0].DisplayArtist).To(Equal("ELRC Artist"))
 			Expect(lyrics[0].DisplayTitle).To(Equal("ELRC Song"))
 			Expect(lyrics[0].Lang).To(Equal("eng"))
 			Expect(lyrics[0].Synced).To(BeTrue())
 			Expect(lyrics[0].Line).To(HaveLen(2))
 			Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(1000))))
 			Expect(lyrics[0].Line[0].End).To(Equal(gg.P(int64(3000))))
 			Expect(lyrics[0].Line[0].Value).To(Equal("Lead words"))
 			Expect(lyrics[0].Line[0].Cue).To(HaveLen(2))
 			Expect(*lyrics[0].Line[0].Cue[0].Start).To(Equal(int64(1000)))
 			Expect(lyrics[0].Line[0].Cue[0].Value).To(Equal("Lead "))
 			Expect(lyrics[0].Line[0].Cue[0].End).To(Equal(gg.P(int64(1500))))
 			Expect(lyrics[0].Line[0].Cue[0].ByteStart).To(Equal(0))
 			Expect(lyrics[0].Line[0].Cue[0].ByteEnd).To(Equal(4))
 			Expect(*lyrics[0].Line[0].Cue[1].Start).To(Equal(int64(1500)))
 			Expect(lyrics[0].Line[0].Cue[1].Value).To(Equal("words"))
 			Expect(lyrics[0].Line[0].Cue[1].End).To(Equal(gg.P(int64(3000))))
 			Expect(lyrics[0].Line[0].Cue[1].ByteStart).To(Equal(5))
 			Expect(lyrics[0].Line[0].Cue[1].ByteEnd).To(Equal(9))
 			Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(3000))))
 			Expect(lyrics[0].Line[1].Value).To(Equal("Fallback line"))
 			Expect(lyrics[0].Line[1].Cue).To(BeNil())
 		})
 		It("should return unsynchronized lyrics from a file", func() {
 			mf := model.MediaFile{Path: "tests/fixtures/test.mp3"}
 			lyrics, err := fromExternalFile(ctx, &mf, ".txt")
@ -109,6 +192,66 @@ var _ = Describe("sources", func() {
 			}))
 		})
 		It("should return synchronized lyrics from an SRT file", func() {
 			mf := model.MediaFile{Path: "tests/fixtures/test.mp3"}
 			lyrics, err := fromExternalFile(ctx, &mf, ".srt")
 			Expect(err).To(BeNil())
 			Expect(lyrics).To(Equal(model.LyricList{
 				model.Lyrics{
 					Lang: "xxx",
 					Line: []model.Line{
 						{
 							Start: gg.P(int64(18800)),
 							End:   gg.P(int64(22800)),
 							Value: "We're from subtitles",
 						},
 						{
 							Start: gg.P(int64(22801)),
 							End:   gg.P(int64(26000)),
 							Value: "Another subtitle line",
 						},
 					},
 					Synced: true,
 				},
 			}))
 		})
 		It("should return synchronized multilingual lyrics from a TTML file", func() {
 			mf := model.MediaFile{Path: "tests/fixtures/test.mp3"}
 			lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
 			Expect(err).To(BeNil())
 			Expect(lyrics).To(Equal(model.LyricList{
 				{
 					Kind: "main",
 					Lang: "eng",
 					Line: []model.Line{
 						{
 							Start: gg.P(int64(18800)),
 							Value: "We're no strangers to love",
 						},
 						{
 							Start: gg.P(int64(22800)),
 							Value: "You know the rules and so do I",
 						},
 					},
 					Synced: true,
 				},
 				{
 					Kind: "main",
 					Lang: "por",
 					Line: []model.Line{
 						{
 							Start: gg.P(int64(18800)),
 							Value: "Nao somos estranhos ao amor",
 						},
 					},
 					Synced: true,
 				},
 			}))
 		})
 		It("should handle LRC files with UTF-8 BOM marker (issue #4631)", func() {
 			// The function looks for <basePath-without-ext><suffix>, so we need to pass
 			// a MediaFile with .mp3 path and look for .lrc suffix
@ -142,5 +285,33 @@ var _ = Describe("sources", func() {
 			Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(22801))))
 			Expect(lyrics[0].Line[1].Value).To(Equal("You know the rules and so do I"))
 		})
 		It("should handle TTML files with UTF-8 BOM marker", func() {
 			mf := model.MediaFile{Path: "tests/fixtures/bom-test.mp3"}
 			lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
 			Expect(err).To(BeNil())
 			Expect(lyrics).To(HaveLen(1))
 			Expect(lyrics[0].Kind).To(Equal("main"))
 			Expect(lyrics[0].Synced).To(BeTrue())
 			Expect(lyrics[0].Line).To(HaveLen(1))
 			Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(0))))
 			Expect(lyrics[0].Line[0].Value).To(Equal("BOM test line"))
 		})
 		It("should handle UTF-16 BE encoded TTML files", func() {
 			mf := model.MediaFile{Path: "tests/fixtures/bom-utf16-test.mp3"}
 			lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
 			Expect(err).To(BeNil())
 			Expect(lyrics).To(HaveLen(1))
 			Expect(lyrics[0].Kind).To(Equal("main"))
 			Expect(lyrics[0].Synced).To(BeTrue())
 			Expect(lyrics[0].Line).To(HaveLen(2))
 			Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(18800))))
 			Expect(lyrics[0].Line[0].Value).To(Equal("UTF16 line one"))
 			Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(22801))))
 			Expect(lyrics[0].Line[1].Value).To(Equal("UTF16 line two"))
 		})
 	})
 })
--- a/core/lyrics/srt.go
+++ b/core/lyrics/srt.go
@ -0,0 +1,161 @@
 package lyrics
 import (
 	"bytes"
 	"regexp"
 	"strconv"
 	"strings"
 	"github.com/navidrome/navidrome/model"
 	"github.com/navidrome/navidrome/utils/str"
 )
 // srtTimeRegex matches one SRT timestamp of the form H:MM:SS,mmm. Hours may be
 // one or two digits, the fraction one to three digits, and either "," or "."
 // is accepted as the fraction separator. Surrounding whitespace is allowed.
 // Capture groups are, in order: hours, minutes, seconds, fraction.
 var srtTimeRegex = regexp.MustCompile(`^\s*(\d{1,2}):(\d{2}):(\d{2})[,.](\d{1,3})\s*$`)
 // parseSRT turns the contents of a SubRip (.srt) file into a single synced
 // lyrics track with language "xxx". Blocks that carry no usable cue are
 // skipped; a block whose timestamps fail to parse aborts the whole parse with
 // that error. A nil list and nil error are returned when no cue is found.
 func parseSRT(contents []byte) (model.LyricList, error) {
 	// Normalize CRLF and lone CR to LF in one pass so block splitting only
 	// has to deal with "\n".
 	normalized := strings.NewReplacer("\r\n", "\n", "\r", "\n").Replace(string(contents))
 	chunks := splitSRTBlocks(normalized)
 	parsed := make([]model.Line, 0, len(chunks))
 	for _, chunk := range chunks {
 		entry, keep, err := parseSRTBlock(chunk)
 		if err != nil {
 			return nil, err
 		}
 		if !keep {
 			continue
 		}
 		parsed = append(parsed, entry)
 	}
 	if len(parsed) == 0 {
 		return nil, nil
 	}
 	track := model.Lyrics{
 		Lang:   "xxx",
 		Line:   parsed,
 		Synced: true,
 	}
 	return model.LyricList{model.NormalizeLyrics(track)}, nil
 }
 // splitSRTBlocks splits LF-normalized SRT text into cue blocks. A block is a
 // run of consecutive non-blank lines; any line that is empty or contains only
 // whitespace acts as a separator. Each returned block is trimmed of leading
 // and trailing whitespace, and nil is returned when raw has no content.
 //
 // Separators are detected per line rather than by searching for "\n\n"
 // because SRT files often have stray spaces or tabs on the blank line between
 // cues; a literal "\n\n" split would glue the next cue's index and timing
 // line onto the previous cue's text.
 func splitSRTBlocks(raw string) []string {
 	var blocks []string
 	var current []string
 	// flush emits the lines collected so far as one block, if any.
 	flush := func() {
 		if len(current) == 0 {
 			return
 		}
 		blocks = append(blocks, strings.TrimSpace(strings.Join(current, "\n")))
 		current = current[:0]
 	}
 	for _, line := range strings.Split(raw, "\n") {
 		if strings.TrimSpace(line) == "" {
 			flush()
 			continue
 		}
 		current = append(current, line)
 	}
 	flush()
 	return blocks
 }
 // parseSRTBlock parses one SRT cue block into a lyrics line.
 //
 // The block may start with a numeric cue index, which is ignored. The next
 // line must be the timing line "<start> --> <end>"; the remaining non-empty
 // lines form the cue text, joined with "\n" and sanitized.
 //
 // The boolean result reports whether a line was produced. Blocks without a
 // well-formed timing line or without text yield (zero, false, nil). A timing
 // line whose timestamps cannot be parsed yields an error.
 func parseSRTBlock(block string) (model.Line, bool, error) {
 	var none model.Line
 	pieces := bytes.Split([]byte(block), []byte("\n"))
 	rows := make([]string, len(pieces))
 	for i, piece := range pieces {
 		rows[i] = strings.TrimSpace(string(piece))
 	}
 	// Skip the optional leading cue number.
 	timingIdx := 0
 	if len(rows) > 0 && digitsOnly(rows[0]) {
 		timingIdx = 1
 	}
 	if timingIdx >= len(rows) {
 		return none, false, nil
 	}
 	bounds := strings.Split(rows[timingIdx], "-->")
 	if len(bounds) != 2 {
 		return none, false, nil
 	}
 	startMs, err := parseSRTTime(bounds[0])
 	if err != nil {
 		return none, false, err
 	}
 	endMs, err := parseSRTTime(bounds[1])
 	if err != nil {
 		return none, false, err
 	}
 	body := make([]string, 0, len(rows)-timingIdx-1)
 	for _, row := range rows[timingIdx+1:] {
 		if row != "" {
 			body = append(body, row)
 		}
 	}
 	text := str.SanitizeText(strings.Join(body, "\n"))
 	if text == "" {
 		return none, false, nil
 	}
 	cue := model.Line{
 		Start: &startMs,
 		End:   &endMs,
 		Value: text,
 	}
 	return cue, true, nil
 }
 // parseSRTTime converts an SRT timestamp accepted by srtTimeRegex into
 // milliseconds. A fraction shorter than three digits is read as a decimal
 // fraction of a second, so "5" means 500 ms and "50" means 500 ms.
 // strconv.ErrSyntax is returned when value does not match the expected shape.
 func parseSRTTime(value string) (int64, error) {
 	groups := srtTimeRegex.FindStringSubmatch(strings.TrimSpace(value))
 	if groups == nil {
 		return 0, strconv.ErrSyntax
 	}
 	// groups[1:] holds hours, minutes, seconds and fraction, in that order.
 	var fields [4]int64
 	for i, text := range groups[1:] {
 		n, err := strconv.ParseInt(text, 10, 64)
 		if err != nil {
 			return 0, err
 		}
 		fields[i] = n
 	}
 	hours, minutes, seconds, millis := fields[0], fields[1], fields[2], fields[3]
 	// Pad the fraction out to three digits.
 	for digits := len(groups[4]); digits < 3; digits++ {
 		millis *= 10
 	}
 	return hours*3_600_000 + minutes*60_000 + seconds*1_000 + millis, nil
 }
 // digitsOnly reports whether value is non-empty and made up solely of the
 // ASCII digits '0' through '9'.
 func digitsOnly(value string) bool {
 	// Scanning bytes is enough: every byte of a multi-byte UTF-8 sequence is
 	// >= 0x80 and therefore never falls in the '0'..'9' range.
 	for i := 0; i < len(value); i++ {
 		if c := value[i]; c < '0' || c > '9' {
 			return false
 		}
 	}
 	return len(value) > 0
 }
--- a/core/lyrics/ttml.go
+++ b/core/lyrics/ttml.go
--- a/core/lyrics/ttml_test.go
+++ b/core/lyrics/ttml_test.go
@ -0,0 +1,407 @@
 package lyrics
 import (
 	"github.com/navidrome/navidrome/model"
 	"github.com/navidrome/navidrome/utils/gg"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 )
 var _ = Describe("parseTTML", func() {
 	Describe("Multi-language and timing", func() {
 		It("should parse multiple language divs with inherited offsets and frame/tick timing", func() {
 			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttp="http://www.w3.org/ns/ttml#parameter" ttp:frameRate="30" ttp:subFrameRate="2" ttp:tickRate="10">
  <body>
    <div xml:lang="eng" begin="1s">
      <p begin="2s">Line one</p>
      <p begin="00:00:04:15.1"><span>Line two</span><br/>with break</p>
    </div>
    <div xml:lang="por">
      <p begin="45t">Linha</p>
    </div>
  </body>
 </tt>`)
 			list, err := parseTTML(content)
 			Expect(err).ToNot(HaveOccurred())
 			Expect(list).To(HaveLen(2))
 			By("parsing the English track")
 			eng := list[0]
 			Expect(eng.Lang).To(Equal("eng"))
 			Expect(eng.Synced).To(BeTrue())
 			Expect(eng.Line[0].Start).To(Equal(gg.P(int64(3000))))
 			Expect(eng.Line[0].Value).To(Equal("Line one"))
 			Expect(eng.Line[1].Start).To(Equal(gg.P(int64(4517))))
 			Expect(eng.Line[1].Value).To(Equal("Line two\nwith break"))
 			By("parsing the Portuguese track")
 			por := list[1]
 			Expect(por.Lang).To(Equal("por"))
 			Expect(por.Line[0].Start).To(Equal(gg.P(int64(4500))))
 			Expect(por.Line[0].Value).To(Equal("Linha"))
 		})
 	})
 	Describe("Unsupported cue handling", func() {
 		It("should skip wallclock cues and keep valid ones", func() {
 			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml">
  <body xml:lang="eng">
    <div>
      <p begin="wallclock(2026-01-01T00:00:00Z)">Skip me</p>
      <p begin="1s">Keep me</p>
    </div>
  </body>
 </tt>`)
 			list, err := parseTTML(content)
 			Expect(err).ToNot(HaveOccurred())
 			Expect(list).To(HaveLen(1))
 			Expect(list[0].Line).To(HaveLen(1))
 			Expect(list[0].Line[0].Start).To(Equal(gg.P(int64(1000))))
 			Expect(list[0].Line[0].Value).To(Equal("Keep me"))
 		})
 	})
 	Describe("Begin/End/Dur with inheritance", func() {
 		It("should correctly accumulate nested timing from body, div, and p elements", func() {
 			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml">
  <body xml:lang="eng" begin="10s">
    <div begin="5s" dur="8s">
      <p begin="1s" dur="2s">First line</p>
      <p begin="3s" end="5s">Second line</p>
    </div>
  </body>
 </tt>`)
 			list, err := parseTTML(content)
 			Expect(err).ToNot(HaveOccurred())
 			Expect(list).To(HaveLen(1))
 			Expect(list[0].Lang).To(Equal("eng"))
 			Expect(list[0].Line).To(HaveLen(2))
 			Expect(list[0].Line[0].Start).To(Equal(gg.P(int64(16000))))
 			Expect(list[0].Line[0].Value).To(Equal("First line"))
 			Expect(list[0].Line[1].Start).To(Equal(gg.P(int64(18000))))
 			Expect(list[0].Line[1].Value).To(Equal("Second line"))
 		})
 	})
 	Describe("Non-standard bare second offsets", func() {
 		It("should parse bare decimal numbers as seconds", func() {
 			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml">
  <body xml:lang="eng" begin="10">
    <div>
      <p begin="0.170">First line</p>
      <p begin="3.710">Second line</p>
    </div>
  </body>
 </tt>`)
 			list, err := parseTTML(content)
 			Expect(err).ToNot(HaveOccurred())
 			Expect(list).To(HaveLen(1))
 			Expect(list[0].Line).To(HaveLen(2))
 			Expect(list[0].Line[0].Start).To(Equal(gg.P(int64(10170))))
 			Expect(list[0].Line[0].Value).To(Equal("First line"))
 			Expect(list[0].Line[1].Start).To(Equal(gg.P(int64(13710))))
 			Expect(list[0].Line[1].Value).To(Equal("Second line"))
 		})
 	})
 	Describe("Word timing tokens", func() {
 		It("should extract timed tokens from spans including background role", func() {
 			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttm="http://www.w3.org/ns/ttml#metadata">
  <body xml:lang="eng">
    <div>
      <p begin="00:01.000" end="00:03.000">
        <span begin="00:01.000" end="00:01.400">He</span><span begin="00:01.400" end="00:01.800">llo</span>
        <span ttm:role="x-bg"><span begin="00:02.000" end="00:02.500">echo</span></span>
      </p>
    </div>
  </body>
 </tt>`)
 			list, err := parseTTML(content)
 			Expect(err).ToNot(HaveOccurred())
 			Expect(list).To(HaveLen(1))
 			Expect(list[0].Agents).To(Equal([]model.Agent{
 				{ID: "main", Role: "main"},
 				{ID: "__nd_bg__|main", Role: "bg"},
 			}))
 			Expect(list[0].Line).To(HaveLen(1))
 			line := list[0].Line[0]
 			Expect(line.Start).To(Equal(gg.P(int64(1000))))
 			Expect(line.Value).To(Equal("Hello\necho"))
 			Expect(line.End).To(Equal(gg.P(int64(3000))))
 			Expect(line.Cue).To(HaveLen(3))
 			Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(1000)), End: gg.P(int64(1400)), Value: "He", ByteStart: 0, ByteEnd: 1, AgentID: "main"}))
 			Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(1400)), End: gg.P(int64(1800)), Value: "llo", ByteStart: 2, ByteEnd: 4, AgentID: "main"}))
 			Expect(line.Cue[2]).To(Equal(model.Cue{Start: gg.P(int64(2000)), End: gg.P(int64(2500)), Value: "echo", ByteStart: 6, ByteEnd: 9, AgentID: "__nd_bg__|main"}))
 		})
 		It("should parse named TTML agents into main, voice, and group roles", func() {
 			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttm="http://www.w3.org/ns/ttml#metadata">
  <head>
    <metadata>
      <ttm:agent xml:id="v1" type="person"><ttm:name>Chris Martin</ttm:name></ttm:agent>
      <ttm:agent xml:id="v2" type="person"><ttm:name>Jin</ttm:name></ttm:agent>
      <ttm:agent xml:id="v1000" type="group"><ttm:name>All</ttm:name></ttm:agent>
    </metadata>
  </head>
  <body xml:lang="eng">
    <div>
      <p begin="1s" end="2s" ttm:agent="v1"><span begin="1s" end="1.5s">You</span></p>
      <p begin="2s" end="3s" ttm:agent="v2"><span begin="2s" end="2.5s">and</span></p>
      <p begin="3s" end="4s" ttm:agent="v1000"><span begin="3s" end="3.5s">All</span></p>
    </div>
  </body>
 </tt>`)
 			list, err := parseTTML(content)
 			Expect(err).ToNot(HaveOccurred())
 			Expect(list).To(HaveLen(1))
 			Expect(list[0].Agents).To(Equal([]model.Agent{
 				{ID: "v1", Role: "main", Name: "Chris Martin"},
 				{ID: "v2", Role: "voice", Name: "Jin"},
 				{ID: "v1000", Role: "group", Name: "All"},
 			}))
 			Expect(list[0].Line[0].Cue[0].AgentID).To(Equal("v1"))
 			Expect(list[0].Line[1].Cue[0].AgentID).To(Equal("v2"))
 			Expect(list[0].Line[2].Cue[0].AgentID).To(Equal("v1000"))
 		})
 		It("should avoid collisions between derived background agents and explicit TTML agent ids", func() {
 			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttm="http://www.w3.org/ns/ttml#metadata">
  <head>
    <metadata>
      <ttm:agent xml:id="lead" type="person"><ttm:name>Lead</ttm:name></ttm:agent>
      <ttm:agent xml:id="lead__bg" type="person"><ttm:name>Existing Background Id</ttm:name></ttm:agent>
    </metadata>
  </head>
  <body xml:lang="eng">
    <div>
      <p begin="1s" end="2s" ttm:agent="lead">
        <span begin="1s" end="1.4s">Lead</span>
        <span ttm:role="x-bg"><span begin="1.5s" end="1.8s">Echo</span></span>
      </p>
      <p begin="2s" end="3s" ttm:agent="lead__bg">
        <span begin="2s" end="2.5s">Named</span>
      </p>
    </div>
  </body>
 </tt>`)
 			list, err := parseTTML(content)
 			Expect(err).ToNot(HaveOccurred())
 			Expect(list).To(HaveLen(1))
 			Expect(list[0].Agents).To(Equal([]model.Agent{
 				{ID: "lead", Role: "main", Name: "Lead"},
 				{ID: "__nd_bg__|lead", Role: "bg", Name: "Lead"},
 				{ID: "lead__bg", Role: "voice", Name: "Existing Background Id"},
 			}))
 			Expect(list[0].Line).To(HaveLen(2))
 			Expect(list[0].Line[0].Cue).To(HaveLen(2))
 			Expect(list[0].Line[0].Cue[0].AgentID).To(Equal("lead"))
 			Expect(list[0].Line[0].Cue[1].AgentID).To(Equal("__nd_bg__|lead"))
 			Expect(list[0].Line[1].Cue).To(HaveLen(1))
 			Expect(list[0].Line[1].Cue[0].AgentID).To(Equal("lead__bg"))
 		})
 		It("should fill missing cue agent ids with the resolved main agent", func() {
 			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttm="http://www.w3.org/ns/ttml#metadata">
  <head>
    <metadata>
      <ttm:agent xml:id="guest" type="person"><ttm:name>Guest Vocal</ttm:name></ttm:agent>
    </metadata>
  </head>
  <body xml:lang="eng">
    <div>
      <p begin="1s" end="3s">
        <span begin="1s" end="1.4s">Lead</span>
        <span begin="2s" end="2.4s" ttm:agent="guest">Guest</span>
      </p>
    </div>
  </body>
 </tt>`)
 			list, err := parseTTML(content)
 			Expect(err).ToNot(HaveOccurred())
 			Expect(list).To(HaveLen(1))
 			Expect(list[0].Agents).To(Equal([]model.Agent{
 				{ID: "guest", Role: "main", Name: "Guest Vocal"},
 			}))
 			Expect(list[0].Line).To(HaveLen(1))
 			Expect(list[0].Line[0].Cue).To(HaveLen(2))
 			Expect(list[0].Line[0].Cue[0].AgentID).To(Equal("guest"))
 			Expect(list[0].Line[0].Cue[1].AgentID).To(Equal("guest"))
 		})
 	})
 	Describe("Ambiguous decimal timing", func() {
 		It("should prefer absolute timing when values fall inside parent window", func() {
 			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml">
  <body xml:lang="eng">
    <div begin="37.870" end="45.570">
      <p begin="43.444" end="45.570">
        <span begin="43.444" end="43.716">go</span>
        <span begin="43.716" end="43.887">go</span>
      </p>
    </div>
  </body>
 </tt>`)
 			list, err := parseTTML(content)
 			Expect(err).ToNot(HaveOccurred())
 			Expect(list).To(HaveLen(1))
 			Expect(list[0].Line).To(HaveLen(1))
 			line := list[0].Line[0]
 			Expect(line.Start).To(Equal(gg.P(int64(43444))))
 			Expect(line.Value).To(Equal("go\ngo"))
 			Expect(line.End).To(Equal(gg.P(int64(45570))))
 			Expect(line.Cue).To(HaveLen(2))
 			Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(43444)), End: gg.P(int64(43716)), Value: "go", ByteStart: 0, ByteEnd: 1}))
 			Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(43716)), End: gg.P(int64(43887)), Value: "go", ByteStart: 3, ByteEnd: 4}))
 		})
 	})
 	Describe("Unsynced fallback", func() {
 		It("should return unsynced lyrics when no timing is present", func() {
 			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml">
  <body>
    <div>
      <p>No timing here</p>
    </div>
  </body>
 </tt>`)
 			list, err := parseTTML(content)
 			Expect(err).ToNot(HaveOccurred())
 			Expect(list).To(HaveLen(1))
 			Expect(list[0].Lang).To(Equal("xxx"))
 			Expect(list[0].Synced).To(BeFalse())
 			Expect(list[0].Line).To(HaveLen(1))
 			Expect(list[0].Line[0].Start).To(BeNil())
 			Expect(list[0].Line[0].Value).To(Equal("No timing here"))
 		})
 	})
 	// iTunesMetadata in <head> links extra text to body lines through the
 	// itunes:key attribute; each translation/transliteration becomes its own track.
 	Describe("Metadata tracks", func() {
 		It("should produce main, translation, and pronunciation tracks from iTunesMetadata", func() {
 			// The fixture has two keyed body lines (L1, L2), one translation for L1, one
 			// translation for a key that does not exist ("MISSING"), and one
 			// transliteration for L2 made of two timed spans.
 			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml" xmlns:itunes="http://music.apple.com/lyric-ttml-internal">
  <head>
    <metadata>
      <iTunesMetadata xmlns="http://music.apple.com/lyric-ttml-internal">
        <translations>
          <translation xml:lang="es">
            <text for="L1">Hola</text>
            <text for="MISSING">Skip me</text>
          </translation>
        </translations>
        <transliterations>
          <transliteration xml:lang="ja-Latn">
            <text for="L2"><span begin="00:02.000" end="00:02.300" xmlns="http://www.w3.org/ns/ttml">ko</span><span begin="00:02.300" end="00:02.600" xmlns="http://www.w3.org/ns/ttml">nni</span></text>
          </transliteration>
        </transliterations>
      </iTunesMetadata>
    </metadata>
  </head>
  <body xml:lang="ja">
    <div>
      <p begin="00:01.000" end="00:01.500" itunes:key="L1">こんにちは</p>
      <p begin="00:02.000" end="00:02.700" itunes:key="L2">こんばんは</p>
    </div>
  </body>
 </tt>`)
 			list, err := parseTTML(content)
 			Expect(err).ToNot(HaveOccurred())
 			// Expected order: main track first, then translation, then pronunciation.
 			Expect(list).To(HaveLen(3))
 			By("checking the main track")
 			main := list[0]
 			Expect(main.Kind).To(Equal("main"))
 			Expect(main.Lang).To(Equal("ja"))
 			Expect(main.Line).To(HaveLen(2))
 			By("checking the translation track")
 			translation := list[1]
 			Expect(translation.Kind).To(Equal("translation"))
 			Expect(translation.Lang).To(Equal("es"))
 			// Only the L1 entry survives; the "MISSING" entry matches no body line.
 			Expect(translation.Line).To(HaveLen(1))
 			// The translated line takes its timing from the body line it is keyed to.
 			Expect(translation.Line[0].Start).To(Equal(gg.P(int64(1000))))
 			Expect(translation.Line[0].Value).To(Equal("Hola"))
 			Expect(translation.Line[0].End).To(Equal(gg.P(int64(1500))))
 			By("checking the pronunciation track")
 			pronunciation := list[2]
 			Expect(pronunciation.Kind).To(Equal("pronunciation"))
 			// The language tag is expected in lower case ("ja-Latn" in the fixture).
 			Expect(pronunciation.Lang).To(Equal("ja-latn"))
 			Expect(pronunciation.Line).To(HaveLen(1))
 			Expect(pronunciation.Line[0].Start).To(Equal(gg.P(int64(2000))))
 			Expect(pronunciation.Line[0].Value).To(Equal("konni"))
 			// End is 2600 (the last span's end), not the 2700 of the keyed body line.
 			Expect(pronunciation.Line[0].End).To(Equal(gg.P(int64(2600))))
 			Expect(pronunciation.Line[0].Cue).To(HaveLen(2))
 			// ByteStart/ByteEnd are inclusive byte offsets into "konni".
 			Expect(pronunciation.Line[0].Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(2000)), End: gg.P(int64(2300)), Value: "ko", ByteStart: 0, ByteEnd: 1}))
 			Expect(pronunciation.Line[0].Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(2300)), End: gg.P(int64(2600)), Value: "nni", ByteStart: 2, ByteEnd: 4}))
 		})
 	})
 	// Span times written as plain seconds ("2.747") must be read as seconds,
 	// i.e. 2747 ms, just like the clock form used on the body line.
 	Describe("Pronunciation with bare decimal end times", func() {
 		It("should correctly parse bare decimal times in transliteration spans", func() {
 			// The transliteration spans are separated by single spaces, which must be
 			// kept in the line value and accounted for in the cue byte offsets.
 			content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml" xmlns:itunes="http://music.apple.com/lyric-ttml-internal">
  <head>
    <metadata>
      <iTunesMetadata xmlns="http://music.apple.com/lyric-ttml-internal">
        <transliterations>
          <transliteration xml:lang="ja-Latn">
            <text for="L1"><span begin="2.747" end="3.018" xmlns="http://www.w3.org/ns/ttml">I</span> <span begin="3.018" end="3.179" xmlns="http://www.w3.org/ns/ttml">woke</span> <span begin="3.179" end="3.582" xmlns="http://www.w3.org/ns/ttml">up</span></text>
          </transliteration>
        </transliterations>
      </iTunesMetadata>
    </metadata>
  </head>
  <body xml:lang="ja">
    <div>
      <p begin="00:02.747" end="00:04.000" itunes:key="L1">起きた</p>
    </div>
  </body>
 </tt>`)
 			list, err := parseTTML(content)
 			Expect(err).ToNot(HaveOccurred())
 			// Locate the pronunciation track by kind rather than by position.
 			var pronunciation *model.Lyrics
 			for i := range list {
 				if list[i].Kind == "pronunciation" {
 					pronunciation = &list[i]
 					break
 				}
 			}
 			Expect(pronunciation).ToNot(BeNil())
 			Expect(pronunciation.Line).To(HaveLen(1))
 			line := pronunciation.Line[0]
 			Expect(line.Start).To(Equal(gg.P(int64(2747))))
 			Expect(line.Value).To(Equal("I woke up"))
 			Expect(line.Cue).To(HaveLen(3))
 			// Offsets are inclusive, so the one-byte cue "I" has ByteStart == ByteEnd == 0;
 			// the separating spaces (bytes 1 and 6) belong to no cue.
 			Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(2747)), End: gg.P(int64(3018)), Value: "I", ByteStart: 0, ByteEnd: 0}))
 			Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(3018)), End: gg.P(int64(3179)), Value: "woke", ByteStart: 2, ByteEnd: 5}))
 			Expect(line.Cue[2]).To(Equal(model.Cue{Start: gg.P(int64(3179)), End: gg.P(int64(3582)), Value: "up", ByteStart: 7, ByteEnd: 8}))
 		})
 	})
 })
--- a/model/lyrics.go
+++ b/model/lyrics.go
@ -6,23 +6,43 @@ import (
 	"slices"
 	"strconv"
 	"strings"
 	"unicode"
 	"github.com/navidrome/navidrome/log"
 	"github.com/navidrome/navidrome/utils/str"
 )
 // Cue is a timed fragment of a lyric Line, such as a single word or syllable.
 //
 // Start and End are timestamps in milliseconds and may each be nil when
 // unknown. Value is the fragment text. ByteStart and ByteEnd are inclusive
 // byte offsets of the fragment within the owning Line's Value. AgentID, when
 // set, refers to the ID of an Agent listed on the enclosing Lyrics.
 type Cue struct {
 	Start     *int64 `structs:"start,omitempty"   json:"start,omitempty"`
 	End       *int64 `structs:"end,omitempty"     json:"end,omitempty"`
 	Value     string `structs:"value"             json:"value"`
 	ByteStart int    `structs:"byteStart"         json:"byteStart"`
 	ByteEnd   int    `structs:"byteEnd"           json:"byteEnd"`
 	AgentID   string `structs:"agentId,omitempty" json:"agentId,omitempty"`
 }
 // Agent describes a performer that cues can be attributed to through
 // Cue.AgentID. ID is the key cues refer to, Role classifies the agent (the
 // Subsonic response builder treats the role "main" specially when ordering
 // cue lines), and Name is an optional display name.
 type Agent struct {
 	ID   string `structs:"id"             json:"id"`
 	Role string `structs:"role"           json:"role"`
 	Name string `structs:"name,omitempty" json:"name,omitempty"`
 }
 // Line is one lyric line. Start and End are timestamps in milliseconds and are
 // nil when the line carries no timing. Value is the plain text of the line and
 // Cue optionally holds word-level timing for fragments of that text.
 type Line struct {
 	Start *int64 `structs:"start,omitempty" json:"start,omitempty"`
 	End   *int64 `structs:"end,omitempty"   json:"end,omitempty"`
 	Value string `structs:"value"           json:"value"`
 	Cue   []Cue  `structs:"cue,omitempty"   json:"cue,omitempty"`
 }
 // Lyrics is a single lyric track: its lines plus descriptive metadata.
 // Kind distinguishes track types (the tests in this change use "main",
 // "translation" and "pronunciation"), Lang is the track language, Agents lists
 // the performers that cues may reference, and Synced reports whether the lines
 // carry timing.
 type Lyrics struct {
-	DisplayArtist string `structs:"displayArtist,omitempty" json:"displayArtist,omitempty"`
+	DisplayArtist string  `structs:"displayArtist,omitempty" json:"displayArtist,omitempty"`
-	DisplayTitle  string `structs:"displayTitle,omitempty"  json:"displayTitle,omitempty"`
+	DisplayTitle  string  `structs:"displayTitle,omitempty"  json:"displayTitle,omitempty"`
-	Lang          string `structs:"lang"                    json:"lang"`
+	Kind          string  `structs:"kind,omitempty"          json:"kind,omitempty"`
-	Line          []Line `structs:"line"                    json:"line"`
+	Lang          string  `structs:"lang"                    json:"lang"`
-	Offset        *int64 `structs:"offset,omitempty"        json:"offset,omitempty"`
+	Agents        []Agent `structs:"agents,omitempty"       json:"agents,omitempty"`
-	Synced        bool   `structs:"synced"                  json:"synced"`
+	Line          []Line  `structs:"line"                    json:"line"`
 	Offset        *int64  `structs:"offset,omitempty"        json:"offset,omitempty"`
 	Synced        bool    `structs:"synced"                  json:"synced"`
 }
 // support the standard [mm:ss.mm], as well as [hh:*] and [*.mmm]
@ -33,6 +53,10 @@ var (
 	syncRegex  = regexp.MustCompile(`(^|\n)\s*` + timeRegexString)
 	timeRegex  = regexp.MustCompile(timeRegexString)
 	lrcIdRegex = regexp.MustCompile(`\[(ar|ti|offset|lang):([^]]+)]`)
 	// Enhanced LRC: inline word-level timing markers like <00:12.34>
 	enhancedLRCTimeString = `<([0-9]{1,2}:)?([0-9]{1,2}):([0-9]{1,2})(.[0-9]{1,3})?>`
 	enhancedLRCRegex      = regexp.MustCompile(enhancedLRCTimeString)
 )
 func (l Lyrics) IsEmpty() bool {
@ -106,9 +130,11 @@ func ToLyrics(language, text string) (*Lyrics, error) {
 			if validLine {
 				for idx := range timestamps {
 					value, cues := parseEnhancedLine(priorLine)
 					structuredLines = append(structuredLines, Line{
 						Start: &timestamps[idx],
-						Value: strings.TrimSpace(priorLine),
+						Value: value,
 						Cue:   cues,
 					})
 				}
 				timestamps = nil
@ -154,9 +180,11 @@ func ToLyrics(language, text string) (*Lyrics, error) {
 	if validLine {
 		for idx := range timestamps {
 			value, cues := parseEnhancedLine(priorLine)
 			structuredLines = append(structuredLines, Line{
 				Start: &timestamps[idx],
-				Value: strings.TrimSpace(priorLine),
+				Value: value,
 				Cue:   cues,
 			})
 		}
 	}
@ -173,13 +201,118 @@ func ToLyrics(language, text string) (*Lyrics, error) {
 		DisplayArtist: artist,
 		DisplayTitle:  title,
 		Lang:          language,
-		Line:          structuredLines,
+		Line:          NormalizeCueLines(structuredLines),
 		Offset:        offset,
 		Synced:        synced,
 	}
 	return &lyrics, nil
 }
 // parseEnhancedLine splits an Enhanced LRC line into its plain text and its
 // word-level cues.
 //
 // Inline markers such as <00:12.34> are removed from the returned value, which
 // is whitespace-trimmed. Every marker that is followed by text yields one Cue
 // whose Start is the marker time and whose ByteStart/ByteEnd are inclusive
 // UTF-8 byte offsets into the returned value. Cue End times are not set here.
 //
 // Text that no usable marker covers (anything ahead of the first marker, or
 // the text after a marker whose time cannot be parsed) is kept in the value
 // without a cue, so lyric text is never dropped from the line.
 //
 // The returned cue slice is nil when the line has no markers or when no marker
 // produced a cue.
 func parseEnhancedLine(text string) (string, []Cue) {
 	matches := enhancedLRCRegex.FindAllStringSubmatchIndex(text, -1)
 	if len(matches) == 0 {
 		return strings.TrimSpace(text), nil
 	}
 	type segment struct {
 		start    int64
 		rawStart int
 		rawEnd   int
 	}
 	segments := make([]segment, 0, len(matches))
 	var rawValue strings.Builder
 	rawValue.Grow(len(text))
 	// Untimed text ahead of the first marker is still part of the lyric line.
 	rawValue.WriteString(text[:matches[0][0]])
 	for i, match := range matches {
 		// Text runs from after this marker to the start of the next marker (or end of string)
 		textEnd := len(text)
 		if i+1 < len(matches) {
 			textEnd = matches[i+1][0]
 		}
 		word := text[match[1]:textEnd]
 		if word == "" {
 			continue
 		}
 		rawStart := rawValue.Len()
 		rawValue.WriteString(word)
 		// parseTime is handed the marker rewritten as "[...]" together with submatch
 		// indices made relative to the start of that marker.
 		groups := make([]int, len(match))
 		for g := range match {
 			groups[g] = adjustGroup(match, g)
 		}
 		timeMs, err := parseTime("["+text[match[0]+1:match[1]-1]+"]", groups)
 		if err != nil {
 			// The text stays in the value; it just does not get a cue.
 			continue
 		}
 		segments = append(segments, segment{
 			start:    timeMs,
 			rawStart: rawStart,
 			rawEnd:   rawValue.Len(),
 		})
 	}
 	if len(segments) == 0 {
 		return strings.TrimSpace(stripEnhancedMarkers(text)), nil
 	}
 	// The value is returned trimmed, so offsets computed against the raw text must
 	// be shifted by the leading whitespace and clamped to the trimmed end.
 	finalRaw := rawValue.String()
 	leftTrimBytes := len(finalRaw) - len(strings.TrimLeftFunc(finalRaw, unicode.IsSpace))
 	rightTrimBytes := len(finalRaw) - len(strings.TrimRightFunc(finalRaw, unicode.IsSpace))
 	trimmedEnd := max(len(finalRaw)-rightTrimBytes, leftTrimBytes)
 	cues := make([]Cue, 0, len(segments))
 	for _, seg := range segments {
 		start := seg.start
 		byteStart := max(seg.rawStart, leftTrimBytes)
 		byteEnd := min(seg.rawEnd, trimmedEnd)
 		if byteStart >= byteEnd {
 			// Nothing of this segment survives trimming.
 			continue
 		}
 		cues = append(cues, Cue{
 			Start:     &start,
 			Value:     finalRaw[byteStart:byteEnd],
 			ByteStart: byteStart - leftTrimBytes,
 			// ByteEnd is inclusive, hence the -1.
 			ByteEnd: byteEnd - leftTrimBytes - 1,
 		})
 	}
 	return strings.TrimSpace(finalRaw), cues
 }
 // adjustGroup converts one entry of a regexp submatch index slice from an
 // offset into the full line to an offset relative to the start of the match
 // (match[0]). Entries equal to -1, which mark a group that did not participate
 // in the match, are returned unchanged.
 func adjustGroup(match []int, groupIdx int) int {
 	if pos := match[groupIdx]; pos != -1 {
 		return pos - match[0]
 	}
 	return -1
 }
 // stripEnhancedMarkers returns text with every inline Enhanced LRC time marker
 // (anything matched by enhancedLRCRegex) deleted, leaving only the lyric text.
 func stripEnhancedMarkers(text string) string {
 	plain := enhancedLRCRegex.ReplaceAllLiteralString(text, "")
 	return plain
 }
 func parseTime(line string, match []int) (int64, error) {
 	var hours, millis int64
 	var err error
@ -227,3 +360,115 @@ func parseTime(line string, match []int) (int64, error) {
 }
 type LyricList []Lyrics
 // NormalizeLyrics returns lyrics with its lines passed through
 // NormalizeCueLines and with an empty Agents slice collapsed to nil.
 func NormalizeLyrics(lyrics Lyrics) Lyrics {
 	if len(lyrics.Agents) == 0 {
 		lyrics.Agents = nil
 	}
 	lyrics.Line = NormalizeCueLines(lyrics.Line)
 	return lyrics
 }
 // NormalizeCueLines returns a new slice holding every line of lines after it
 // has been run through normalizeCueLine. The fallback end handed to each line
 // is the line's own End when present, otherwise the Start of the following
 // line, otherwise nil. An empty input is returned as is.
 func NormalizeCueLines(lines []Line) []Line {
 	if len(lines) == 0 {
 		return lines
 	}
 	out := make([]Line, 0, len(lines))
 	for idx, current := range lines {
 		var boundary *int64
 		switch {
 		case current.End != nil:
 			end := *current.End
 			boundary = &end
 		case idx+1 < len(lines) && lines[idx+1].Start != nil:
 			nextStart := *lines[idx+1].Start
 			boundary = &nextStart
 		}
 		out = append(out, normalizeCueLine(current, boundary))
 	}
 	return out
 }
 // NormalizeLineTiming fills in a line's missing Start and End from its cues.
 // A nil Start becomes the earliest cue Start; a nil End becomes the latest cue
 // End, where a cue without an End contributes its Start instead. Values that
 // are already set on the line are kept, and a line without cues is returned
 // unchanged.
 func NormalizeLineTiming(line Line) Line {
 	if len(line.Cue) == 0 {
 		return line
 	}
 	var (
 		minStart, maxEnd   int64
 		haveStart, haveEnd bool
 	)
 	for _, cue := range line.Cue {
 		if cue.Start != nil && (!haveStart || *cue.Start < minStart) {
 			minStart, haveStart = *cue.Start, true
 		}
 		// A cue with no End still bounds the line by its Start.
 		bound := cue.End
 		if bound == nil {
 			bound = cue.Start
 		}
 		if bound != nil && (!haveEnd || *bound > maxEnd) {
 			maxEnd, haveEnd = *bound, true
 		}
 	}
 	if haveStart && line.Start == nil {
 		line.Start = &minStart
 	}
 	if haveEnd && line.End == nil {
 		line.End = &maxEnd
 	}
 	return line
 }
 // normalizeCueLine completes the cue End times of a single line and then
 // derives any missing line Start/End through NormalizeLineTiming.
 //
 // A cue without an End takes the Start of the cue that follows it; when that is
 // not available it takes fallbackEnd (which may be nil). If any cue is still
 // missing its End afterwards, End is cleared on every cue so that a line either
 // has a complete set of cue ends or none at all.
 //
 // The cues are edited on a private copy: line is received by value, but
 // line.Cue would otherwise share its backing array with the caller's slice and
 // the caller's data would be modified (and left half-filled whenever the ends
 // end up being cleared).
 func normalizeCueLine(line Line, fallbackEnd *int64) Line {
 	if len(line.Cue) == 0 {
 		return line
 	}
 	cues := slices.Clone(line.Cue)
 	complete := true
 	for i := range cues {
 		if cues[i].End != nil {
 			continue
 		}
 		var end *int64
 		switch {
 		case i+1 < len(cues) && cues[i+1].Start != nil:
 			end = cues[i+1].Start
 		case fallbackEnd != nil:
 			end = fallbackEnd
 		}
 		if end == nil {
 			complete = false
 			continue
 		}
 		// Store a fresh copy so the cue does not alias another timestamp.
 		v := *end
 		cues[i].End = &v
 	}
 	if !complete {
 		cues = clearCueEnds(cues)
 	}
 	line.Cue = cues
 	return NormalizeLineTiming(line)
 }
 // clearCueEnds returns a copy of cues in which every End is nil. The input
 // slice is left untouched.
 func clearCueEnds(cues []Cue) []Cue {
 	cleared := make([]Cue, len(cues))
 	for i, cue := range cues {
 		cue.End = nil
 		cleared[i] = cue
 	}
 	return cleared
 }
--- a/model/lyrics_test.go
+++ b/model/lyrics_test.go
@ -116,4 +116,85 @@ var _ = Describe("ToLyrics", func() {
 			{Start: &e, Value: "Test"},
 		}))
 	})
 	// Two enhanced lines: the first can complete all cue ends (the next line's
 	// start closes its last cue), the second cannot because nothing follows it.
 	It("should parse Enhanced LRC with word-level timing", func() {
 		lyrics, err := ToLyrics("xxx", "[00:01.00]<00:01.00>Some <00:01.50>lyrics <00:02.00>here\n[00:03.00]<00:03.00>More <00:03.50>words")
 		Expect(err).ToNot(HaveOccurred())
 		Expect(lyrics.Synced).To(BeTrue())
 		Expect(lyrics.Line).To(HaveLen(2))
 		t1000, t1500, t2000, t3000, t3500 := int64(1000), int64(1500), int64(2000), int64(3000), int64(3500)
 		line0 := lyrics.Line[0]
 		Expect(line0.Start).To(Equal(&t1000))
 		// The line end is derived from the latest cue end, i.e. the next line's start.
 		Expect(line0.End).To(Equal(&t3000))
 		Expect(line0.Value).To(Equal("Some lyrics here"))
 		// Cue values keep their trailing space; ByteEnd is an inclusive offset.
 		Expect(line0.Cue).To(Equal([]Cue{
 			{Start: &t1000, End: &t1500, Value: "Some ", ByteStart: 0, ByteEnd: 4},
 			{Start: &t1500, End: &t2000, Value: "lyrics ", ByteStart: 5, ByteEnd: 11},
 			{Start: &t2000, End: &t3000, Value: "here", ByteStart: 12, ByteEnd: 15},
 		}))
 		line1 := lyrics.Line[1]
 		Expect(line1.Start).To(Equal(&t3000))
 		// With no cue ends available, the line end falls back to the last cue start.
 		Expect(line1.End).To(Equal(&t3500))
 		Expect(line1.Value).To(Equal("More words"))
 		// The final cue has no end, so no cue on this line is expected to carry one.
 		Expect(line1.Cue).To(Equal([]Cue{
 			{Start: &t3000, Value: "More ", ByteStart: 0, ByteEnd: 4},
 			{Start: &t3500, Value: "words", ByteStart: 5, ByteEnd: 9},
 		}))
 		Expect(line1.Cue[1].End).To(BeNil())
 	})
 	// Plain LRC input must come out exactly as before: no Cue and no End on the lines.
 	It("should ignore Enhanced LRC markers and return plain lines when no markers present", func() {
 		a, b := int64(1000), int64(3000)
 		lyrics, err := ToLyrics("xxx", "[00:01.00]Plain line\n[00:03.00]Another plain line")
 		Expect(err).ToNot(HaveOccurred())
 		Expect(lyrics.Line).To(Equal([]Line{
 			{Start: &a, Value: "Plain line"},
 			{Start: &b, Value: "Another plain line"},
 		}))
 	})
 	// Enhanced and plain lines may be interleaved; a plain line's start still
 	// serves as the closing time for the enhanced line before it.
 	It("should handle mixed Enhanced and plain LRC lines", func() {
 		lyrics, err := ToLyrics("xxx", "[00:01.00]<00:01.00>Some <00:01.50>lyrics\n[00:03.00]Plain line\n[00:05.00]<00:05.00>More <00:05.50>words")
 		Expect(err).ToNot(HaveOccurred())
 		Expect(lyrics.Line).To(HaveLen(3))
 		t1000, t1500, t5000, t5500 := int64(1000), int64(1500), int64(5000), int64(5500)
 		t3000 := int64(3000)
 		// The last cue of line 0 ends where the plain line 1 starts.
 		Expect(lyrics.Line[0].Cue).To(Equal([]Cue{
 			{Start: &t1000, End: &t1500, Value: "Some ", ByteStart: 0, ByteEnd: 4},
 			{Start: &t1500, End: &t3000, Value: "lyrics", ByteStart: 5, ByteEnd: 10},
 		}))
 		Expect(lyrics.Line[0].Value).To(Equal("Some lyrics"))
 		Expect(lyrics.Line[0].End).To(Equal(&t3000))
 		// A line without inline markers carries no cues at all.
 		Expect(lyrics.Line[1].Cue).To(BeNil())
 		Expect(lyrics.Line[1].Value).To(Equal("Plain line"))
 		// The final line has nothing after it, so its cues are expected without ends.
 		Expect(lyrics.Line[2].Cue).To(Equal([]Cue{
 			{Start: &t5000, Value: "More ", ByteStart: 0, ByteEnd: 4},
 			{Start: &t5500, Value: "words", ByteStart: 5, ByteEnd: 9},
 		}))
 		Expect(lyrics.Line[2].Value).To(Equal("More words"))
 	})
 	// Whitespace may sit on either side of a marker; each cue keeps exactly the
 	// bytes between its marker and the next one, and offsets stay contiguous.
 	It("should preserve byte offsets for Enhanced LRC cues", func() {
 		lyrics, err := ToLyrics("xxx", "[00:00.00]<00:00.00>Oh <00:00.90>love<00:01.30> me <00:01.60>tonight")
 		Expect(err).ToNot(HaveOccurred())
 		Expect(lyrics.Line).To(HaveLen(1))
 		t0, t900, t1300, t1600 := int64(0), int64(900), int64(1300), int64(1600)
 		line := lyrics.Line[0]
 		Expect(line.Value).To(Equal("Oh love me tonight"))
 		// Single line with nothing after it: cues are expected without End values.
 		Expect(line.Cue).To(Equal([]Cue{
 			{Start: &t0, Value: "Oh ", ByteStart: 0, ByteEnd: 2},
 			{Start: &t900, Value: "love", ByteStart: 3, ByteEnd: 6},
 			{Start: &t1300, Value: " me ", ByteStart: 7, ByteEnd: 10},
 			{Start: &t1600, Value: "tonight", ByteStart: 11, ByteEnd: 17},
 		}))
 	})
 })
--- a/server/subsonic/helpers.go
+++ b/server/subsonic/helpers.go
@ -493,14 +493,79 @@ func mapExplicitStatus(explicitStatus string) string {
 	return ""
 }
-func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics) responses.StructuredLyric {
+func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced bool) responses.StructuredLyric {
 	lines := make([]responses.Line, len(lyrics.Line))
 	var cueLines []responses.CueLine
 	agentOrderByID := make(map[string]int, len(lyrics.Agents))
 	agentRoleByID := make(map[string]string, len(lyrics.Agents))
 	responseAgents := make([]responses.Agent, 0, len(lyrics.Agents))
 	for i, agent := range lyrics.Agents {
 		agentOrderByID[agent.ID] = i
 		agentRoleByID[agent.ID] = agent.Role
 		responseAgents = append(responseAgents, responses.Agent{
 			ID:   agent.ID,
 			Role: agent.Role,
 			Name: agent.Name,
 		})
 	}
 	for i, line := range lyrics.Line {
 		lines[i] = responses.Line{
 			Start: line.Start,
 			Value: line.Value,
 		}
 		if !enhanced || len(line.Cue) == 0 {
 			continue
 		}
 		agentOrder := make([]string, 0, 2)
 		cuesByAgent := make(map[string][]model.Cue)
 		for _, cue := range line.Cue {
 			if cue.Start == nil {
 				continue
 			}
 			agentID := strings.TrimSpace(cue.AgentID)
 			if _, exists := cuesByAgent[agentID]; !exists {
 				agentOrder = append(agentOrder, agentID)
 			}
 			cuesByAgent[agentID] = append(cuesByAgent[agentID], cue)
 		}
 		sort.SliceStable(agentOrder, func(i, j int) bool {
 			leftRole := agentRoleByID[agentOrder[i]]
 			rightRole := agentRoleByID[agentOrder[j]]
 			if leftRole == "main" && rightRole != "main" {
 				return true
 			}
 			if rightRole == "main" && leftRole != "main" {
 				return false
 			}
 			leftOrder, leftOK := agentOrderByID[agentOrder[i]]
 			rightOrder, rightOK := agentOrderByID[agentOrder[j]]
 			if leftOK && rightOK && leftOrder != rightOrder {
 				return leftOrder < rightOrder
 			}
 			if leftOK != rightOK {
 				return leftOK
 			}
 			return i < j
 		})
 		for _, agentID := range agentOrder {
 			cueLine := responses.CueLine{
 				Index: int32(i),
 				Start: line.Start,
 				End:   line.End,
 				Value: line.Value,
 				Cue:   buildLyricCues(cuesByAgent[agentID], line.End),
 			}
 			if agentID != "" {
 				cueLine.AgentID = agentID
 			}
 			cueLines = append(cueLines, cueLine)
 		}
 	}
 	structured := responses.StructuredLyric{
@ -508,10 +573,22 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics) responses.St
 		DisplayTitle:  lyrics.DisplayTitle,
 		Lang:          lyrics.Lang,
 		Line:          lines,
 		CueLine:       cueLines,
 		Offset:        lyrics.Offset,
 		Synced:        lyrics.Synced,
 	}
 	if enhanced {
 		kind := strings.TrimSpace(lyrics.Kind)
 		if kind == "" {
 			kind = "main"
 		}
 		structured.Kind = kind
 		if len(cueLines) > 0 && len(responseAgents) > 0 {
 			structured.Agents = responseAgents
 		}
 	}
 	if structured.DisplayArtist == "" {
 		structured.DisplayArtist = mf.Artist
 	}
@ -522,11 +599,86 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics) responses.St
 	return structured
 }
-func buildLyricsList(mf *model.MediaFile, lyricsList model.LyricList) *responses.LyricsList {
+func buildLyricCues(cues []model.Cue, lineEnd *int64) []responses.LyricCue {
 	// buildLyricCues converts model cues into response cues. Cues without a Start
 	// are dropped. End values are emitted only when at least one input cue has an
 	// End, and then only if every emitted cue ends up with one.
-	lyricList := make(responses.StructuredLyrics, len(lyricsList))
+	if len(cues) == 0 {
 		return nil
 	}
-	for i, lyrics := range lyricsList {
+	hasAnyEnd := false
-		lyricList[i] = buildStructuredLyric(mf, lyrics)
+	for i := range cues {
 		if cues[i].End != nil {
 			hasAnyEnd = true
 			break
 		}
 	}
 	normalized := make([]responses.LyricCue, 0, len(cues))
 	for i := range cues {
 		if cues[i].Start == nil {
 			continue
 		}
 		cue := responses.LyricCue{
 			Start:     *cues[i].Start,
 			Value:     cues[i].Value,
 			ByteStart: cues[i].ByteStart,
 			ByteEnd:   cues[i].ByteEnd,
 		}
 		if hasAnyEnd {
 			end := cues[i].End
 			// A missing end is taken from the next cue's start, else from the line end.
 			if end == nil {
 				if i+1 < len(cues) && cues[i+1].Start != nil {
 					v := *cues[i+1].Start
 					end = &v
 				} else if lineEnd != nil {
 					v := *lineEnd
 					end = &v
 				}
 			}
 			// Clamp so a cue never runs past the start of the cue that follows it.
 			if end != nil && i+1 < len(cues) && cues[i+1].Start != nil && *end > *cues[i+1].Start {
 				v := *cues[i+1].Start
 				end = &v
 			}
 			// Clamp so a cue never ends before it starts.
 			if end != nil && *end < cue.Start {
 				v := cue.Start
 				end = &v
 			}
 			cue.End = end
 		}
 		normalized = append(normalized, cue)
 	}
 	// All-or-nothing: if any emitted cue still lacks an end, strip the ends from all of them.
 	if hasAnyEnd {
 		for i := range normalized {
 			if normalized[i].End == nil {
 				for j := range normalized {
 					normalized[j].End = nil
 				}
 				break
 			}
 		}
 	}
 	return normalized
 }
 func buildLyricsList(mf *model.MediaFile, lyricsList model.LyricList, enhanced bool) *responses.LyricsList {
 	var filtered model.LyricList
 	if enhanced {
 		filtered = lyricsList
 	} else {
 		// Without enhanced, only return "main" kind entries
 		for _, l := range lyricsList {
 			kind := strings.TrimSpace(l.Kind)
 			if kind == "" || kind == "main" {
 				filtered = append(filtered, l)
 			}
 		}
 	}
 	lyricList := make(responses.StructuredLyrics, len(filtered))
 	for i, lyrics := range filtered {
 		lyricList[i] = buildStructuredLyric(mf, lyrics, enhanced)
 	}
 	res := &responses.LyricsList{
--- a/server/subsonic/media_retrieval.go
+++ b/server/subsonic/media_retrieval.go
@ -10,6 +10,7 @@ import (
 	"github.com/navidrome/navidrome/conf"
 	"github.com/navidrome/navidrome/consts"
 	lyricssvc "github.com/navidrome/navidrome/core/lyrics"
 	"github.com/navidrome/navidrome/log"
 	"github.com/navidrome/navidrome/model"
 	"github.com/navidrome/navidrome/resources"
@ -19,6 +20,8 @@ import (
 	"github.com/navidrome/navidrome/utils/req"
 )
 // maxLegacyLyricsCandidates is the value GetLyrics assigns to the Max option of
 // its artist/title query, bounding how many duplicate matches are considered.
 const maxLegacyLyricsCandidates = 10
 func (api *Router) GetAvatar(w http.ResponseWriter, r *http.Request) (*responses.Subsonic, error) {
 	if !conf.Server.EnableGravatar {
 		return api.getPlaceHolderAvatar(w, r)
@ -98,7 +101,11 @@ func (api *Router) GetLyrics(r *http.Request) (*responses.Subsonic, error) {
 	response := newResponse()
 	lyricsResponse := responses.Lyrics{}
 	response.Lyrics = &lyricsResponse
-	mediaFiles, err := api.ds.MediaFile(r.Context()).GetAll(filter.SongsByArtistTitleWithLyricsFirst(artist, title))
+	opts := filter.SongsByArtistTitleWithLyricsFirst(artist, title)
 	// Search a bounded duplicate window so source-priority fallback can still
 	// reach older matches without turning legacy getLyrics into an unbounded scan.
 	opts.Max = maxLegacyLyricsCandidates
 	mediaFiles, err := api.ds.MediaFile(r.Context()).GetAll(opts)
 	if err != nil {
 		return nil, err
@ -108,9 +115,22 @@ func (api *Router) GetLyrics(r *http.Request) (*responses.Subsonic, error) {
 		return response, nil
 	}
-	structuredLyrics, err := api.lyrics.GetLyrics(r.Context(), &mediaFiles[0])
+	var structuredLyrics model.LyricList
-	if err != nil {
+	if batchLyrics, ok := api.lyrics.(lyricssvc.BatchLyrics); ok {
-		return nil, err
+		structuredLyrics, err = batchLyrics.GetLyricsForMediaFiles(r.Context(), mediaFiles)
 		if err != nil {
 			return nil, err
 		}
 	} else {
 		for i := range mediaFiles {
 			structuredLyrics, err = api.lyrics.GetLyrics(r.Context(), &mediaFiles[i])
 			if err != nil {
 				return nil, err
 			}
 			if len(structuredLyrics) > 0 {
 				break
 			}
 		}
 	}
 	if len(structuredLyrics) == 0 {
@ -124,7 +144,6 @@ func (api *Router) GetLyrics(r *http.Request) (*responses.Subsonic, error) {
 	for _, line := range structuredLyrics[0].Line {
 		lyricsText.WriteString(line.Value + "\n")
 	}
 	lyricsResponse.Value = lyricsText.String()
 	return response, nil
@ -146,8 +165,10 @@ func (api *Router) GetLyricsBySongId(r *http.Request) (*responses.Subsonic, erro
 		return nil, err
 	}
 	enhanced, _ := req.Params(r).Bool("enhanced")
 	response := newResponse()
-	response.LyricsList = buildLyricsList(mediaFile, structuredLyrics)
+	response.LyricsList = buildLyricsList(mediaFile, structuredLyrics, enhanced)
 	return response, nil
 }
--- a/server/subsonic/media_retrieval_test.go
+++ b/server/subsonic/media_retrieval_test.go
@ -186,6 +186,41 @@ var _ = Describe("MediaRetrievalController", func() {
 			Expect(response.Lyrics.Title).To(Equal("Never Gonna Give You Up"))
 			Expect(response.Lyrics.Value).To(Equal("We're no strangers to love\nYou know the rules and so do I\n"))
 		})
 		// Source priority must be honoured across duplicate artist/title matches, not
 		// just within the first match: an older file's higher-priority lyrics should
 		// win over the embedded lyrics of a newer duplicate.
 		It("should prefer higher-priority sidecar lyrics across duplicate candidates", func() {
 			conf.Server.LyricsPriority = ".ttml,embedded"
 			r := newGetRequest("artist=Rick+Astley", "title=Never+Gonna+Give+You+Up")
 			baseTime := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC)
 			// Serialized embedded lyrics for the newer duplicate.
 			embedded, err := model.ToLyrics("eng", "Newest duplicate embedded lyrics")
 			Expect(err).ToNot(HaveOccurred())
 			embeddedJSON, err := json.Marshal(model.LyricList{*embedded})
 			Expect(err).ToNot(HaveOccurred())
 			mockRepo.SetData(model.MediaFiles{
 				{
 					ID:        "1",
 					Path:      "tests/fixtures/01 Invisible (RED) Edit Version.mp3",
 					Artist:    "Rick Astley",
 					Title:     "Never Gonna Give You Up",
 					Lyrics:    string(embeddedJSON),
 					UpdatedAt: baseTime.Add(2 * time.Hour), // Newer duplicate with embedded lyrics only
 				},
 				{
 					ID:        "2",
 					Path:      "tests/fixtures/test.mp3",
 					Artist:    "Rick Astley",
 					Title:     "Never Gonna Give You Up",
 					Lyrics:    "[]",
 					UpdatedAt: baseTime.Add(1 * time.Hour), // Older, but has TTML sidecar
 				},
 			})
 			response, err := router.GetLyrics(r)
 			Expect(err).ToNot(HaveOccurred())
 			Expect(response.Lyrics.Artist).To(Equal("Rick Astley"))
 			Expect(response.Lyrics.Title).To(Equal("Never Gonna Give You Up"))
 			// The returned text is the sidecar content, not the embedded string above.
 			Expect(response.Lyrics.Value).To(Equal("We're no strangers to love\nYou know the rules and so do I\n"))
 			// The repository query must have been capped to the candidate window.
 			Expect(mockRepo.Options.Max).To(Equal(maxLegacyLyricsCandidates))
 		})
 	})
 	Describe("GetLyricsBySongId", func() {
@ -202,8 +237,10 @@ var _ = Describe("MediaRetrievalController", func() {
 				Expect(realLyric.DisplayArtist).To(Equal(expectedLyric.DisplayArtist))
 				Expect(realLyric.DisplayTitle).To(Equal(expectedLyric.DisplayTitle))
 				Expect(realLyric.Kind).To(Equal(expectedLyric.Kind))
 				Expect(realLyric.Lang).To(Equal(expectedLyric.Lang))
 				Expect(realLyric.Synced).To(Equal(expectedLyric.Synced))
 				Expect(realLyric.Agents).To(Equal(expectedLyric.Agents))
 				if expectedLyric.Offset == nil {
 					Expect(realLyric.Offset).To(BeNil())
@ -222,6 +259,38 @@ var _ = Describe("MediaRetrievalController", func() {
 						Expect(*realLine.Start).To(Equal(*expectedLine.Start))
 					}
 				}
 				Expect(realLyric.CueLine).To(HaveLen(len(expectedLyric.CueLine)))
 				for j, realCueLine := range realLyric.CueLine {
 					expectedCueLine := expectedLyric.CueLine[j]
 					Expect(realCueLine.Index).To(Equal(expectedCueLine.Index))
 					Expect(realCueLine.Value).To(Equal(expectedCueLine.Value))
 					Expect(realCueLine.AgentID).To(Equal(expectedCueLine.AgentID))
 					if expectedCueLine.Start == nil {
 						Expect(realCueLine.Start).To(BeNil())
 					} else {
 						Expect(*realCueLine.Start).To(Equal(*expectedCueLine.Start))
 					}
 					if expectedCueLine.End == nil {
 						Expect(realCueLine.End).To(BeNil())
 					} else {
 						Expect(*realCueLine.End).To(Equal(*expectedCueLine.End))
 					}
 					Expect(realCueLine.Cue).To(HaveLen(len(expectedCueLine.Cue)))
 					for k, realCue := range realCueLine.Cue {
 						expectedCue := expectedCueLine.Cue[k]
 						Expect(realCue.Value).To(Equal(expectedCue.Value))
 						Expect(realCue.Start).To(Equal(expectedCue.Start))
 						Expect(realCue.ByteStart).To(Equal(expectedCue.ByteStart))
 						Expect(realCue.ByteEnd).To(Equal(expectedCue.ByteEnd))
 						if expectedCue.End == nil {
 							Expect(realCue.End).To(BeNil())
 						} else {
 							Expect(*realCue.End).To(Equal(*expectedCue.End))
 						}
 					}
 				}
 			}
 		}
@ -323,6 +392,427 @@ var _ = Describe("MediaRetrievalController", func() {
 				},
 			})
 		})
 		// Sidecar .ttml is given priority over embedded lyrics, and each xml:lang
 		// section of the sidecar is expected to come back as its own structured lyric.
 		// NOTE(review): presumably the sidecar picked up is tests/fixtures/test.ttml — confirm.
 		It("should return multilingual TTML sidecar lyrics", func() {
 			conf.Server.LyricsPriority = ".ttml,embedded"
 			r := newGetRequest("id=1")
 			mockRepo.SetData(model.MediaFiles{
 				{
 					ID:     "1",
 					Path:   "tests/fixtures/test.mp3",
 					Artist: "Rick Astley",
 					Title:  "Never Gonna Give You Up",
 					Lyrics: "[]",
 				},
 			})
 			response, err := router.GetLyricsBySongId(r)
 			Expect(err).ToNot(HaveOccurred())
 			// 18.8s — matches the fixture's tick offset "188t" at ttp:tickRate="10".
 			porTime := int64(18800)
 			// 22.8s — matches the fixture's frame timestamp "00:00:22:24" at ttp:frameRate="30".
 			ttmlTime := int64(22800)
 			compareResponses(response.LyricsList, responses.LyricsList{
 				StructuredLyrics: responses.StructuredLyrics{
 					{
 						DisplayArtist: "Rick Astley",
 						DisplayTitle:  "Never Gonna Give You Up",
 						Lang:          "eng",
 						Synced:        true,
 						Line: []responses.Line{
 							{
 								Start: &times[0],
 								Value: "We're no strangers to love",
 							},
 							{
 								Start: &ttmlTime,
 								Value: "You know the rules and so do I",
 							},
 						},
 					},
 					{
 						DisplayArtist: "Rick Astley",
 						DisplayTitle:  "Never Gonna Give You Up",
 						Lang:          "por",
 						Synced:        true,
 						Line: []responses.Line{
 							{
 								Start: &porTime,
 								Value: "Nao somos estranhos ao amor",
 							},
 						},
 					},
 				},
 			})
 		})
 		// With enhanced=true, a TTML sidecar carrying iTunes metadata is expected to
 		// produce three tracks: the main body, the linked translation, and the linked
 		// transliteration (reported with kind "pronunciation").
 		// NOTE(review): presumably the sidecar is tests/fixtures/test-metadata.ttml — confirm.
 		It("should return metadata-linked translation and pronunciation tracks from TTML", func() {
 			conf.Server.LyricsPriority = ".ttml,embedded"
 			r := newGetRequest("id=1&enhanced=true")
 			mockRepo.SetData(model.MediaFiles{
 				{
 					ID:     "1",
 					Path:   "tests/fixtures/test-metadata.mp3",
 					Artist: "Rick Astley",
 					Title:  "Never Gonna Give You Up",
 					Lyrics: "[]",
 				},
 			})
 			response, err := router.GetLyricsBySongId(r)
 			Expect(err).ToNot(HaveOccurred())
 			// Start times of the two main lines; linked tracks reuse them below.
 			mainStartA := int64(1000)
 			mainStartB := int64(2000)
 			// Per-span timings of the transliteration text linked to the second line.
 			tokenStartA := int64(2000)
 			tokenEndA := int64(2300)
 			tokenStartB := int64(2300)
 			tokenEndB := int64(2600)
 			compareResponses(response.LyricsList, responses.LyricsList{
 				StructuredLyrics: responses.StructuredLyrics{
 					{
 						DisplayArtist: "Rick Astley",
 						DisplayTitle:  "Never Gonna Give You Up",
 						Kind:          "main",
 						Lang:          "ja",
 						Synced:        true,
 						Line: []responses.Line{
 							{
 								Start: &mainStartA,
 								Value: "こんにちは",
 							},
 							{
 								Start: &mainStartB,
 								Value: "こんばんは",
 							},
 						},
 					},
 					{
 						DisplayArtist: "Rick Astley",
 						DisplayTitle:  "Never Gonna Give You Up",
 						Kind:          "translation",
 						Lang:          "es",
 						Synced:        true,
 						Line: []responses.Line{
 							{
 								// The translation entry takes the start of the first main line.
 								Start: &mainStartA,
 								Value: "Hola",
 							},
 						},
 					},
 					{
 						DisplayArtist: "Rick Astley",
 						DisplayTitle:  "Never Gonna Give You Up",
 						Kind:          "pronunciation",
 						// The language tag is expected in lower case.
 						Lang:   "ja-latn",
 						Synced: true,
 						Line: []responses.Line{
 							{
 								Start: &mainStartB,
 								Value: "konni",
 							},
 						},
 						CueLine: []responses.CueLine{
 							{
 								Index: 0,
 								Start: &mainStartB,
 								// The cue line is expected to end with its last cue.
 								End:   &tokenEndB,
 								Value: "konni",
 								Cue: []responses.LyricCue{
 									{
 										Start:     tokenStartA,
 										End:       &tokenEndA,
 										ByteStart: 0,
 										ByteEnd:   1,
 										Value:     "ko",
 									},
 									{
 										Start:     tokenStartB,
 										End:       &tokenEndB,
 										ByteStart: 2,
 										ByteEnd:   4,
 										Value:     "nni",
 									},
 								},
 							},
 						},
 					},
 				},
 			})
 		})
 		// A stored lyric whose single line mixes cues from two agents is expected to be
 		// returned, with enhanced=true, as one plain line plus one cue line per agent;
 		// both cue lines share the index of the originating line.
 		It("should return cue lines for songLyrics v2 clients with enhanced=true", func() {
 			r := newGetRequest("id=1&enhanced=true")
 			lineStart := int64(1000)
 			lineEnd := int64(3000)
 			tokenStartA := int64(1000)
 			tokenEndA := int64(1400)
 			tokenStartB := int64(2000)
 			tokenEndB := int64(2500)
 			// The stored lyric sets no Kind; the response below is expected to report "main".
 			lyricsJSON, err := json.Marshal(model.LyricList{
 				{
 					Lang:   "eng",
 					Agents: []model.Agent{{ID: "lead", Role: "main"}, {ID: "__nd_bg__|lead", Role: "bg"}},
 					Synced: true,
 					Line: []model.Line{
 						{
 							Start: &lineStart,
 							End:   &lineEnd,
 							Value: "Hello echo",
 							Cue: []model.Cue{
 								{
 									Start:     &tokenStartA,
 									End:       &tokenEndA,
 									Value:     "Hello",
 									ByteStart: 0,
 									ByteEnd:   4,
 									AgentID:   "lead",
 								},
 								{
 									Start:     &tokenStartB,
 									End:       &tokenEndB,
 									Value:     "echo",
 									ByteStart: 6,
 									ByteEnd:   9,
 									AgentID:   "__nd_bg__|lead",
 								},
 							},
 						},
 					},
 				},
 			})
 			Expect(err).ToNot(HaveOccurred())
 			mockRepo.SetData(model.MediaFiles{
 				{
 					ID:     "1",
 					Artist: "Rick Astley",
 					Title:  "Never Gonna Give You Up",
 					Lyrics: string(lyricsJSON),
 				},
 			})
 			response, err := router.GetLyricsBySongId(r)
 			Expect(err).ToNot(HaveOccurred())
 			compareResponses(response.LyricsList, responses.LyricsList{
 				StructuredLyrics: responses.StructuredLyrics{
 					{
 						DisplayArtist: "Rick Astley",
 						DisplayTitle:  "Never Gonna Give You Up",
 						Kind:          "main",
 						Lang:          "eng",
 						Synced:        true,
 						Agents: []responses.Agent{
 							{ID: "lead", Role: "main"},
 							{ID: "__nd_bg__|lead", Role: "bg"},
 						},
 						Line: []responses.Line{
 							{
 								Start: &lineStart,
 								Value: "Hello echo",
 							},
 						},
 						CueLine: []responses.CueLine{
 							// Cue line for the "lead" agent: only its own cue is listed.
 							{
 								Index:   0,
 								Start:   &lineStart,
 								End:     &lineEnd,
 								Value:   "Hello echo",
 								AgentID: "lead",
 								Cue: []responses.LyricCue{
 									{
 										Start:     tokenStartA,
 										End:       &tokenEndA,
 										ByteStart: 0,
 										ByteEnd:   4,
 										Value:     "Hello",
 									},
 								},
 							},
 							// Cue line for the background agent: same line text and index.
 							{
 								Index:   0,
 								Start:   &lineStart,
 								End:     &lineEnd,
 								Value:   "Hello echo",
 								AgentID: "__nd_bg__|lead",
 								Cue: []responses.LyricCue{
 									{
 										Start:     tokenStartB,
 										End:       &tokenEndB,
 										ByteStart: 6,
 										ByteEnd:   9,
 										Value:     "echo",
 									},
 								},
 							},
 						},
 					},
 				},
 			})
 		})
 		// A stored line without any Cue entries must still be returned for
 		// enhanced=true requests; the expected response carries no CueLine at all.
 		It("should keep enhanced line-level lyrics when no cue data is available", func() {
 			r := newGetRequest("id=1&enhanced=true")
 			lineStart := int64(1000)
 			lineEnd := int64(3000)
 			lyricsJSON, err := json.Marshal(model.LyricList{
 				{
 					Kind:   "main",
 					Lang:   "eng",
 					Synced: true,
 					Line: []model.Line{
 						{
 							Start: &lineStart,
 							End:   &lineEnd,
 							Value: "Line without word timing",
 						},
 					},
 				},
 			})
 			Expect(err).ToNot(HaveOccurred())
 			mockRepo.SetData(model.MediaFiles{
 				{
 					ID:     "1",
 					Artist: "Rick Astley",
 					Title:  "Never Gonna Give You Up",
 					Lyrics: string(lyricsJSON),
 				},
 			})
 			response, err := router.GetLyricsBySongId(r)
 			Expect(err).ToNot(HaveOccurred())
 			compareResponses(response.LyricsList, responses.LyricsList{
 				StructuredLyrics: responses.StructuredLyrics{
 					{
 						DisplayArtist: "Rick Astley",
 						DisplayTitle:  "Never Gonna Give You Up",
 						Kind:          "main",
 						Lang:          "eng",
 						Synced:        true,
 						Line: []responses.Line{
 							{
 								Start: &lineStart,
 								Value: "Line without word timing",
 							},
 						},
 					},
 				},
 			})
 		})
 		// Byte offsets must be passed through unchanged for two tricky lines: an ASCII
 		// line where a cued word ("love") also occurs earlier without a cue, and a
 		// Korean line where every syllable occupies three bytes in UTF-8.
 		It("should return required cue byte offsets for ambiguous and multibyte cue lines", func() {
 			r := newGetRequest("id=1&enhanced=true")
 			asciiLineStart := int64(0)
 			asciiLineEnd := int64(2400)
 			asciiCueStartA := int64(0)
 			asciiCueEndA := int64(300)
 			asciiCueStartB := int64(900)
 			asciiCueEndB := int64(1300)
 			asciiCueStartC := int64(1300)
 			asciiCueEndC := int64(1600)
 			// The last ASCII cue deliberately has no end time of its own.
 			asciiCueStartD := int64(1600)
 			utfLineStart := int64(2747)
 			utfLineEnd := int64(6214)
 			utfCueStartA := int64(2747)
 			utfCueEndA := int64(3018)
 			utfCueStartB := int64(3018)
 			utfCueEndB := int64(3179)
 			utfCueStartC := int64(3582)
 			utfCueEndC := int64(4100)
 			utfCueStartD := int64(4500)
 			utfCueEndD := int64(6214)
 			lyricsJSON, err := json.Marshal(model.LyricList{
 				{
 					Lang:   "eng",
 					Synced: true,
 					Line: []model.Line{
 						{
 							Start: &asciiLineStart,
 							End:   &asciiLineEnd,
 							Value: "Oh love love me tonight",
 							Cue: []model.Cue{
 								{Start: &asciiCueStartA, End: &asciiCueEndA, Value: "Oh", ByteStart: 0, ByteEnd: 1},
 								// Bytes 8-11 are the second "love"; the first one (bytes 3-6) is uncued.
 								{Start: &asciiCueStartB, End: &asciiCueEndB, Value: "love", ByteStart: 8, ByteEnd: 11},
 								{Start: &asciiCueStartC, End: &asciiCueEndC, Value: "me", ByteStart: 13, ByteEnd: 14},
 								{Start: &asciiCueStartD, Value: "tonight", ByteStart: 16, ByteEnd: 22},
 							},
 						},
 						{
 							Start: &utfLineStart,
 							End:   &utfLineEnd,
 							Value: "눈을 뜬 순간",
 							Cue: []model.Cue{
 								{Start: &utfCueStartA, End: &utfCueEndA, Value: "눈", ByteStart: 0, ByteEnd: 2},
 								{Start: &utfCueStartB, End: &utfCueEndB, Value: "을", ByteStart: 3, ByteEnd: 5},
 								{Start: &utfCueStartC, End: &utfCueEndC, Value: "뜬", ByteStart: 7, ByteEnd: 9},
 								{Start: &utfCueStartD, End: &utfCueEndD, Value: "순간", ByteStart: 11, ByteEnd: 16},
 							},
 						},
 					},
 				},
 			})
 			Expect(err).ToNot(HaveOccurred())
 			mockRepo.SetData(model.MediaFiles{
 				{
 					ID:     "1",
 					Artist: "Rick Astley",
 					Title:  "Never Gonna Give You Up",
 					Lyrics: string(lyricsJSON),
 				},
 			})
 			response, err := router.GetLyricsBySongId(r)
 			Expect(err).ToNot(HaveOccurred())
 			compareResponses(response.LyricsList, responses.LyricsList{
 				StructuredLyrics: responses.StructuredLyrics{
 					{
 						DisplayArtist: "Rick Astley",
 						DisplayTitle:  "Never Gonna Give You Up",
 						Kind:          "main",
 						Lang:          "eng",
 						Synced:        true,
 						Line: []responses.Line{
 							{Start: &asciiLineStart, Value: "Oh love love me tonight"},
 							{Start: &utfLineStart, Value: "눈을 뜬 순간"},
 						},
 						CueLine: []responses.CueLine{
 							{
 								Index: 0,
 								Start: &asciiLineStart,
 								End:   &asciiLineEnd,
 								Value: "Oh love love me tonight",
 								Cue: []responses.LyricCue{
 									{Start: asciiCueStartA, End: &asciiCueEndA, Value: "Oh", ByteStart: 0, ByteEnd: 1},
 									{Start: asciiCueStartB, End: &asciiCueEndB, Value: "love", ByteStart: 8, ByteEnd: 11},
 									{Start: asciiCueStartC, End: &asciiCueEndC, Value: "me", ByteStart: 13, ByteEnd: 14},
 									// The cue stored without an end is expected to end with its line.
 									{Start: asciiCueStartD, End: &asciiLineEnd, Value: "tonight", ByteStart: 16, ByteEnd: 22},
 								},
 							},
 							{
 								Index: 1,
 								Start: &utfLineStart,
 								End:   &utfLineEnd,
 								Value: "눈을 뜬 순간",
 								Cue: []responses.LyricCue{
 									{Start: utfCueStartA, End: &utfCueEndA, Value: "눈", ByteStart: 0, ByteEnd: 2},
 									{Start: utfCueStartB, End: &utfCueEndB, Value: "을", ByteStart: 3, ByteEnd: 5},
 									{Start: utfCueStartC, End: &utfCueEndC, Value: "뜬", ByteStart: 7, ByteEnd: 9},
 									{Start: utfCueStartD, End: &utfCueEndD, Value: "순간", ByteStart: 11, ByteEnd: 16},
 								},
 							},
 						},
 					},
 				},
 			})
 		})
 	})
 })
--- a/server/subsonic/opensubsonic.go
+++ b/server/subsonic/opensubsonic.go
@ -11,7 +11,7 @@ func (api *Router) GetOpenSubsonicExtensions(_ *http.Request) (*responses.Subson
 	extensions := responses.OpenSubsonicExtensions{
 		{Name: "transcodeOffset", Versions: []int32{1}},
 		{Name: "formPost", Versions: []int32{1}},
-		{Name: "songLyrics", Versions: []int32{1}},
+		{Name: "songLyrics", Versions: []int32{1, 2}},
 		{Name: "indexBasedQueue", Versions: []int32{1}},
 		{Name: "transcoding", Versions: []int32{1}},
 	}
--- a/server/subsonic/opensubsonic_test.go
+++ b/server/subsonic/opensubsonic_test.go
@ -58,7 +58,7 @@ var _ = Describe("GetOpenSubsonicExtensions", func() {
 				HaveLen(5),
 				ContainElement(responses.OpenSubsonicExtension{Name: "transcodeOffset", Versions: []int32{1}}),
 				ContainElement(responses.OpenSubsonicExtension{Name: "formPost", Versions: []int32{1}}),
-				ContainElement(responses.OpenSubsonicExtension{Name: "songLyrics", Versions: []int32{1}}),
+				ContainElement(responses.OpenSubsonicExtension{Name: "songLyrics", Versions: []int32{1, 2}}),
 				ContainElement(responses.OpenSubsonicExtension{Name: "indexBasedQueue", Versions: []int32{1}}),
 				ContainElement(responses.OpenSubsonicExtension{Name: "transcoding", Versions: []int32{1}}),
 			))
@ -87,7 +87,7 @@ var _ = Describe("GetOpenSubsonicExtensions", func() {
 				HaveLen(6),
 				ContainElement(responses.OpenSubsonicExtension{Name: "transcodeOffset", Versions: []int32{1}}),
 				ContainElement(responses.OpenSubsonicExtension{Name: "formPost", Versions: []int32{1}}),
-				ContainElement(responses.OpenSubsonicExtension{Name: "songLyrics", Versions: []int32{1}}),
+				ContainElement(responses.OpenSubsonicExtension{Name: "songLyrics", Versions: []int32{1, 2}}),
 				ContainElement(responses.OpenSubsonicExtension{Name: "indexBasedQueue", Versions: []int32{1}}),
 				ContainElement(responses.OpenSubsonicExtension{Name: "transcoding", Versions: []int32{1}}),
 				ContainElement(responses.OpenSubsonicExtension{Name: "sonicSimilarity", Versions: []int32{1}}),
--- a/server/subsonic/responses/responses.go
+++ b/server/subsonic/responses/responses.go
@ -543,13 +543,39 @@ type Line struct {
 	Value string `xml:",chardata"            json:"value"`
 }
 // LyricCue is one timed fragment of a cue line. Start is always serialized,
 // End only when set; ByteStart and ByteEnd locate the fragment in the parent
 // line's value.
 // NOTE(review): the specs in this change use millisecond times and inclusive
 // byte offsets — confirm against the songLyrics v2 definition.
 type LyricCue struct {
 	Start     int64  `xml:"start,attr"           json:"start"`
 	End       *int64 `xml:"end,attr,omitempty"   json:"end,omitempty"`
 	ByteStart int    `xml:"byteStart,attr"       json:"byteStart"`
 	ByteEnd   int    `xml:"byteEnd,attr"         json:"byteEnd"`
 	Value     string `xml:",chardata"            json:"value"`
 }
 // Agent describes a lyric agent by ID and role, with an optional display name
 // that is omitted from the output when empty.
 // NOTE(review): presumably the target of CueLine.AgentID — confirm.
 type Agent struct {
 	ID   string `xml:"id,attr"                 json:"id"`
 	Role string `xml:"role,attr"               json:"role"`
 	Name string `xml:"name,attr,omitempty"     json:"name,omitempty"`
 }
 // CueLine carries the cue-level (sub-line) timing of a lyric line: the line
 // text in Value, optional start/end times, an optional agent ID, and the cues.
 // NOTE(review): Index presumably refers to the matching entry of
 // StructuredLyric.Line — confirm.
 type CueLine struct {
 	Index   int32      `xml:"index,attr"                    json:"index"`
 	Start   *int64     `xml:"start,attr,omitempty"          json:"start,omitempty"`
 	End     *int64     `xml:"end,attr,omitempty"            json:"end,omitempty"`
 	Value   string     `xml:"value,attr"                    json:"value"`
 	AgentID string     `xml:"agentId,attr,omitempty"        json:"agentId,omitempty"`
 	Cue     []LyricCue `xml:"cue,omitempty"                 json:"cue,omitempty"`
 }
 // StructuredLyric is a single lyric track in a lyrics response: display
 // metadata, language, sync flag and the line list, plus the optional kind,
 // agents and cue lines. Empty optional fields are omitted from the output.
 type StructuredLyric struct {
-	DisplayArtist string `xml:"displayArtist,attr,omitempty" json:"displayArtist,omitempty"`
+	DisplayArtist string    `xml:"displayArtist,attr,omitempty" json:"displayArtist,omitempty"`
-	DisplayTitle  string `xml:"displayTitle,attr,omitempty"  json:"displayTitle,omitempty"`
+	DisplayTitle  string    `xml:"displayTitle,attr,omitempty"  json:"displayTitle,omitempty"`
-	Lang          string `xml:"lang,attr"                    json:"lang"`
+	Kind          string    `xml:"kind,attr,omitempty"          json:"kind,omitempty"`
-	Line          []Line `xml:"line"                         json:"line"`
+	Lang          string    `xml:"lang,attr"                    json:"lang"`
-	Offset        *int64 `xml:"offset,attr,omitempty"        json:"offset,omitempty"`
+	Line          []Line    `xml:"line"                         json:"line"`
-	Synced        bool   `xml:"synced,attr"                  json:"synced"`
+	Agents        []Agent   `xml:"agent,omitempty"              json:"agents,omitempty"`
 	CueLine       []CueLine `xml:"cueLine,omitempty"     json:"cueLine,omitempty"`
 	Offset        *int64    `xml:"offset,attr,omitempty"        json:"offset,omitempty"`
 	Synced        bool      `xml:"synced,attr"                  json:"synced"`
 }
 // StructuredLyrics is a list of StructuredLyric entries.
 type StructuredLyrics []StructuredLyric
--- a/tests/fixtures/bom-test.ttml
+++ b/tests/fixtures/bom-test.ttml
@ -0,0 +1,2 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml"><body><div xml:lang="eng"><p begin="00:00:00.00">BOM test line</p></div></body></tt>
--- a/tests/fixtures/bom-utf16-test.ttml
+++ b/tests/fixtures/bom-utf16-test.ttml
--- a/tests/fixtures/test-enhanced.lrc
+++ b/tests/fixtures/test-enhanced.lrc
@ -0,0 +1,6 @@
 [ar:Test Artist]
 [ti:Enhanced Test]
 [lang:eng]
 [00:01.00]<00:01.00>Some <00:01.50>lyrics <00:02.00>here
 [00:03.00]<00:03.00>More <00:03.50>words
 [00:05.00]Plain line without inline markers
--- a/tests/fixtures/test-metadata.ttml
+++ b/tests/fixtures/test-metadata.ttml
@ -0,0 +1,25 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml" xmlns:itunes="http://music.apple.com/lyric-ttml-internal">
  <head>
    <metadata>
      <iTunesMetadata xmlns="http://music.apple.com/lyric-ttml-internal">
        <translations>
          <translation xml:lang="es">
            <text for="L1">Hola</text>
          </translation>
        </translations>
        <transliterations>
          <transliteration xml:lang="ja-Latn">
            <text for="L2"><span begin="00:02.000" end="00:02.300" xmlns="http://www.w3.org/ns/ttml">ko</span><span begin="00:02.300" end="00:02.600" xmlns="http://www.w3.org/ns/ttml">nni</span></text>
          </transliteration>
        </transliterations>
      </iTunesMetadata>
    </metadata>
  </head>
  <body xml:lang="ja">
    <div>
      <p begin="00:01.000" end="00:01.500" itunes:key="L1">こんにちは</p>
      <p begin="00:02.000" end="00:02.700" itunes:key="L2">こんばんは</p>
    </div>
  </body>
 </tt>
--- a/tests/fixtures/test.elrc
+++ b/tests/fixtures/test.elrc
@ -0,0 +1,5 @@
 [ar:ELRC Artist]
 [ti:ELRC Song]
 [lang:eng]
 [00:01.00]<00:01.00>Lead <00:01.50>words
 [00:03.00]Fallback line
--- a/tests/fixtures/test.srt
+++ b/tests/fixtures/test.srt
@ -0,0 +1,7 @@
 1
 00:00:18,800 --> 00:00:22,800
 We're from subtitles
 2
 00:00:22,801 --> 00:00:26,000
 Another subtitle line
--- a/tests/fixtures/test.ttml
+++ b/tests/fixtures/test.ttml
@ -0,0 +1,12 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttp="http://www.w3.org/ns/ttml#parameter" ttp:frameRate="30" ttp:subFrameRate="2" ttp:tickRate="10">
  <body>
    <div xml:lang="eng">
      <p begin="00:00:18.80">We're no strangers to love</p>
      <p begin="00:00:22:24">You know the rules and so do I</p>
    </div>
    <div xml:lang="por">
      <p begin="188t">Nao somos estranhos ao amor</p>
    </div>
  </body>
 </tt>
--- a/ui/src/actions/player.js
+++ b/ui/src/actions/player.js
@ -9,6 +9,7 @@ export const PLAYER_SET_VOLUME = 'PLAYER_SET_VOLUME'
 export const PLAYER_SET_MODE = 'PLAYER_SET_MODE'
 export const TRANSCODING_SET_PROFILE = 'TRANSCODING_SET_PROFILE'
 export const PLAYER_REFRESH_QUEUE = 'PLAYER_REFRESH_QUEUE'
 export const PLAYER_UPDATE_LYRIC = 'PLAYER_UPDATE_LYRIC'
 export const setTrack = (data) => ({
  type: PLAYER_SET_TRACK,
@ -114,3 +115,8 @@ export const refreshQueue = (resolvedUrls) => ({
  type: PLAYER_REFRESH_QUEUE,
  data: resolvedUrls,
 })
 // Builds a PLAYER_UPDATE_LYRIC action whose payload pairs a track id with a lyric.
 export const updateQueueLyric = (trackId, lyric) => {
  const data = { trackId, lyric }
  return { type: PLAYER_UPDATE_LYRIC, data }
 }
--- a/ui/src/audioplayer/KaraokeLyricsOverlay.jsx
+++ b/ui/src/audioplayer/KaraokeLyricsOverlay.jsx
--- a/ui/src/audioplayer/KaraokeLyricsOverlay.test.jsx
+++ b/ui/src/audioplayer/KaraokeLyricsOverlay.test.jsx
@ -0,0 +1,514 @@
 import React from 'react'
 import {
  cleanup,
  fireEvent,
  render,
  screen,
  waitFor,
 } from '@testing-library/react'
 import KaraokeLyricsOverlay from './KaraokeLyricsOverlay'
 // Text shown by the line-height readout before any change...
 const DEFAULT_LINE_HEIGHT_TEXT = '1.30'
 // ...and after one ArrowRight key press on the line-height slider.
 const NEXT_LINE_HEIGHT_TEXT = '1.32'
 // Stand-in for the player's audio element, passed to the overlay by default.
 // Individual specs spread it and override currentTime.
 // NOTE(review): currentTime looks like seconds while lyric timings are in
 // milliseconds (e.g. 1.2 is paired with a line starting at 1000) — confirm.
 const audioInstance = {
  currentTime: 0,
  paused: true,
  seeking: false,
  playbackRate: 1,
 }
 // Creates a minimal synced lyric: a single line starting at 1000 with the given text.
 const buildLyric = (kind, lang, value) => {
  const onlyLine = { start: 1000, value }
  return { kind, lang, synced: true, line: [onlyLine] }
 }
 const renderOverlay = (props = {}) =>
  render(
    <KaraokeLyricsOverlay
      visible={true}
      mainLyric={buildLyric('main', 'ja', 'こんにちは')}
      translationLyric={buildLyric('translation', 'en', 'Hello')}
      pronunciationLyric={buildLyric('pronunciation', 'ja-Latn', 'konnichiwa')}
      showTranslation={false}
      showPronunciation={true}
      translationEnabled={true}
      pronunciationEnabled={true}
      onToggleTranslation={() => {}}
      onTogglePronunciation={() => {}}
      audioInstance={audioInstance}
      onClose={() => {}}
      {...props}
    />,
  )
 // Behavioral specs for the karaoke lyrics overlay: control tooltips, layer
 // ordering, cue segmentation, active-line styling, and persisted appearance.
 describe('<KaraokeLyricsOverlay /> behavior', () => {
  beforeEach(() => {
    localStorage.clear()
    window.innerWidth = 1200
    window.innerHeight = 900
    // The stub returns an id without invoking the callback, so no animation
    // frame ever runs during a spec.
    vi.spyOn(window, 'requestAnimationFrame').mockImplementation(() => 1)
    vi.spyOn(window, 'cancelAnimationFrame').mockImplementation(() => {})
  })
  afterEach(() => {
    vi.restoreAllMocks()
    cleanup()
  })
  // Defaults render with translation hidden and pronunciation shown, hence
  // "Show translation" versus "Hide pronunciation".
  it('shows tooltips for translation, pronunciation, and appearance controls', async () => {
    renderOverlay()
    fireEvent.mouseOver(screen.getByTestId('lyrics-language-badge-tr'))
    expect(await screen.findByText('Show translation')).toBeInTheDocument()
    fireEvent.mouseOver(screen.getByTestId('lyrics-language-badge-pr'))
    expect(await screen.findByText('Hide pronunciation')).toBeInTheDocument()
    fireEvent.mouseOver(screen.getByTestId('lyrics-settings-button'))
    expect(await screen.findByText('Appearance')).toBeInTheDocument()
  })
  it('renders inline mode without the desktop resize handle', () => {
    renderOverlay({ inline: true })
    expect(screen.getByTestId('karaoke-lyrics-overlay')).toHaveAttribute(
      'data-inline',
      'true',
    )
    expect(screen.queryByTestId('lyrics-resize-handle')).not.toBeInTheDocument()
  })
  // The stored settings below have no lineHeight key, so the default readout
  // is expected.
  it('renders the appearance popup with Main label and default line height for older settings', async () => {
    localStorage.setItem(
      'karaoke-lyrics-settings',
      JSON.stringify({
        tr: { fontSize: 16, colorKey: 'blue' },
        main: { fontSize: 26, colorKey: 'white' },
        pr: { fontSize: 15, colorKey: 'green' },
      }),
    )
    renderOverlay()
    fireEvent.click(screen.getByTestId('lyrics-settings-button'))
    expect(await screen.findByText('Appearance')).toBeInTheDocument()
    expect(screen.getByText('Main', { selector: 'div' })).toBeInTheDocument()
    expect(screen.queryByText('Default')).not.toBeInTheDocument()
    expect(screen.getByTestId('lyrics-reset-appearance')).toBeInTheDocument()
    expect(screen.getByTestId('lyrics-line-height-value')).toHaveTextContent(
      DEFAULT_LINE_HEIGHT_TEXT,
    )
  })
  it('renders the lyric group in main, pronunciation, translation order with layer badges', () => {
    renderOverlay({
      showTranslation: true,
      showPronunciation: true,
    })
    const mainLine = screen.getByText('こんにちは')
    const pronunciationLine = screen.getByText('konnichiwa')
    const translationLine = screen.getByText('Hello')
    // DOCUMENT_POSITION_FOLLOWING is set when the argument comes after the
    // receiver in document order.
    expect(
      mainLine.compareDocumentPosition(pronunciationLine) &
        Node.DOCUMENT_POSITION_FOLLOWING,
    ).toBeTruthy()
    expect(
      pronunciationLine.compareDocumentPosition(translationLine) &
        Node.DOCUMENT_POSITION_FOLLOWING,
    ).toBeTruthy()
    // Each badge's text content is its layer label followed by the language tag.
    expect(screen.getByTestId('lyrics-language-badge-main')).toHaveTextContent(
      'Mainja',
    )
    expect(screen.getByTestId('lyrics-language-badge-pr')).toHaveTextContent(
      'PRja-Latn',
    )
    expect(screen.getByTestId('lyrics-language-badge-tr')).toHaveTextContent(
      'TRen',
    )
  })
  // None of these lyrics carry cueLine data, so each row must stay one span.
  it('renders line-timed rows as whole-line spans without synthetic token splits', () => {
    renderOverlay({
      mainLyric: {
        kind: 'main',
        lang: 'en',
        synced: true,
        line: [
          { start: 1000, end: 2400, value: 'Batter up, batter up, batter up' },
        ],
      },
      translationLyric: {
        kind: 'translation',
        lang: 'ja',
        synced: true,
        line: [
          {
            start: 1000,
            end: 2400,
            value: 'バッターアップ、バッターアップ、バッターアップ',
          },
        ],
      },
      pronunciationLyric: {
        kind: 'pronunciation',
        lang: 'ja-Latn',
        synced: true,
        line: [
          {
            start: 1000,
            end: 2400,
            value: 'Battaa appu, battaa appu, battaa appu',
          },
        ],
      },
      showTranslation: true,
      showPronunciation: true,
    })
    const mainLine = screen.getByText(
      'Batter up, batter up, batter up',
    ).parentElement
    const pronunciationLine = screen.getByText(
      'Battaa appu, battaa appu, battaa appu',
    ).parentElement
    const translationLine = screen.getByText(
      'バッターアップ、バッターアップ、バッターアップ',
    ).parentElement
    expect(mainLine.querySelectorAll('span')).toHaveLength(1)
    expect(pronunciationLine.querySelectorAll('span')).toHaveLength(1)
    expect(translationLine.querySelectorAll('span')).toHaveLength(1)
  })
  // The 'love' cue points at bytes 8-11, the second occurrence; the first,
  // uncued "love" is expected to stay inside the gap segment ' love '.
  it('uses cue byte offsets to segment repeated words in the karaoke line', () => {
    renderOverlay({
      mainLyric: {
        kind: 'main',
        lang: 'en',
        synced: true,
        line: [{ start: 0, end: 2400, value: 'Oh love love me tonight' }],
        cueLine: [
          {
            index: 0,
            start: 0,
            end: 2400,
            value: 'Oh love love me tonight',
            cue: [
              { start: 0, end: 300, value: 'Oh', byteStart: 0, byteEnd: 1 },
              {
                start: 900,
                end: 1300,
                value: 'love',
                byteStart: 8,
                byteEnd: 11,
              },
              {
                start: 1300,
                end: 1600,
                value: 'me',
                byteStart: 13,
                byteEnd: 14,
              },
              {
                start: 1600,
                end: 2400,
                value: 'tonight',
                byteStart: 16,
                byteEnd: 22,
              },
            ],
          },
        ],
      },
      translationLyric: null,
      pronunciationLyric: null,
      showTranslation: false,
      showPronunciation: false,
      translationEnabled: false,
      pronunciationEnabled: false,
      audioInstance: {
        ...audioInstance,
        currentTime: 1.0,
      },
    })
    const mainLine = screen.getByText('Oh').parentElement
    const segments = Array.from(mainLine.querySelectorAll('span')).map(
      (span) => span.textContent,
    )
    expect(segments).toEqual([
      'Oh',
      ' love ',
      'love',
      ' ',
      'me',
      ' ',
      'tonight',
    ])
  })
  // Here the spaces are cues of their own (single-byte ranges 6 and 10), so
  // they must come out as separate segments between the Hangul cues.
  it('uses cue byte offsets to preserve explicit space cues in multibyte karaoke lines', () => {
    renderOverlay({
      mainLyric: {
        kind: 'main',
        lang: 'ko',
        synced: true,
        line: [{ start: 0, end: 900, value: '눈을 뜬 순간' }],
        cueLine: [
          {
            index: 0,
            start: 0,
            end: 900,
            value: '눈을 뜬 순간',
            cue: [
              { start: 0, end: 150, value: '눈을', byteStart: 0, byteEnd: 5 },
              { start: 150, end: 250, value: ' ', byteStart: 6, byteEnd: 6 },
              { start: 250, end: 450, value: '뜬', byteStart: 7, byteEnd: 9 },
              { start: 450, end: 550, value: ' ', byteStart: 10, byteEnd: 10 },
              { start: 550, end: 900, value: '순간', byteStart: 11, byteEnd: 16 },
            ],
          },
        ],
      },
      translationLyric: null,
      pronunciationLyric: null,
      showTranslation: false,
      showPronunciation: false,
      translationEnabled: false,
      pronunciationEnabled: false,
      audioInstance: {
        ...audioInstance,
        currentTime: 0.3,
      },
    })
    const mainLine = screen.getByText('눈을').parentElement
    const segments = Array.from(mainLine.querySelectorAll('span')).map(
      (span) => span.textContent,
    )
    expect(segments).toEqual(['눈을', ' ', '뜬', ' ', '순간'])
  })
  // currentTime 1.2 falls inside the first line (1000-1800), so the first
  // auxiliary rows should be more opaque than the second ones.
  it('highlights line-timed pronunciation and translation rows with the active main line', () => {
    renderOverlay({
      mainLyric: {
        kind: 'main',
        lang: 'en',
        synced: true,
        line: [
          { start: 1000, end: 1800, value: 'Line one' },
          { start: 2500, end: 3300, value: 'Line two' },
        ],
      },
      translationLyric: {
        kind: 'translation',
        lang: 'ja',
        synced: true,
        line: [
          { start: 1000, end: 1800, value: '一行目' },
          { start: 2500, end: 3300, value: '二行目' },
        ],
      },
      pronunciationLyric: {
        kind: 'pronunciation',
        lang: 'ja-Latn',
        synced: true,
        line: [
          { start: 1000, end: 1800, value: 'ichigyoume' },
          { start: 2500, end: 3300, value: 'nigyoume' },
        ],
      },
      showTranslation: true,
      showPronunciation: true,
      audioInstance: {
        ...audioInstance,
        currentTime: 1.2,
      },
    })
    const activePronunciation = screen.getByText('ichigyoume').parentElement
    const inactivePronunciation = screen.getByText('nigyoume').parentElement
    const activeTranslation = screen.getByText('一行目').parentElement
    const inactiveTranslation = screen.getByText('二行目').parentElement
    expect(parseFloat(activePronunciation.style.opacity)).toBeGreaterThan(
      parseFloat(inactivePronunciation.style.opacity),
    )
    expect(parseFloat(activeTranslation.style.opacity)).toBeGreaterThan(
      parseFloat(inactiveTranslation.style.opacity),
    )
  })
  // The active line gets a larger font and the full width; the inactive line
  // is limited to 80% width.
  it('pre-wraps inactive main lines so the active line keeps the same wrap shape', () => {
    renderOverlay({
      mainLyric: {
        kind: 'main',
        lang: 'en',
        synced: true,
        line: [
          { start: 1000, end: 1800, value: 'First line that is getting focus' },
          { start: 2500, end: 3300, value: 'Second line waiting below' },
        ],
      },
      translationLyric: null,
      pronunciationLyric: null,
      showTranslation: false,
      showPronunciation: false,
      translationEnabled: false,
      pronunciationEnabled: false,
      audioInstance: {
        ...audioInstance,
        currentTime: 1.2,
      },
    })
    const activeLine = screen.getByText('First line that is getting focus')
      .parentElement
    const inactiveLine = screen.getByText('Second line waiting below')
      .parentElement
    expect(parseFloat(activeLine.style.fontSize)).toBeGreaterThan(
      parseFloat(inactiveLine.style.fontSize),
    )
    expect(activeLine.style.maxWidth).toBe('100%')
    expect(inactiveLine.style.maxWidth).toBe('80%')
  })
  it('centers pronunciation text inside the pill container', () => {
    renderOverlay({
      showTranslation: false,
      showPronunciation: true,
    })
    const pronunciationLine = screen.getByText('konnichiwa').parentElement
    const styles = window.getComputedStyle(pronunciationLine)
    expect(styles.display).toBe('inline-flex')
    expect(styles.justifyContent).toBe('center')
    expect(styles.alignItems).toBe('center')
  })
  // Unsynced lyrics have no start times, so no line may be singled out: both
  // lines must share full opacity and the same color.
  it('renders untimed text lyrics in manual reading mode without a pinned active line', () => {
    renderOverlay({
      mainLyric: {
        kind: 'main',
        lang: 'en',
        synced: false,
        line: [{ value: 'First plain line' }, { value: 'Second plain line' }],
      },
      translationLyric: null,
      pronunciationLyric: null,
      showTranslation: false,
      showPronunciation: false,
      translationEnabled: false,
      pronunciationEnabled: false,
    })
    const firstLine = screen.getByText('First plain line').parentElement
    const secondLine = screen.getByText('Second plain line').parentElement
    expect(firstLine.style.opacity).toBe('1')
    expect(secondLine.style.opacity).toBe('1')
    expect(firstLine.style.color).toBe(secondLine.style.color)
  })
  it('persists line height changes, keeps aux line spacing fixed, and stores overlay height', async () => {
    renderOverlay({
      mainLyric: buildLyric('main', 'en', 'Hello world'),
      translationLyric: buildLyric('translation', 'es', 'Hola'),
      pronunciationLyric: buildLyric('pronunciation', 'en-Latn', 'heh-loh'),
      showTranslation: true,
      showPronunciation: true,
      translationEnabled: true,
      pronunciationEnabled: true,
    })
    const overlay = screen.getByTestId('karaoke-lyrics-overlay')
    const mainLine = screen.getByText('Hello world').parentElement
    const pronunciationLine = screen.getByText('heh-loh').parentElement
    expect(mainLine).toHaveStyle(`line-height: ${DEFAULT_LINE_HEIGHT_TEXT}`)
    expect(pronunciationLine).toHaveStyle('line-height: 1.2')
    fireEvent.click(screen.getByTestId('lyrics-settings-button'))
    const slider = screen.getByRole('slider', { name: 'Line height' })
    slider.focus()
    fireEvent.keyDown(slider, { key: 'ArrowRight' })
    await waitFor(() =>
      expect(screen.getByTestId('lyrics-line-height-value')).toHaveTextContent(
        NEXT_LINE_HEIGHT_TEXT,
      ),
    )
    await waitFor(() =>
      expect(mainLine).toHaveStyle(`line-height: ${NEXT_LINE_HEIGHT_TEXT}`),
    )
    expect(pronunciationLine).toHaveStyle('line-height: 1.2')
    // Dragging the handle 40px upward (400 -> 360) is expected to grow the
    // overlay from 300px to 340px.
    fireEvent.mouseDown(screen.getByTestId('lyrics-resize-handle'), {
      clientY: 400,
    })
    fireEvent.mouseMove(window, { clientY: 360 })
    fireEvent.mouseUp(window)
    await waitFor(() => expect(overlay).toHaveStyle('height: 340px'))
    const stored = JSON.parse(localStorage.getItem('karaoke-lyrics-settings'))
    expect(stored.lineHeight).toBeCloseTo(1.32, 2)
    expect(stored.overlayHeight).toBe(340)
  })
  // Starts from non-default stored values and checks that reset restores a
  // 1.30 line height and a 300px overlay, both on screen and in localStorage.
  it('resets appearance back to the default spacing and overlay height', async () => {
    localStorage.setItem(
      'karaoke-lyrics-settings',
      JSON.stringify({
        lineHeight: 1.8,
        overlayHeight: 420,
        tr: { fontSize: 16, colorKey: 'yellow' },
        main: { fontSize: 28, colorKey: 'cyan' },
        pr: { fontSize: 15, colorKey: 'pink' },
      }),
    )
    renderOverlay({
      mainLyric: buildLyric('main', 'en', 'Hello world'),
      translationLyric: null,
      pronunciationLyric: null,
      showPronunciation: false,
      translationEnabled: false,
      pronunciationEnabled: false,
    })
    const overlay = screen.getByTestId('karaoke-lyrics-overlay')
    const mainLine = screen.getByText('Hello world').parentElement
    expect(overlay).toHaveStyle('height: 420px')
    expect(mainLine).toHaveStyle('line-height: 1.8')
    fireEvent.click(screen.getByTestId('lyrics-settings-button'))
    fireEvent.click(screen.getByTestId('lyrics-reset-appearance'))
    await waitFor(() =>
      expect(screen.getByTestId('lyrics-line-height-value')).toHaveTextContent(
        DEFAULT_LINE_HEIGHT_TEXT,
      ),
    )
    await waitFor(() => expect(overlay).toHaveStyle('height: 300px'))
    await waitFor(() =>
      expect(mainLine).toHaveStyle(`line-height: ${DEFAULT_LINE_HEIGHT_TEXT}`),
    )
    const stored = JSON.parse(localStorage.getItem('karaoke-lyrics-settings'))
    expect(stored.lineHeight).toBeCloseTo(1.3, 2)
    expect(stored.overlayHeight).toBe(300)
  })
 })
--- a/ui/src/audioplayer/MobileKaraokeLyricsPortal.jsx
+++ b/ui/src/audioplayer/MobileKaraokeLyricsPortal.jsx
@ -0,0 +1,65 @@
 import React, { useEffect, useState } from 'react'
 import { createPortal } from 'react-dom'
 // CSS selector used to look up the DOM element that hosts the inline lyrics.
 export const MOBILE_KARAOKE_LYRICS_HOST_SELECTOR =
  '.react-jinke-music-player-mobile-cover'
 // Class name toggled on the host element while the lyrics portal is active.
 export const MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS = 'nd-mobile-lyrics-active'
 // Looks up the current lyrics host element, or returns null when there is no
 // `document` global (non-browser environments) or no matching element.
 const resolveMobileLyricsHost = () =>
  typeof document === 'undefined'
    ? null
    : document.querySelector(MOBILE_KARAOKE_LYRICS_HOST_SELECTOR)
 // Renders `children` through a React portal into the element matched by
 // MOBILE_KARAOKE_LYRICS_HOST_SELECTOR while `active` is truthy, and marks that
 // element with MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS. Renders nothing when
 // inactive or when no host element is currently in the document.
 const MobileKaraokeLyricsPortal = ({ active, children }) => {
  const [host, setHost] = useState(() =>
    active ? resolveMobileLyricsHost() : null,
  )
  // Keep `host` in sync with the DOM: the host element may be added or removed
  // after mount, so re-resolve it on every mutation under <body>.
  useEffect(() => {
    const canTrackHost = typeof document !== 'undefined' && active
    if (!canTrackHost) {
      setHost(null)
      return undefined
    }
    const refreshHost = () => {
      setHost(resolveMobileLyricsHost())
    }
    refreshHost()
    const domWatcher = new MutationObserver(refreshHost)
    domWatcher.observe(document.body, { childList: true, subtree: true })
    return () => domWatcher.disconnect()
  }, [active])
  // Mirror `active` onto the host's class list; always remove it on cleanup.
  useEffect(() => {
    if (host) {
      host.classList.toggle(MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS, active)
      return () => {
        host.classList.remove(MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS)
      }
    }
    return undefined
  }, [active, host])
  return active && host ? createPortal(children, host) : null
 }
 export default MobileKaraokeLyricsPortal
--- a/ui/src/audioplayer/MobileKaraokeLyricsPortal.test.jsx
+++ b/ui/src/audioplayer/MobileKaraokeLyricsPortal.test.jsx
@ -0,0 +1,55 @@
 import React from 'react'
 import { cleanup, render, screen, waitFor } from '@testing-library/react'
 import MobileKaraokeLyricsPortal, {
  MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS,
 } from './MobileKaraokeLyricsPortal'
 const HOST_CLASS = 'react-jinke-music-player-mobile-cover'
 // Tests for MobileKaraokeLyricsPortal: portal placement inside the host
 // element and toggling of the active class on that host.
 describe('<MobileKaraokeLyricsPortal />', () => {
  afterEach(() => {
    cleanup()
    // Drop any host elements the tests appended directly to <body>.
    document.body.innerHTML = ''
  })
  it('renders lyrics into the mobile cover host and toggles the active class', () => {
    // Host exists before the portal mounts.
    const host = document.createElement('div')
    host.className = HOST_CLASS
    document.body.appendChild(host)
    const { rerender } = render(
      <MobileKaraokeLyricsPortal active={true}>
        <div data-testid="mobile-inline-lyrics">Lyrics</div>
      </MobileKaraokeLyricsPortal>,
    )
    expect(host).toContainElement(screen.getByTestId('mobile-inline-lyrics'))
    expect(host).toHaveClass(MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS)
    // Deactivating must unmount the children and clear the class.
    rerender(
      <MobileKaraokeLyricsPortal active={false}>
        <div data-testid="mobile-inline-lyrics">Lyrics</div>
      </MobileKaraokeLyricsPortal>,
    )
    expect(screen.queryByTestId('mobile-inline-lyrics')).not.toBeInTheDocument()
    expect(host).not.toHaveClass(MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS)
  })
  it('attaches when the mobile cover host appears after mount', async () => {
    render(
      <MobileKaraokeLyricsPortal active={true}>
        <div data-testid="mobile-inline-lyrics">Lyrics</div>
      </MobileKaraokeLyricsPortal>,
    )
    // Host is added only after the portal has mounted; the component is
    // expected to pick it up asynchronously, hence the waitFor below.
    const host = document.createElement('div')
    host.className = HOST_CLASS
    document.body.appendChild(host)
    await waitFor(() =>
      expect(host).toContainElement(screen.getByTestId('mobile-inline-lyrics')),
    )
    expect(host).toHaveClass(MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS)
  })
 })
--- a/ui/src/audioplayer/Player.jsx
+++ b/ui/src/audioplayer/Player.jsx
@ -22,6 +22,7 @@ import {
  refreshQueue,
  setPlayMode,
  setTranscodingProfile,
  updateQueueLyric,
  setVolume,
  syncQueue,
 } from '../actions'
@ -33,6 +34,30 @@ import { keyMap } from '../hotkeys'
 import keyHandlers from './keyHandlers'
 import { calculateGain } from '../utils/calculateReplayGain'
 import { detectBrowserProfile, decisionService } from '../transcode'
 import {
  getPreferredLyricLanguage,
  hasStructuredLyricContent,
  selectLyricLayers,
  structuredLyricToLrc,
 } from './lyrics'
 import {
  resolveLyricsOverlayState,
  togglePronunciationPreference,
 } from './lyricsOverlayState'
 import KaraokeLyricsOverlay from './KaraokeLyricsOverlay'
 import MobileKaraokeLyricsPortal from './MobileKaraokeLyricsPortal'
 // Shared "no lyrics" value: every layer slot (main, translation,
 // pronunciation) is null.
 const emptyLyricLayers = {
  main: null,
  translation: null,
  pronunciation: null,
 }
 // Returns a fresh { main, translation, pronunciation } object built from
 // `layers`; any missing or falsy slot (or a nullish `layers`) becomes null.
 const normalizeLyricLayers = (layers) => {
  const slotOrNull = (slot) => layers?.[slot] || null
  return {
    main: slotOrNull('main'),
    translation: slotOrNull('translation'),
    pronunciation: slotOrNull('pronunciation'),
  }
 }
 const Player = () => {
  const theme = useCurrentTheme()
@ -120,6 +145,83 @@ const Player = () => {
  const gainInfo = useSelector((state) => state.replayGain)
  const [context, setContext] = useState(null)
  const [gainNode, setGainNode] = useState(null)
  const lyricCacheRef = useRef(new Map())
  const lyricRequestIdRef = useRef(0)
  const playerRef = useRef(null)
  const [karaokeVisiblePreference, setKaraokeVisiblePreference] =
    useState(false)
  const [selectedLyricLayers, setSelectedLyricLayers] =
    useState(emptyLyricLayers)
  const [translationPreference, setTranslationPreference] = useState(false)
  const [pronunciationPreference, setPronunciationPreference] = useState(null)
  const currentTrackId = playerState.current?.trackId
  const currentTrackIsRadio = playerState.current?.isRadio
  const selectedStructuredLyric = selectedLyricLayers.main
  const hasKaraokeLyric = hasStructuredLyricContent(selectedStructuredLyric)
  const hasTranslationLyric = hasStructuredLyricContent(
    selectedLyricLayers.translation,
  )
  const hasPronunciationLyric = hasStructuredLyricContent(
    selectedLyricLayers.pronunciation,
  )
  const { karaokeVisible, showTranslation, showPronunciation } =
    resolveLyricsOverlayState({
      karaokeVisiblePreference,
      translationPreference,
      pronunciationPreference,
      hasKaraokeLyric,
      hasTranslationLyric,
      hasPronunciationLyric,
    })
  const useInlineMobileLyrics = karaokeVisible && !isDesktop
  // Pushes `lyric` into the state of the object held by `playerRef` (the ref
  // attached to the player component): every `audioLists` entry whose trackId
  // matches gets the new lyric, and the top-level `lyric` field is refreshed
  // from the entry whose musicSrc matches the state's musicSrc.
  // No-op when trackId is falsy or the ref does not expose setState.
  const applyLyricToRuntimePlayer = useCallback((trackId, lyric) => {
    if (!trackId) {
      return
    }
    const player = playerRef.current
    if (!player || typeof player.setState !== 'function') {
      return
    }
    player.setState((prevState) => {
      const prevLists = Array.isArray(prevState.audioLists)
        ? prevState.audioLists
        : []
      // Tracks whether any list entry actually received a different lyric.
      let changed = false
      const audioLists = prevLists.map((item) => {
        if (item.trackId !== trackId) {
          return item
        }
        if (item.lyric === lyric) {
          return item
        }
        changed = true
        return {
          ...item,
          lyric,
        }
      })
      const currentItem = audioLists.find(
        (item) => item.musicSrc === prevState.musicSrc,
      )
      // Keep the previous top-level lyric unless the matching entry carries a
      // string lyric.
      const currentLyric =
        typeof currentItem?.lyric === 'string'
          ? currentItem.lyric
          : prevState.lyric
      // NOTE(review): returning null presumably skips the update, as with a
      // React class component's setState updater; confirm for this player.
      if (!changed && currentLyric === prevState.lyric) {
        return null
      }
      return {
        audioLists,
        lyric: currentLyric,
      }
    })
  }, [])
  useEffect(() => {
    if (
@ -166,6 +268,88 @@ const Player = () => {
    return () => window.removeEventListener('beforeunload', handleBeforeUnload)
  }, [playerState, audioInstance])
  // On track change, reset the selected lyric layers to whatever the
  // per-track cache already holds (or to the empty layers when there is no
  // track, the track is a radio, or nothing usable is cached).
  useEffect(() => {
    if (!currentTrackId || currentTrackIsRadio) {
      setSelectedLyricLayers(emptyLyricLayers)
      return
    }
    const cached = lyricCacheRef.current.get(currentTrackId)
    let layers = emptyLyricLayers
    // Cache entries may be plain strings (no layer data) or objects carrying
    // either `layers` or a single `structuredLyric`.
    if (cached && typeof cached !== 'string') {
      if (cached.layers) {
        layers = normalizeLyricLayers(cached.layers)
      } else if (cached.structuredLyric) {
        layers = normalizeLyricLayers({
          main: cached.structuredLyric,
        })
      }
    }
    setSelectedLyricLayers(layers)
  }, [currentTrackId, currentTrackIsRadio])
  // Loads lyrics for the current track: serves them from the per-track cache
  // when present, otherwise fetches them via subsonic.getLyricsBySongId,
  // caches the result, and pushes the LRC text into both the redux queue and
  // the runtime player.
  useEffect(() => {
    // Every run gets a new request id; responses that arrive after a later
    // run has started are ignored by comparing against this id.
    lyricRequestIdRef.current += 1
    const requestId = lyricRequestIdRef.current
    if (!currentTrackId || currentTrackIsRadio) {
      return
    }
    const cached = lyricCacheRef.current.get(currentTrackId)
    if (cached !== undefined) {
      // Cache hit: entries are either a plain LRC string or an object with
      // `lrc` plus `layers` / `structuredLyric`.
      const cachedLyric =
        typeof cached === 'string' ? cached : cached?.lrc || ''
      const cachedLayers =
        typeof cached === 'string'
          ? emptyLyricLayers
          : cached?.layers
            ? normalizeLyricLayers(cached.layers)
            : normalizeLyricLayers({ main: cached?.structuredLyric })
      setSelectedLyricLayers(cachedLayers)
      if (cachedLyric) {
        dispatch(updateQueueLyric(currentTrackId, cachedLyric))
        applyLyricToRuntimePlayer(currentTrackId, cachedLyric)
      }
      return
    }
    subsonic
      .getLyricsBySongId(currentTrackId)
      .then((resp) => {
        // Drop responses that belong to an outdated request.
        if (lyricRequestIdRef.current !== requestId) {
          return
        }
        const structuredLyrics =
          resp?.json?.['subsonic-response']?.lyricsList?.structuredLyrics || []
        const layers = selectLyricLayers(
          structuredLyrics,
          getPreferredLyricLanguage(),
        )
        const lyric = layers.main ? structuredLyricToLrc(layers.main) : ''
        // Successful responses are cached even when they contain no lyrics.
        lyricCacheRef.current.set(currentTrackId, {
          lrc: lyric,
          layers,
        })
        setSelectedLyricLayers(layers)
        if (lyric !== '') {
          dispatch(updateQueueLyric(currentTrackId, lyric))
          applyLyricToRuntimePlayer(currentTrackId, lyric)
        }
      })
      .catch(() => {
        if (lyricRequestIdRef.current !== requestId) {
          return
        }
        setSelectedLyricLayers(emptyLyricLayers)
        // Do not cache network/request failures as empty lyrics, so we can retry.
        lyricCacheRef.current.delete(currentTrackId)
      })
  }, [dispatch, currentTrackId, currentTrackIsRadio, applyLyricToRuntimePlayer])
  const defaultOptions = useMemo(
    () => ({
      theme: playerTheme,
@ -177,7 +361,7 @@ const Player = () => {
      clearPriorAudioLists: false,
      showDestroy: true,
      showDownload: false,
-      showLyric: true,
+      showLyric: false,
      showReload: false,
      toggleMode: !isDesktop,
      glassBg: false,
@ -215,12 +399,26 @@ const Player = () => {
        (playerState.clear || playerState.playIndex === 0),
      clearPriorAudioLists: playerState.clear,
      extendsContent: (
-        <PlayerToolbar id={current.trackId} isRadio={current.isRadio} />
+        <PlayerToolbar
          id={current.trackId}
          isRadio={current.isRadio}
          onToggleLyrics={() =>
            setKaraokeVisiblePreference((visible) => !visible)
          }
          lyricsActive={karaokeVisible}
          lyricsDisabled={!hasKaraokeLyric}
        />
      ),
      defaultVolume: isMobilePlayer ? 1 : playerState.volume,
      showMediaSession: !current.isRadio,
    }
-  }, [playerState, defaultOptions, isMobilePlayer])
+  }, [
    playerState,
    defaultOptions,
    isMobilePlayer,
    karaokeVisible,
    hasKaraokeLyric,
  ])
  const onAudioListsChange = useCallback(
    (_, audioLists, audioInfo) => dispatch(syncQueue(audioInfo, audioLists)),
@ -340,10 +538,13 @@ const Player = () => {
  )
  const onCoverClick = useCallback((mode, audioLists, audioInfo) => {
    if (!isDesktop && karaokeVisible) {
      return
    }
    if (mode === 'full' && audioInfo?.song?.albumId) {
      window.location.href = `#/album/${audioInfo.song.albumId}/show`
    }
-  }, [])
+  }, [isDesktop, karaokeVisible])
  const onAudioError = useCallback(
    (error, currentPlayId, audioLists, audioInfo) => {
@ -392,6 +593,7 @@ const Player = () => {
  return (
    <ThemeProvider theme={createMuiTheme(theme)}>
      <ReactJkMusicPlayer
        ref={playerRef}
        {...options}
        className={classes.player}
        onAudioListsChange={onAudioListsChange}
@ -407,6 +609,55 @@ const Player = () => {
        onBeforeDestroy={onBeforeDestroy}
        getAudioInstance={setAudioInstance}
      />
      {isDesktop && (
        <KaraokeLyricsOverlay
          visible={karaokeVisible}
          mainLyric={selectedLyricLayers.main}
          translationLyric={selectedLyricLayers.translation}
          pronunciationLyric={selectedLyricLayers.pronunciation}
          showTranslation={showTranslation}
          showPronunciation={showPronunciation}
          translationEnabled={hasTranslationLyric}
          pronunciationEnabled={hasPronunciationLyric}
          onToggleTranslation={() =>
            setTranslationPreference((previous) =>
              hasTranslationLyric ? !previous : false,
            )
          }
          onTogglePronunciation={() =>
            setPronunciationPreference((previous) =>
              togglePronunciationPreference(previous, hasPronunciationLyric),
            )
          }
          audioInstance={audioInstance}
          onClose={() => setKaraokeVisiblePreference(false)}
        />
      )}
      <MobileKaraokeLyricsPortal active={useInlineMobileLyrics}>
        <KaraokeLyricsOverlay
          visible={useInlineMobileLyrics}
          inline={true}
          mainLyric={selectedLyricLayers.main}
          translationLyric={selectedLyricLayers.translation}
          pronunciationLyric={selectedLyricLayers.pronunciation}
          showTranslation={showTranslation}
          showPronunciation={showPronunciation}
          translationEnabled={hasTranslationLyric}
          pronunciationEnabled={hasPronunciationLyric}
          onToggleTranslation={() =>
            setTranslationPreference((previous) =>
              hasTranslationLyric ? !previous : false,
            )
          }
          onTogglePronunciation={() =>
            setPronunciationPreference((previous) =>
              togglePronunciationPreference(previous, hasPronunciationLyric),
            )
          }
          audioInstance={audioInstance}
          onClose={() => setKaraokeVisiblePreference(false)}
        />
      </MobileKaraokeLyricsPortal>
      <GlobalHotKeys handlers={handlers} keyMap={keyMap} allowChanges />
    </ThemeProvider>
  )
--- a/ui/src/audioplayer/Player.lyricsState.test.jsx
+++ b/ui/src/audioplayer/Player.lyricsState.test.jsx
@ -0,0 +1,77 @@
 import {
  resolveLyricsOverlayState,
  togglePronunciationPreference,
 } from './lyricsOverlayState'
 // Tests for the pure helpers in ./lyricsOverlayState: they check that user
 // preferences are kept separately from what the current track can show.
 describe('Player lyrics state helpers', () => {
  it('keeps the lyrics window preference across track changes in the session', () => {
    const visibleOnCurrentTrack = resolveLyricsOverlayState({
      karaokeVisiblePreference: true,
      translationPreference: false,
      pronunciationPreference: null,
      hasKaraokeLyric: true,
      hasTranslationLyric: true,
      hasPronunciationLyric: true,
    })
    expect(visibleOnCurrentTrack.karaokeVisible).toBe(true)
    // Same preference, but the track has no lyrics: overlay must be hidden.
    const hiddenForTrackWithoutLyrics = resolveLyricsOverlayState({
      karaokeVisiblePreference: true,
      translationPreference: false,
      pronunciationPreference: null,
      hasKaraokeLyric: false,
      hasTranslationLyric: false,
      hasPronunciationLyric: false,
    })
    expect(hiddenForTrackWithoutLyrics.karaokeVisible).toBe(false)
    // Lyrics are available again: the unchanged preference shows the overlay.
    const restoredOnNextLyricsTrack = resolveLyricsOverlayState({
      karaokeVisiblePreference: true,
      translationPreference: false,
      pronunciationPreference: null,
      hasKaraokeLyric: true,
      hasTranslationLyric: false,
      hasPronunciationLyric: false,
    })
    expect(restoredOnNextLyricsTrack.karaokeVisible).toBe(true)
  })
  it('restores translation and pronunciation preferences after tracks without those layers', () => {
    // With a null pronunciation preference and an available pronunciation
    // layer, the assertions below expect pronunciation to be shown.
    const initialState = resolveLyricsOverlayState({
      karaokeVisiblePreference: false,
      translationPreference: false,
      pronunciationPreference: null,
      hasKaraokeLyric: true,
      hasTranslationLyric: true,
      hasPronunciationLyric: true,
    })
    expect(initialState.showTranslation).toBe(false)
    expect(initialState.showPronunciation).toBe(true)
    // Simulate the user turning translation on and pronunciation off.
    const translationPreference = true
    const pronunciationPreference = togglePronunciationPreference(null, true)
    expect(pronunciationPreference).toBe(false)
    const hiddenOnTrackWithoutAuxLayers = resolveLyricsOverlayState({
      karaokeVisiblePreference: false,
      translationPreference,
      pronunciationPreference,
      hasKaraokeLyric: true,
      hasTranslationLyric: false,
      hasPronunciationLyric: false,
    })
    expect(hiddenOnTrackWithoutAuxLayers.showTranslation).toBe(false)
    expect(hiddenOnTrackWithoutAuxLayers.showPronunciation).toBe(false)
    // Once both layers exist again, the stored preferences apply unchanged.
    const restoredOnTrackWithAuxLayers = resolveLyricsOverlayState({
      karaokeVisiblePreference: false,
      translationPreference,
      pronunciationPreference,
      hasKaraokeLyric: true,
      hasTranslationLyric: true,
      hasPronunciationLyric: true,
    })
    expect(restoredOnTrackWithAuxLayers.showTranslation).toBe(true)
    expect(restoredOnTrackWithAuxLayers.showPronunciation).toBe(false)
  })
 })
--- a/ui/src/audioplayer/PlayerToolbar.jsx
+++ b/ui/src/audioplayer/PlayerToolbar.jsx
@ -4,7 +4,9 @@ import { useGetOne } from 'react-admin'
 import { GlobalHotKeys } from 'react-hotkeys'
 import IconButton from '@material-ui/core/IconButton'
 import { useMediaQuery } from '@material-ui/core'
 import Tooltip from '@material-ui/core/Tooltip'
 import { RiSaveLine } from 'react-icons/ri'
 import { RiFileMusicLine } from 'react-icons/ri'
 import { LoveButton, useToggleLove } from '../common'
 import { openSaveQueueDialog } from '../actions'
 import { keyMap } from '../hotkeys'
@ -55,7 +57,13 @@ const useStyles = makeStyles((theme) => ({
  },
 }))
-const PlayerToolbar = ({ id, isRadio }) => {
+const PlayerToolbar = ({
  id,
  isRadio,
  onToggleLyrics,
  lyricsActive = false,
  lyricsDisabled = false,
 }) => {
  const dispatch = useDispatch()
  const { data, loading } = useGetOne('song', id, { enabled: !!id && !isRadio })
  const [toggleLove, toggling] = useToggleLove('song', data)
@ -99,6 +107,25 @@ const PlayerToolbar = ({ id, isRadio }) => {
    />
  )
  const toggleLyricsButton = (
    <Tooltip title="Toggle lyrics">
      <span>
        <IconButton
          size={isDesktop ? 'small' : undefined}
          onClick={onToggleLyrics}
          disabled={!onToggleLyrics || lyricsDisabled}
          data-testid="toggle-lyrics-button"
          className={buttonClass}
          color={lyricsActive ? 'primary' : 'default'}
        >
          <RiFileMusicLine
            className={!isDesktop ? classes.mobileIcon : undefined}
          />
        </IconButton>
      </span>
    </Tooltip>
  )
  return (
    <>
      <GlobalHotKeys keyMap={keyMap} handlers={handlers} allowChanges />
@ -106,11 +133,13 @@ const PlayerToolbar = ({ id, isRadio }) => {
        <li className={`${listItemClass} item`}>
          {saveQueueButton}
          {loveButton}
          {toggleLyricsButton}
        </li>
      ) : (
        <>
          <li className={`${listItemClass} item`}>{saveQueueButton}</li>
          <li className={`${listItemClass} item`}>{loveButton}</li>
          <li className={`${listItemClass} item`}>{toggleLyricsButton}</li>
        </>
      )}
    </>
--- a/ui/src/audioplayer/PlayerToolbar.test.jsx
+++ b/ui/src/audioplayer/PlayerToolbar.test.jsx
@ -71,6 +71,7 @@ describe('<PlayerToolbar />', () => {
      // Verify all three buttons are rendered
      expect(screen.getByTestId('save-queue-button')).toBeInTheDocument()
      expect(screen.getByTestId('love-button')).toBeInTheDocument()
      expect(screen.getByTestId('toggle-lyrics-button')).toBeInTheDocument()
      // Verify desktop classes are applied
      expect(listItems[0].className).toContain('toolbar')
@ -102,6 +103,14 @@ describe('<PlayerToolbar />', () => {
        type: 'OPEN_SAVE_QUEUE_DIALOG',
      })
    })
    // Clicking the lyrics button must invoke the onToggleLyrics prop once.
    it('triggers lyric toggle callback when lyrics button is clicked', () => {
      const onToggleLyrics = vi.fn()
      render(<PlayerToolbar id="song-1" onToggleLyrics={onToggleLyrics} />)
      fireEvent.click(screen.getByTestId('toggle-lyrics-button'))
      expect(onToggleLyrics).toHaveBeenCalledTimes(1)
    })
  })
  describe('Mobile layout', () => {
@ -114,11 +123,12 @@ describe('<PlayerToolbar />', () => {
      // Each button should be in its own list item
      const listItems = screen.getAllByRole('listitem')
-      expect(listItems).toHaveLength(2)
+      expect(listItems).toHaveLength(3)
      // Verify all three buttons are rendered
      expect(screen.getByTestId('save-queue-button')).toBeInTheDocument()
      expect(screen.getByTestId('love-button')).toBeInTheDocument()
      expect(screen.getByTestId('toggle-lyrics-button')).toBeInTheDocument()
      // Verify mobile classes are applied
      expect(listItems[0].className).toContain('mobileListItem')
@ -140,6 +150,13 @@ describe('<PlayerToolbar />', () => {
      const loveButton = screen.getByTestId('love-button')
      expect(loveButton).toBeDisabled()
    })
    // With lyricsDisabled set, the lyrics button must render as disabled.
    it('disables lyrics button when lyrics are unavailable', () => {
      render(<PlayerToolbar id="song-1" lyricsDisabled={true} />)
      const lyricsButton = screen.getByTestId('toggle-lyrics-button')
      expect(lyricsButton).toBeDisabled()
    })
  })
  describe('Common behavior', () => {
--- a/ui/src/audioplayer/lyrics.js
+++ b/ui/src/audioplayer/lyrics.js
@ -0,0 +1,725 @@
 // Normalizes a language tag for comparison: lower-cases it and converts
 // every underscore separator to a hyphen (`zh_Hant_TW` -> `zh-hant-tw`).
 // A nullish or empty `language` yields ''.
 // A global regex is required here: String.prototype.replace with a string
 // pattern would only replace the first underscore.
 const normalizeLanguageTag = (language) =>
  (language || '').toLowerCase().replace(/_/g, '-')
 // Tolerance in milliseconds (about three frames at 60fps); keeps line/token
 // switching stable near tight boundaries.
 const KARAOKE_SWITCH_EPSILON_MS = 50
 // Lyric layer kinds; any other `kind` value is treated as the main layer.
 const LYRIC_KIND_MAIN = 'main'
 const LYRIC_KIND_TRANSLATION = 'translation'
 const LYRIC_KIND_PRONUNCIATION = 'pronunciation'
 // Stringifies `value` and left-pads it with a single '0' when the result is
 // exactly one character long; any other length is returned untouched.
 const padTime = (value) => {
  const text = value.toString()
  if (text.length !== 1) {
    return text
  }
  return `0${text}`
 }
 // Converts `value` to a finite number. Returns null for null, undefined,
 // the empty string, and anything that does not convert to a finite number.
 const toTime = (value) => {
  const isBlank = value == null || value === ''
  if (isBlank) {
    return null
  }
  const parsed = Number(value)
  if (!Number.isFinite(parsed)) {
    return null
  }
  return parsed
 }
 // Converts `value` to a non-negative integer byte offset. Returns null for
 // null, undefined, '', non-integers, and negative numbers.
 const toByteOffset = (value) => {
  if (value === '' || value == null) {
    return null
  }
  const parsed = Number(value)
  const isValidOffset = Number.isInteger(parsed) && parsed >= 0
  return isValidOffset ? parsed : null
 }
 // Sort comparator for times that may be null/undefined: missing values sort
 // after present ones, two missing values compare equal, and two present
 // values compare by numeric difference.
 const compareNullableTime = (a, b) => {
  const aMissing = a == null
  const bMissing = b == null
  if (aMissing || bMissing) {
    if (aMissing && bMissing) {
      return 0
    }
    return aMissing ? 1 : -1
  }
  return a - b
 }
 // Returns shallow copies of `tokens` ordered by start time, then end time,
 // then original position. Missing times sort last (see compareNullableTime).
 // The temporary `order` field is stripped from the returned copies.
 const sortTokensByStart = (tokens) => {
  const decorated = tokens.map((token, order) => ({ ...token, order }))
  decorated.sort((left, right) => {
    const deltas = [
      compareNullableTime(left.start, right.start),
      compareNullableTime(left.end, right.end),
    ]
    const firstDifference = deltas.find((delta) => delta !== 0)
    return firstDifference === undefined
      ? left.order - right.order
      : firstDifference
  })
  return decorated.map((entry) => {
    const { order, ...token } = entry
    return token
  })
 }
 // True when two (already normalized) language tags are equal or one is a
 // hyphen-delimited extension of the other, e.g. 'en' and 'en-us'.
 // Returns false when either tag is empty or missing.
 const languageMatch = (candidate, preferred) => {
  const bothPresent = Boolean(candidate) && Boolean(preferred)
  if (!bothPresent) {
    return false
  }
  if (candidate === preferred) {
    return true
  }
  return (
    candidate.startsWith(`${preferred}-`) ||
    preferred.startsWith(`${candidate}-`)
  )
 }
 // Truthy when `lyric` is flagged as synced and at least one of its lines has
 // a `start` that converts to a finite number. Like the `&&` chain it
 // replaces, it returns the first falsy operand as-is rather than a boolean.
 const hasTimedLines = (lyric) => {
  if (!lyric) {
    return lyric
  }
  if (!lyric.synced) {
    return lyric.synced
  }
  if (!Array.isArray(lyric.line)) {
    return false
  }
  return lyric.line.some((line) => Number.isFinite(Number(line.start)))
 }
 // Narrows `lyrics` to the entries with timed lines; when none qualify, the
 // original array is returned unchanged.
 const preferTimedLyrics = (lyrics) => {
  const timedOnly = lyrics.filter(hasTimedLines)
  if (timedOnly.length === 0) {
    return lyrics
  }
  return timedOnly
 }
 // Normalizes one raw cue token into { start, end, value } plus optional
 // byteStart/byteEnd (included only when they are valid byte offsets).
 // Returns null for a missing token or one without a non-empty string value.
 const normalizeToken = (token) => {
  if (!token) {
    return null
  }
  const text = typeof token.value === 'string' ? token.value : ''
  if (text.length === 0) {
    return null
  }
  const byteStart = toByteOffset(token.byteStart)
  const byteEnd = toByteOffset(token.byteEnd)
  const normalized = {
    start: toTime(token.start),
    end: toTime(token.end),
    value: text,
  }
  if (byteStart != null) {
    normalized.byteStart = byteStart
  }
  if (byteEnd != null) {
    normalized.byteEnd = byteEnd
  }
  return normalized
 }
 // Number of bytes (1-4) the given Unicode code point occupies in UTF-8.
 const utf8BytesForCodePoint = (codePoint) => {
  // Start from the widest encoding and narrow down for each smaller range.
  let byteCount = 4
  if (codePoint <= 0xffff) {
    byteCount = 3
  }
  if (codePoint <= 0x7ff) {
    byteCount = 2
  }
  if (codePoint <= 0x7f) {
    byteCount = 1
  }
  return byteCount
 }
 // Maps a UTF-8 byte offset within `text` to the matching JavaScript string
 // index (UTF-16 code units). Returns 0 for non-string/empty text or an
 // invalid/non-positive offset, and text.length when the offset lies past the
 // end. An offset that falls inside a multi-byte character resolves to the
 // index just after that character.
 export const utf8ByteOffsetToCodeUnitIndex = (text, targetByteOffset) => {
  const isUsableText = typeof text === 'string' && text.length > 0
  if (!isUsableText) {
    return 0
  }
  const target = toByteOffset(targetByteOffset)
  if (target == null || target <= 0) {
    return 0
  }
  let consumedBytes = 0
  for (let index = 0; index < text.length; ) {
    if (consumedBytes >= target) {
      return index
    }
    const codePoint = text.codePointAt(index)
    consumedBytes += utf8BytesForCodePoint(codePoint)
    // Astral code points take two UTF-16 code units (a surrogate pair).
    index += codePoint > 0xffff ? 2 : 1
  }
  return text.length
 }
 // Converts a UTF-8 byte range into a string index range. `byteEnd` is
 // treated as inclusive (the end index is resolved from byteEnd + 1).
 // Returns { start, end, text } where `text` is text.slice(start, end), or
 // null when the text is not a string, the offsets are invalid or reversed,
 // or the resulting slice would be empty.
 export const utf8ByteRangeToCodeUnitRange = (text, byteStart, byteEnd) => {
  if (typeof text !== 'string') {
    return null
  }
  const firstByte = toByteOffset(byteStart)
  const lastByte = toByteOffset(byteEnd)
  const offsetsValid =
    firstByte != null && lastByte != null && lastByte >= firstByte
  if (!offsetsValid) {
    return null
  }
  const startIndex = utf8ByteOffsetToCodeUnitIndex(text, firstByte)
  const endIndex = utf8ByteOffsetToCodeUnitIndex(text, lastByte + 1)
  const withinText = startIndex <= text.length && endIndex <= text.length
  if (startIndex >= endIndex || !withinText) {
    return null
  }
  return {
    start: startIndex,
    end: endIndex,
    text: text.slice(startIndex, endIndex),
  }
 }
 // Builds a Map from agent id to { id, role, name } out of
 // structuredLyric.agents. Entries without a non-empty string id are skipped,
 // the first occurrence of an id wins, and non-string role/name become ''.
 const buildAgentLookup = (structuredLyric) => {
  const declaredAgents = structuredLyric?.agents
  const agents = Array.isArray(declaredAgents) ? declaredAgents : []
  const textOrEmpty = (candidate) =>
    typeof candidate === 'string' ? candidate : ''
  return agents.reduce((lookup, agent) => {
    const id = textOrEmpty(agent?.id)
    if (id && !lookup.has(id)) {
      lookup.set(id, {
        id,
        role: textOrEmpty(agent?.role),
        name: textOrEmpty(agent?.name),
      })
    }
    return lookup
  }, new Map())
 }
 // Role label for an agent: '' when the agent has no role or its role is
 // 'main', otherwise the agent's role unchanged.
 const deriveUiRole = (agent) => {
  const role = agent?.role
  const isSecondaryRole = Boolean(role) && role !== 'main'
  return isSecondaryRole ? role : ''
 }
 // Normalizes one raw cue line. `index` falls back to `fallbackIndex` when the
 // cue line's own index does not convert to a finite number. Agent details
 // are resolved through `agentLookup` by agentId; when no agent is found the
 // cue line's own `role` string is used for both `role` and `agentRole`.
 // Tokens are normalized, filtered of empties, and sorted by start time.
 const normalizeCueLine = (cueLine, fallbackIndex, agentLookup) => {
  const textOrEmpty = (candidate) =>
    typeof candidate === 'string' ? candidate : ''
  const hasOwnIndex = Number.isFinite(Number(cueLine?.index))
  const agentId = textOrEmpty(cueLine?.agentId)
  const agent = agentId ? agentLookup.get(agentId) || null : null
  const fallbackRole = textOrEmpty(cueLine?.role)
  const rawCues = Array.isArray(cueLine?.cue) ? cueLine.cue : []
  const tokens = sortTokensByStart(rawCues.map(normalizeToken).filter(Boolean))
  return {
    index: hasOwnIndex ? Number(cueLine.index) : fallbackIndex,
    start: toTime(cueLine?.start),
    end: toTime(cueLine?.end),
    value: textOrEmpty(cueLine?.value),
    role: agent ? deriveUiRole(agent) : fallbackRole,
    agentId,
    agentRole: agent?.role || fallbackRole,
    agentName: agent?.name || '',
    tokens,
  }
 }
 // Maps a raw `kind` value to one of the LYRIC_KIND_* constants. Matching is
 // case-insensitive and ignores surrounding whitespace; anything that is not
 // translation or pronunciation is classified as main.
 const normalizeLyricKind = (kind) => {
  const normalized = (kind || '').toLowerCase().trim()
  const isAuxiliaryKind =
    normalized === LYRIC_KIND_TRANSLATION ||
    normalized === LYRIC_KIND_PRONUNCIATION
  return isAuxiliaryKind ? normalized : LYRIC_KIND_MAIN
 }
 // Picks one lyric from `lyrics` by language, trying in order: the preferred
 // tag, its base language (text before the first '-'), then 'en'. Falls back
 // to the first lyric. Returns null for a non-array or empty list.
 const pickLyricByLanguage = (lyrics, preferredLanguage) => {
  if (!Array.isArray(lyrics) || lyrics.length === 0) {
    return null
  }
  const preferred = normalizeLanguageTag(preferredLanguage)
  const tagsByPriority = [preferred, preferred.split('-')[0], 'en']
  for (const tag of tagsByPriority) {
    const match = lyrics.find((lyric) =>
      languageMatch(normalizeLanguageTag(lyric.lang), tag),
    )
    if (match) {
      return match
    }
  }
  return lyrics[0]
 }
 // Returns { start, end } for lines[index]. When the line has no usable end,
 // the next line's start is used instead. Both are null if the line is absent.
 const lineTimeWindow = (lines, index) => {
  const current = lines[index]
  if (current) {
    return {
      start: toTime(current.start),
      end: toTime(current.end) ?? toTime(lines[index + 1]?.start),
    }
  }
  return { start: null, end: null }
 }
 // True when the structured lyric has at least one cue line containing a cue
 // whose `start` converts to a finite number.
 export const hasCueTiming = (structuredLyric) => {
  if (!structuredLyric || !Array.isArray(structuredLyric.cueLine)) {
    return false
  }
  const cueIsTimed = (cue) => Number.isFinite(Number(cue?.start))
  return structuredLyric.cueLine.some(
    (cueLine) =>
      Array.isArray(cueLine?.cue) && cueLine.cue.some((cue) => cueIsTimed(cue)),
  )
 }
 // True when the structured lyric has something to display: at least one line
 // with non-blank string text, or cue timing (see hasCueTiming).
 export const hasStructuredLyricContent = (structuredLyric) => {
  if (!structuredLyric) {
    return false
  }
  const lineHasText = (line) =>
    typeof line?.value === 'string' && line.value.trim() !== ''
  const hasNonBlankLine =
    Array.isArray(structuredLyric.line) &&
    structuredLyric.line.some((line) => lineHasText(line))
  return hasNonBlankLine || hasCueTiming(structuredLyric)
 }
 // Resolves the language used to pick lyrics, in priority order:
 //   1. the 'locale' entry in localStorage, when present and non-empty;
 //   2. navigator.language;
 //   3. 'en'.
 // Storage access is wrapped in try/catch because reading window.localStorage
 // can throw (e.g. a SecurityError when storage is blocked); in that case the
 // function falls through to the next source instead of propagating.
 export const getPreferredLyricLanguage = () => {
  try {
    if (typeof window !== 'undefined' && window.localStorage) {
      const stored = window.localStorage.getItem('locale')
      if (stored) {
        return stored
      }
    }
  } catch (error) {
    // Storage unavailable: fall back to the browser language below.
  }
  if (typeof navigator !== 'undefined' && navigator.language) {
    return navigator.language
  }
  return 'en'
 }
 // Chooses one lyric per layer (main, translation, pronunciation) from a list
 // of structured lyrics. Lyrics without displayable content are ignored. Each
 // layer prefers timed lyrics, then the best language match. When no lyric is
 // classified as main, every lyric with content is a candidate for the main
 // layer. All three slots are null for a non-array or content-free input.
 export const selectLyricLayers = (structuredLyrics, preferredLanguage) => {
  const noLayers = () => ({
    main: null,
    translation: null,
    pronunciation: null,
  })
  if (!Array.isArray(structuredLyrics)) {
    return noLayers()
  }
  const available = structuredLyrics.filter(hasStructuredLyricContent)
  if (available.length === 0) {
    return noLayers()
  }
  const byKind = new Map([
    [LYRIC_KIND_MAIN, []],
    [LYRIC_KIND_TRANSLATION, []],
    [LYRIC_KIND_PRONUNCIATION, []],
  ])
  available.forEach((lyric) => {
    byKind.get(normalizeLyricKind(lyric?.kind)).push(lyric)
  })
  const pickBest = (candidates) =>
    pickLyricByLanguage(preferTimedLyrics(candidates), preferredLanguage)
  const mains = byKind.get(LYRIC_KIND_MAIN)
  return {
    main: pickBest(mains.length ? mains : available),
    translation: pickBest(byKind.get(LYRIC_KIND_TRANSLATION)),
    pronunciation: pickBest(byKind.get(LYRIC_KIND_PRONUNCIATION)),
  }
 }
 // Convenience wrapper: returns only the main layer chosen by
 // selectLyricLayers (null when there is none).
 export const pickStructuredLyric = (structuredLyrics, preferredLanguage) => {
  const { main } = selectLyricLayers(structuredLyrics, preferredLanguage)
  return main
 }
 // Serializes a structured lyric's lines to LRC text, one
 // `[mm:ss.cc] text` line per entry (`start` is read as milliseconds and
 // truncated to centiseconds). Lines whose start is not a finite,
 // non-negative number are skipped. Returns '' when there is no line array.
 //
 // Minutes are NOT wrapped at 60: a line at 61 minutes is written as
 // `[61:00.00]`. Wrapping would send timestamps past the first hour back to
 // `[00:..]` and break the ordering of lines in long tracks.
 export const structuredLyricToLrc = (structuredLyric) => {
  if (!structuredLyric || !Array.isArray(structuredLyric.line)) {
    return ''
  }
  let lyricText = ''
  for (const line of structuredLyric.line) {
    const start = Number(line.start)
    if (!Number.isFinite(start) || start < 0) {
      continue
    }
    const totalCentiseconds = Math.floor(start / 10)
    const centiseconds = totalCentiseconds % 100
    const totalSeconds = Math.floor(totalCentiseconds / 100)
    const sec = totalSeconds % 60
    // Total minutes, deliberately unbounded (see note above).
    const min = Math.floor(totalSeconds / 60)
    lyricText += `[${padTime(min)}:${padTime(sec)}.${padTime(centiseconds)}] ${line.value || ''}\n`
  }
  return lyricText
 }
 // Picks the main structured lyric for the preferred language and serializes
 // it to LRC text; returns '' when no lyric can be selected.
 export const structuredLyricsToLrc = (structuredLyrics, preferredLanguage) => {
  const selected = pickStructuredLyric(structuredLyrics, preferredLanguage)
  return selected ? structuredLyricToLrc(selected) : ''
 }
 // Converts plain lyric lines into karaoke line objects that carry no tokens:
 // { index, start, end, value, tokens: [] }. `index` is the array position and
 // a non-string value becomes ''.
 const buildBaseKaraokeLines = (baseLines) => {
  const toKaraokeLine = (line, index) => {
    const { start, end, value } = line
    return {
      index,
      start: toTime(start),
      end: toTime(end),
      value: typeof value === 'string' ? value : '',
      tokens: [],
    }
  }
  return baseLines.map(toKaraokeLine)
 }
 // Builds karaoke lines from raw cue lines.
 //   rawCueLines: cue-line objects; each is normalized via normalizeCueLine.
 //   baseLines:   plain lyric lines, used as a fallback by line index.
 //   agentLookup: Map of agent id -> agent details (see buildAgentLookup).
 // Cue lines sharing the same index are merged into a single karaoke line.
 // Returns one { index, start, end, value, agentId, agentName, agentRole,
 // tokens } object per distinct index, in order of first appearance.
 export const buildKaraokeLinesFromCueLines = (
  rawCueLines,
  baseLines,
  agentLookup,
 ) => {
  // Normalize every cue line and stamp its role/agent info onto each token so
  // that information survives the merge below.
  const normalizedCueLines = rawCueLines.map((cueLine, fallbackIndex) => {
    const normalized = normalizeCueLine(cueLine, fallbackIndex, agentLookup)
    return {
      ...normalized,
      tokens: normalized.tokens.map((token) => ({
        ...token,
        role: normalized.role,
        agentId: normalized.agentId,
        agentName: normalized.agentName,
        agentRole: normalized.agentRole,
      })),
    }
  })
  // Group cue lines by line index; Map keeps first-seen index order.
  const byIndex = new Map()
  for (const cueLine of normalizedCueLines) {
    if (!byIndex.has(cueLine.index)) {
      byIndex.set(cueLine.index, [])
    }
    byIndex.get(cueLine.index).push(cueLine)
  }
  return Array.from(byIndex.entries()).map(([index, group]) => {
    // The first cue line of a group supplies the line-level fields.
    const first = group[0]
    const baseLine = baseLines[index] || {}
    const tokens = sortTokensByStart(group.flatMap((cueLine) => cueLine.tokens))
    // Last-resort timing: earliest token start / latest-sorted token end.
    const fallbackStart =
      tokens.find((token) => token.start != null)?.start ?? null
    const fallbackEnd =
      [...tokens].reverse().find((token) => token.end != null)?.end ?? null
    // Text priority: cue line value, then base line value, then joined tokens.
    const value =
      first.value ||
      (typeof baseLine.value === 'string' ? baseLine.value : '') ||
      tokens.map((token) => token.value).join('')
    return {
      index,
      start: first.start ?? toTime(baseLine.start) ?? fallbackStart,
      end: first.end ?? toTime(baseLine.end) ?? fallbackEnd,
      value,
      agentId: first.agentId,
      agentName: first.agentName,
      agentRole: first.agentRole,
      tokens,
    }
  })
 }
 // Turns one structured lyric track into display-ordered karaoke lines.
 //
 // Uses the cueLine payload when it is a non-empty array, otherwise the plain
 // `line` entries. Lines with neither text nor tokens are dropped. The result
 // is sorted by start time (ties broken by index); lines without a start go
 // last, ordered by index. A line lacking an end inherits the start of the
 // line that follows it, when that start is known.
 // Returns [] for a missing track.
 export const buildKaraokeLines = (structuredLyric) => {
  if (!structuredLyric) {
    return []
  }
  const agentLookup = buildAgentLookup(structuredLyric)
  const { line, cueLine } = structuredLyric
  const baseLines = Array.isArray(line) ? line : []
  const rawCueLines = Array.isArray(cueLine) ? cueLine : []
  const built =
    rawCueLines.length > 0
      ? buildKaraokeLinesFromCueLines(rawCueLines, baseLines, agentLookup)
      : buildBaseKaraokeLines(baseLines)
  const compareLines = (a, b) => {
    const aUntimed = a.start == null
    const bUntimed = b.start == null
    if (aUntimed || bUntimed) {
      if (aUntimed && bUntimed) {
        return a.index - b.index
      }
      // Exactly one side is untimed: it sorts after the timed one.
      return aUntimed ? 1 : -1
    }
    return a.start !== b.start ? a.start - b.start : a.index - b.index
  }
  const ordered = built
    .filter((entry) => entry.value || entry.tokens.length > 0)
    .sort(compareLines)
  ordered.forEach((entry, position) => {
    if (entry.end != null) {
      return
    }
    const nextStart = ordered[position + 1]?.start
    if (nextStart != null) {
      entry.end = nextStart
    }
  })
  return ordered
 }
 // Resolves the { start, end } time window of one token of a karaoke line.
 //
 // Parameters:
 //   line            - karaoke line; its `tokens`, `start` and `end` are read.
 //   tokenIndex      - position of the token inside line.tokens.
 //   lineEndFallback - used as the line end when line.end is not set.
 //
 // Resolution order:
 //   1. No token at tokenIndex -> { start: null, end: null }.
 //   2. The line has a positive-length window and the explicit token times are
 //      "collapsed" (more than one explicit value but at most
 //      max(1, tokenCount / 4) distinct ones): the window is split into equal
 //      slices and the slice for tokenIndex is returned.
 //   3. Otherwise explicit token times win. A missing start falls back to the
 //      previous token's end/start, then the slice estimate, then the line
 //      start. A missing end falls back to the next token's start, then the
 //      slice estimate, then the line end.
 //   4. A lone token whose window is missing or at most 1 unit long is
 //      stretched over the whole line window.
 //   5. end is never returned smaller than start.
 export const resolveKaraokeTokenWindow = (
  line,
  tokenIndex,
  lineEndFallback = null,
 ) => {
  const tokens = Array.isArray(line?.tokens) ? line.tokens : []
  const token = tokens[tokenIndex]
  if (!token) {
    return { start: null, end: null }
  }
  const tokenCount = tokens.length
  const lineStart = toTime(line?.start)
  const lineEnd = toTime(line?.end) ?? toTime(lineEndFallback)
  const hasLineWindow =
    lineStart != null &&
    lineEnd != null &&
    Number.isFinite(lineStart) &&
    Number.isFinite(lineEnd) &&
    lineEnd > lineStart
  // Boundary between equal slices of the line window; null without a window.
  const sliceBoundary = (slot) =>
    hasLineWindow && tokenCount > 0
      ? lineStart + ((lineEnd - lineStart) * slot) / tokenCount
      : null
  const estimatedStart = sliceBoundary(tokenIndex)
  const estimatedEnd = sliceBoundary(tokenIndex + 1)

  // Explicit times are considered collapsed when several tokens carry a time
  // but almost all of them share the same value.
  const isCollapsed = (times) =>
    times.length > 1 && new Set(times).size <= Math.max(1, tokenCount / 4)
  const explicitStarts = Array.from(tokens, (entry) =>
    toTime(entry?.start),
  ).filter((time) => time != null)
  const explicitEnds = Array.from(tokens, (entry) =>
    toTime(entry?.end),
  ).filter((time) => time != null)
  if (
    hasLineWindow &&
    tokenCount > 1 &&
    (isCollapsed(explicitStarts) || isCollapsed(explicitEnds))
  ) {
    return { start: estimatedStart, end: estimatedEnd }
  }

  const prevToken = tokenIndex > 0 ? tokens[tokenIndex - 1] : null
  const nextToken = tokenIndex + 1 < tokenCount ? tokens[tokenIndex + 1] : null
  let start =
    toTime(token.start) ??
    toTime(prevToken?.end) ??
    toTime(prevToken?.start) ??
    estimatedStart ??
    lineStart
  let end =
    toTime(token.end) ?? toTime(nextToken?.start) ?? estimatedEnd ?? lineEnd

  const degenerate = start == null || end == null || end <= start + 1
  if (tokenCount === 1 && hasLineWindow && degenerate) {
    start = lineStart
    end = lineEnd
  }
  if (start != null && end != null && end < start) {
    end = start
  }
  return { start, end }
 }
 // Finds the line and token that are active at currentTimeMs.
 //
 // Returns { lineIndex, tokenIndex }. Both are -1 when `lines` is not a
 // non-empty array; tokenIndex is -1 when the active line has no token that
 // has started yet. A non-finite currentTimeMs is treated as 0. All
 // comparisons are widened by KARAOKE_SWITCH_EPSILON_MS.
 export const getActiveKaraokeState = (lines, currentTimeMs) => {
  if (!Array.isArray(lines) || lines.length === 0) {
    return { lineIndex: -1, tokenIndex: -1 }
  }
  const parsedTime = Number(currentTimeMs)
  const current = Number.isFinite(parsedTime) ? parsedTime : 0
  const horizon = current + KARAOKE_SWITCH_EPSILON_MS

  // Pass 1: advance to the last line of the leading run of lines that are
  // untimed or have already started.
  let lineIndex = 0
  for (let i = 0; i < lines.length; i += 1) {
    const lineStart = toTime(lines[i]?.start)
    if (lineStart != null && !(lineStart <= horizon)) {
      break
    }
    lineIndex = i
  }

  // Pass 2: walk backwards to the nearest line whose window still contains
  // the current time; keep the pass-1 result when none does.
  for (let i = lineIndex; i >= 0; i -= 1) {
    const lineStart = toTime(lines[i]?.start)
    const lineEnd = toTime(lines[i]?.end) ?? toTime(lines[i + 1]?.start)
    const notStarted = lineStart != null && horizon < lineStart
    const stillOpen =
      lineEnd == null || current <= lineEnd + KARAOKE_SWITCH_EPSILON_MS
    if (!notStarted && stillOpen) {
      lineIndex = i
      break
    }
  }

  const activeLine = lines[lineIndex] || null
  const tokens = Array.isArray(activeLine?.tokens) ? activeLine.tokens : []
  let tokenIndex = -1
  for (let i = 0; i < tokens.length; i += 1) {
    const tokenWindow = resolveKaraokeTokenWindow(
      activeLine,
      i,
      lines[lineIndex + 1]?.start,
    )
    if (tokenWindow.start != null && !(tokenWindow.start <= horizon)) {
      break
    }
    tokenIndex = i
    // Stop on the first started token that has not finished yet.
    if (
      tokenWindow.end != null &&
      current <= tokenWindow.end + KARAOKE_SWITCH_EPSILON_MS
    ) {
      break
    }
  }
  return { lineIndex, tokenIndex }
 }
 // Reports whether any line has a start time, or any token of any line has a
 // start or end time (as parsed by toTime). Returns false for non-arrays.
 export const hasUsableKaraokeTiming = (lines) => {
  if (!Array.isArray(lines)) {
    return false
  }
  const tokenIsTimed = (token) =>
    toTime(token?.start) != null || toTime(token?.end) != null
  return lines.some((line) => {
    if (toTime(line?.start) != null) {
      return true
    }
    return (
      Array.isArray(line?.tokens) &&
      line.tokens.some((token) => tokenIsTimed(token))
    )
  })
 }
 // Finds the position in layerLines of the line that best matches
 // mainLines[mainIndex] in time. Returns -1 when the inputs are unusable, the
 // main line has no start, or no layer line qualifies.
 //
 // Time windows come from lineTimeWindow. Each layer line with a start is
 // scored as |start - mainStart| plus an index-distance penalty; the lowest
 // score wins and the earliest line wins ties:
 //   - lines whose window overlaps (or touches) the main window always
 //     qualify and pay 30 per index step;
 //   - other lines qualify only when their start lies within maxDelta of the
 //     main start, and pay 45 per index step.
 // maxDelta is the main window duration + 420, clamped to [550, 1400].
 export const findLayerLineIndexForMain = (mainLines, layerLines, mainIndex) => {
  if (!Array.isArray(mainLines) || !Array.isArray(layerLines)) {
    return -1
  }
  if (mainLines.length === 0 || layerLines.length === 0) {
    return -1
  }
  if (mainIndex < 0 || mainIndex >= mainLines.length) {
    return -1
  }
  const mainWindow = lineTimeWindow(mainLines, mainIndex)
  const mainStart = mainWindow.start
  const mainEnd = mainWindow.end
  if (mainStart == null) {
    return -1
  }
  const mainWindowDuration = Math.max(0, (mainEnd ?? mainStart) - mainStart)
  const maxDelta = Math.max(550, Math.min(1400, mainWindowDuration + 420))
  let bestIdx = -1
  let bestScore = Number.POSITIVE_INFINITY
  layerLines.forEach((_layerLine, i) => {
    const { start, end } = lineTimeWindow(layerLines, i)
    if (start == null) {
      return
    }
    const startDelta = Math.abs(start - mainStart)
    const overlaps =
      end != null &&
      Math.min(end, mainEnd ?? end) - Math.max(start, mainStart) >= 0
    if (!overlaps && startDelta > maxDelta) {
      return
    }
    const indexWeight = overlaps ? 30 : 45
    const score = startDelta + Math.abs(i - mainIndex) * indexWeight
    if (score < bestScore) {
      bestScore = score
      bestIdx = i
    }
  })
  return bestIdx
 }
 // Wraps findLayerLineIndexForMain and returns both the matched position and
 // the matched layer line; `line` is null when the position is negative.
 export const resolveLayerLineForMain = (mainLines, layerLines, mainIndex) => {
  const matchedIndex = findLayerLineIndexForMain(
    mainLines,
    layerLines,
    mainIndex,
  )
  const matchedLine = matchedIndex >= 0 ? layerLines[matchedIndex] : null
  return { index: matchedIndex, line: matchedLine }
 }
 // Returns the given main line unchanged (same object reference); no
 // synthetic tokens are added.
 export const buildHighlightedMainLine = (line) => {
  return line
 }
 // Returns the auxiliary (translation/pronunciation) line unchanged, or null
 // when it is null/undefined. The reference line is accepted but not used.
 export const buildHighlightedAuxLine = (_referenceLine, auxiliaryLine) => {
  if (auxiliaryLine == null) {
    return null
  }
  return auxiliaryLine
 }
--- a/ui/src/audioplayer/lyrics.test.js
+++ b/ui/src/audioplayer/lyrics.test.js
@ -0,0 +1,786 @@
 import {
  buildHighlightedAuxLine,
  buildHighlightedMainLine,
  buildKaraokeLines,
  buildKaraokeLinesFromCueLines,
  findLayerLineIndexForMain,
  getActiveKaraokeState,
  getPreferredLyricLanguage,
  hasUsableKaraokeTiming,
  hasStructuredLyricContent,
  pickStructuredLyric,
  resolveKaraokeTokenWindow,
  resolveLayerLineForMain,
  selectLyricLayers,
  structuredLyricsToLrc,
  structuredLyricToLrc,
  utf8ByteOffsetToCodeUnitIndex,
  utf8ByteRangeToCodeUnitRange,
 } from './lyrics'
 describe('lyrics helpers', () => {
  beforeEach(() => {
    localStorage.clear()
  })
  it('prefers a lyric track that matches the locale', () => {
    const selected = pickStructuredLyric(
      [
        {
          lang: 'eng',
          synced: true,
          line: [{ start: 1000, value: 'English line' }],
        },
        {
          lang: 'pt-BR',
          synced: true,
          line: [{ start: 1000, value: 'Linha em portugues' }],
        },
      ],
      'pt-BR',
    )
    expect(selected.lang).toBe('pt-BR')
  })
  it('falls back to english when preferred locale is not available', () => {
    const selected = pickStructuredLyric(
      [
        {
          lang: 'eng',
          synced: true,
          line: [{ start: 1000, value: 'English line' }],
        },
        {
          lang: 'deu',
          synced: true,
          line: [{ start: 1000, value: 'Deutsche Zeile' }],
        },
      ],
      'pt-BR',
    )
    expect(selected.lang).toBe('eng')
  })
  it('falls back to first synced track when english is missing', () => {
    const selected = pickStructuredLyric(
      [
        {
          lang: 'jpn',
          synced: true,
          line: [{ start: 1000, value: 'Nihongo' }],
        },
        {
          lang: 'deu',
          synced: true,
          line: [{ start: 1000, value: 'Deutsch' }],
        },
      ],
      'pt-BR',
    )
    expect(selected.lang).toBe('jpn')
  })
  it('selects translation and pronunciation layers by kind', () => {
    const layers = selectLyricLayers(
      [
        {
          kind: 'main',
          lang: 'ja',
          synced: true,
          line: [{ start: 1000, value: 'こんにちは' }],
        },
        {
          kind: 'translation',
          lang: 'es',
          synced: true,
          line: [{ start: 1000, value: 'Hola' }],
        },
        {
          kind: 'pronunciation',
          lang: 'ja-Latn',
          synced: true,
          line: [{ start: 1000, value: 'konnichiwa' }],
        },
      ],
      'es-MX',
    )
    expect(layers.main.lang).toBe('ja')
    expect(layers.translation.lang).toBe('es')
    expect(layers.pronunciation.lang).toBe('ja-Latn')
  })
  it('treats missing kind as main for backward compatibility', () => {
    const layers = selectLyricLayers(
      [
        {
          lang: 'eng',
          synced: true,
          line: [{ start: 1000, value: 'Main' }],
        },
      ],
      'eng',
    )
    expect(layers.main.lang).toBe('eng')
    expect(layers.translation).toBeNull()
    expect(layers.pronunciation).toBeNull()
  })
  it('falls back to unsynced lyric content when no timed track exists', () => {
    const layers = selectLyricLayers(
      [
        {
          lang: 'eng',
          synced: false,
          line: [{ value: 'Plain embedded lyric' }],
        },
      ],
      'eng',
    )
    expect(layers.main).toEqual({
      lang: 'eng',
      synced: false,
      line: [{ value: 'Plain embedded lyric' }],
    })
  })
  it('still prefers timed lyrics when both timed and untimed tracks exist', () => {
    const layers = selectLyricLayers(
      [
        {
          lang: 'eng',
          synced: false,
          line: [{ value: 'Plain lyric' }],
        },
        {
          lang: 'eng',
          synced: true,
          line: [{ start: 1000, value: 'Timed lyric' }],
        },
      ],
      'eng',
    )
    expect(layers.main).toEqual({
      lang: 'eng',
      synced: true,
      line: [{ start: 1000, value: 'Timed lyric' }],
    })
  })
  it('matches layer line by timing for the active main line', () => {
    const mainLines = [
      { index: 0, start: 1000, end: 1800, value: 'Line A', tokens: [] },
      { index: 1, start: 2000, end: 2800, value: 'Line B', tokens: [] },
    ]
    const layerLines = [
      { index: 0, start: 900, end: 1750, value: 'A2', tokens: [] },
      { index: 1, start: 2050, end: 2900, value: 'B2', tokens: [] },
    ]
    expect(findLayerLineIndexForMain(mainLines, layerLines, 1)).toBe(1)
    expect(resolveLayerLineForMain(mainLines, layerLines, 0).line.value).toBe(
      'A2',
    )
  })
  // layerLines is deliberately stored out of order. findLayerLineIndexForMain
  // returns a position inside layerLines, not the line's own `index` field, so
  // the match for main line 1 ('B2', index: 1) is expected at position 2.
  it('matches metadata layers by nearest timing even when indexes differ', () => {
    const mainLines = [
      { index: 0, start: 1000, end: 1800, value: 'Line A', tokens: [] },
      { index: 1, start: 2000, end: 2800, value: 'Line B', tokens: [] },
      { index: 2, start: 3000, end: 3800, value: 'Line C', tokens: [] },
    ]
    const layerLines = [
      { index: 2, start: 3020, end: 3820, value: 'C2', tokens: [] },
      { index: 0, start: 980, end: 1760, value: 'A2', tokens: [] },
      { index: 1, start: 2010, end: 2810, value: 'B2', tokens: [] },
    ]
    expect(findLayerLineIndexForMain(mainLines, layerLines, 1)).toBe(2)
    expect(resolveLayerLineForMain(mainLines, layerLines, 2).line.value).toBe(
      'C2',
    )
  })
  // The main line has per-token timing but the translation line has none; the
  // helper must hand back the very same translation object without inventing
  // tokens for it.
  it('keeps translation lines line-level when they do not have real cue timing', () => {
    const mainLine = {
      index: 0,
      start: 1000,
      end: 2200,
      value: '불을 질러라',
      tokens: [
        { start: 1000, end: 1300, value: '불을 ' },
        { start: 1300, end: 1650, value: '질' },
        { start: 1650, end: 2200, value: '러라' },
      ],
    }
    const translationLine = {
      index: 0,
      start: 1000,
      end: 2200,
      value: 'Set it on fire',
      tokens: [],
    }
    // NOTE: buildHighlightedAuxLine declares two parameters, so the third
    // argument (2600) is ignored by the current implementation.
    const highlighted = buildHighlightedAuxLine(mainLine, translationLine, 2600)
    expect(highlighted).toBe(translationLine)
    expect(highlighted.tokens).toEqual([])
  })
  it('keeps pronunciation lines line-level when they do not have real cue timing', () => {
    const mainLine = {
      index: 0,
      start: 1000,
      end: 2200,
      value: 'You もっと強く 素早く 吹き飛ばせ',
      tokens: [],
    }
    const pronunciationLine = {
      index: 0,
      start: 1000,
      end: 2200,
      value: 'You motto tsuyoku subayaku fukitobase',
      tokens: [],
    }
    const highlighted = buildHighlightedAuxLine(
      mainLine,
      pronunciationLine,
      2600,
    )
    expect(highlighted).toBe(pronunciationLine)
    expect(highlighted.tokens).toEqual([])
  })
  it('keeps main lines line-level when they do not have real cue timing', () => {
    const line = {
      index: 0,
      start: 1000,
      end: 2200,
      value: 'Youもっと強く 素早く 吹き飛ばせ',
      tokens: [],
    }
    const highlighted = buildHighlightedMainLine(line, 2600)
    expect(highlighted).toBe(line)
    expect(highlighted.tokens).toEqual([])
  })
  it('keeps auxiliary lines line-level when end time is missing and they lack cues', () => {
    const mainLine = {
      index: 0,
      start: 1000,
      end: null,
      value: 'Hello there',
      tokens: [],
    }
    const translationLine = {
      index: 0,
      start: 1000,
      end: null,
      value: 'Bonjour toi',
      tokens: [],
    }
    const highlighted = buildHighlightedAuxLine(mainLine, translationLine, 2400)
    expect(highlighted).toBe(translationLine)
    expect(highlighted.tokens).toEqual([])
  })
  it('keeps main lines line-level when end time is missing and they lack cues', () => {
    const line = {
      index: 0,
      start: 1000,
      end: null,
      value: 'One more time',
      tokens: [],
    }
    const highlighted = buildHighlightedMainLine(line, 2400)
    expect(highlighted).toBe(line)
    expect(highlighted.tokens).toEqual([])
  })
  it('returns no layer match when the nearest line is too far in time', () => {
    const mainLines = [
      { index: 0, start: 1000, end: 1800, value: 'Line A', tokens: [] },
      { index: 1, start: 2000, end: 2800, value: 'Line B', tokens: [] },
    ]
    const layerLines = [
      { index: 0, start: 60000, end: 60800, value: 'Far line', tokens: [] },
    ]
    expect(findLayerLineIndexForMain(mainLines, layerLines, 1)).toBe(-1)
    expect(resolveLayerLineForMain(mainLines, layerLines, 1).line).toBeNull()
  })
  // Start times are truncated to hundredths of a second
  // (Math.floor(start / 10)), so 22801 renders as [00:22.80]. Every emitted
  // line ends with a newline, including the last one.
  it('converts a structured lyric track to LRC', () => {
    const lrc = structuredLyricToLrc({
      lang: 'eng',
      synced: true,
      line: [
        { start: 18800, value: "We're no strangers to love" },
        { start: 22801, value: 'You know the rules and so do I' },
      ],
    })
    expect(lrc).toBe(
      "[00:18.80] We're no strangers to love\n[00:22.80] You know the rules and so do I\n",
    )
  })
  it('returns empty text when no synced lyrics are available', () => {
    const lrc = structuredLyricsToLrc(
      [{ lang: 'eng', synced: false, line: [{ value: 'Unsynced line' }] }],
      'eng',
    )
    expect(lrc).toBe('')
  })
  it('reads preferred language from localStorage first', () => {
    localStorage.setItem('locale', 'pt-BR')
    expect(getPreferredLyricLanguage()).toBe('pt-BR')
  })
  it('builds karaoke lines from agent-based cueLine payload', () => {
    const lines = buildKaraokeLines({
      lang: 'eng',
      synced: true,
      line: [{ start: 1000, end: 3000, value: 'Hello world' }],
      agents: [
        { id: 'lead', role: 'main', name: 'Lead Vocal' },
        { id: 'backing', role: 'bg' },
      ],
      cueLine: [
        {
          index: 0,
          start: 1000,
          end: 3000,
          value: 'Hello world',
          agentId: 'lead',
          cue: [{ start: 1000, end: 1500, value: 'Hello' }],
        },
        {
          index: 0,
          start: 1000,
          end: 3000,
          value: 'Hello world',
          agentId: 'backing',
          cue: [{ start: 2000, end: 2500, value: 'world' }],
        },
      ],
    })
    expect(lines).toEqual([
      {
        agentId: 'lead',
        agentName: 'Lead Vocal',
        agentRole: 'main',
        index: 0,
        start: 1000,
        end: 3000,
        value: 'Hello world',
        tokens: [
          {
            start: 1000,
            end: 1500,
            value: 'Hello',
            role: '',
            agentId: 'lead',
            agentName: 'Lead Vocal',
            agentRole: 'main',
          },
          {
            start: 2000,
            end: 2500,
            value: 'world',
            role: 'bg',
            agentId: 'backing',
            agentName: '',
            agentRole: 'bg',
          },
        ],
      },
    ])
  })
  it('builds grouped karaoke lines directly from cue lines', () => {
    const agentLookup = new Map([
      ['lead', { id: 'lead', role: 'main', name: 'Lead Vocal' }],
      ['backing', { id: 'backing', role: 'bg', name: '' }],
    ])
    const lines = buildKaraokeLinesFromCueLines(
      [
        {
          index: 0,
          start: 1000,
          end: 3000,
          value: 'Hello world',
          agentId: 'lead',
          cue: [{ start: 1000, end: 1500, value: 'Hello' }],
        },
        {
          index: 0,
          start: 1000,
          end: 3000,
          value: 'Hello world',
          agentId: 'backing',
          cue: [{ start: 2000, end: 2500, value: 'world' }],
        },
      ],
      [{ start: 1000, end: 3000, value: 'Hello world' }],
      agentLookup,
    )
    expect(lines).toEqual([
      {
        agentId: 'lead',
        agentName: 'Lead Vocal',
        agentRole: 'main',
        index: 0,
        start: 1000,
        end: 3000,
        value: 'Hello world',
        tokens: [
          {
            start: 1000,
            end: 1500,
            value: 'Hello',
            role: '',
            agentId: 'lead',
            agentName: 'Lead Vocal',
            agentRole: 'main',
          },
          {
            start: 2000,
            end: 2500,
            value: 'world',
            role: 'bg',
            agentId: 'backing',
            agentName: '',
            agentRole: 'bg',
          },
        ],
      },
    ])
  })
  it('preserves cue byte offsets on karaoke tokens', () => {
    const lines = buildKaraokeLines({
      lang: 'eng',
      synced: true,
      line: [{ start: 0, end: 2400, value: 'Oh love love me tonight' }],
      cueLine: [
        {
          index: 0,
          start: 0,
          end: 2400,
          value: 'Oh love love me tonight',
          cue: [
            { start: 0, end: 300, value: 'Oh', byteStart: 0, byteEnd: 1 },
            { start: 900, end: 1300, value: 'love', byteStart: 8, byteEnd: 11 },
            { start: 1300, end: 1600, value: 'me', byteStart: 13, byteEnd: 14 },
            {
              start: 1600,
              end: 2400,
              value: 'tonight',
              byteStart: 16,
              byteEnd: 22,
            },
          ],
        },
      ],
    })
    expect(
      lines[0].tokens.map((token) => [
        token.value,
        token.byteStart,
        token.byteEnd,
      ]),
    ).toEqual([
      ['Oh', 0, 1],
      ['love', 8, 11],
      ['me', 13, 14],
      ['tonight', 16, 22],
    ])
  })
  it('preserves whitespace-only cues for exact byte-range rendering', () => {
    const lines = buildKaraokeLines({
      lang: 'kor',
      synced: true,
      line: [{ start: 0, end: 900, value: '눈을 뜬 순간' }],
      cueLine: [
        {
          index: 0,
          start: 0,
          end: 900,
          value: '눈을 뜬 순간',
          cue: [
            { start: 0, end: 150, value: '눈을', byteStart: 0, byteEnd: 5 },
            { start: 150, end: 250, value: ' ', byteStart: 6, byteEnd: 6 },
            { start: 250, end: 450, value: '뜬', byteStart: 7, byteEnd: 9 },
            { start: 450, end: 550, value: ' ', byteStart: 10, byteEnd: 10 },
            { start: 550, end: 900, value: '순간', byteStart: 11, byteEnd: 16 },
          ],
        },
      ],
    })
    expect(
      lines[0].tokens.map((token) => [
        token.value,
        token.byteStart,
        token.byteEnd,
      ]),
    ).toEqual([
      ['눈을', 0, 5],
      [' ', 6, 6],
      ['뜬', 7, 9],
      [' ', 10, 10],
      ['순간', 11, 16],
    ])
  })
  it('maps UTF-8 byte offsets to string ranges for multibyte lyrics', () => {
    const text = '눈을 뜬 순간'
    expect(utf8ByteOffsetToCodeUnitIndex(text, 0)).toBe(0)
    expect(utf8ByteOffsetToCodeUnitIndex(text, 3)).toBe(1)
    expect(utf8ByteOffsetToCodeUnitIndex(text, 7)).toBe(3)
    expect(utf8ByteRangeToCodeUnitRange(text, 11, 16)).toEqual({
      start: 5,
      end: 7,
      text: '순간',
    })
  })
  it('falls back to legacy cueLine role values when agents are absent', () => {
    const lines = buildKaraokeLines({
      lang: 'eng',
      synced: true,
      line: [{ start: 1000, end: 3000, value: 'Hello world' }],
      cueLine: [
        {
          index: 0,
          start: 1000,
          end: 3000,
          value: 'Hello world',
          role: 'bg',
          cue: [{ start: 1000, end: 1500, value: 'Hello' }],
        },
      ],
    })
    expect(lines[0].tokens[0].role).toBe('bg')
    expect(lines[0].tokens[0].agentId).toBe('')
    expect(lines[0].tokens[0].agentName).toBe('')
  })
  it('sorts token timing by start to keep playback stable', () => {
    const lines = buildKaraokeLines({
      lang: 'eng',
      synced: true,
      line: [{ start: 1000, end: 3000, value: 'Hello world' }],
      cueLine: [
        {
          index: 0,
          start: 1000,
          end: 3000,
          value: 'Hello world',
          role: '',
          cue: [
            { start: 2000, end: 2500, value: 'world' },
            { start: 1000, end: 1500, value: 'Hello' },
          ],
        },
      ],
    })
    expect(lines[0].tokens.map((token) => token.value)).toEqual([
      'Hello',
      'world',
    ])
  })
  it('keeps a single full-line token unchanged instead of expanding it synthetically', () => {
    const lines = buildKaraokeLines({
      lang: 'ko-Latn',
      synced: true,
      line: [{ start: 1000, end: 2000, value: 'Da-la-lun, dun' }],
      cueLine: [
        {
          index: 0,
          start: 1000,
          end: 2000,
          value: 'Da-la-lun, dun',
          role: '',
          cue: [{ start: 1000, end: 2000, value: 'Da-la-lun, dun' }],
        },
      ],
    })
    expect(lines).toHaveLength(1)
    expect(lines[0].tokens).toHaveLength(1)
    expect(lines[0].tokens[0].value).toBe('Da-la-lun, dun')
    const firstWindow = resolveKaraokeTokenWindow(lines[0], 0)
    expect(firstWindow.start).toBeCloseTo(1000)
    expect(firstWindow.end).toBeCloseTo(2000)
  })
  it('detects active line and token for karaoke timing', () => {
    const state = getActiveKaraokeState(
      [
        {
          index: 0,
          start: 1000,
          end: 3000,
          value: 'Hello world',
          tokens: [
            { start: 1000, end: 1500, value: 'Hello', role: '' },
            { start: 2000, end: 2500, value: 'world', role: '' },
          ],
        },
        {
          index: 1,
          start: 3500,
          end: 5000,
          value: 'Second line',
          tokens: [],
        },
      ],
      2200,
    )
    expect(state).toEqual({ lineIndex: 0, tokenIndex: 1 })
  })
  it('resolves token window fallback boundaries from neighboring tokens', () => {
    const line = {
      start: 1000,
      end: 3000,
      value: 'Hello world',
      tokens: [
        { start: 1200, value: 'Hello', role: '' },
        { start: 1800, value: 'world', role: '' },
      ],
    }
    expect(resolveKaraokeTokenWindow(line, 0)).toEqual({
      start: 1200,
      end: 1800,
    })
    expect(resolveKaraokeTokenWindow(line, 1)).toEqual({
      start: 1800,
      end: 3000,
    })
  })
  it('infers sequential token windows when token timings are missing', () => {
    const line = {
      start: 1000,
      end: 2000,
      value: 'A B C',
      tokens: [
        { value: 'A', role: '' },
        { value: 'B', role: '' },
        { value: 'C', role: '' },
      ],
    }
    const first = resolveKaraokeTokenWindow(line, 0)
    const second = resolveKaraokeTokenWindow(line, 1)
    const third = resolveKaraokeTokenWindow(line, 2)
    expect(first.start).toBeCloseTo(1000)
    expect(first.end).toBeCloseTo(1333.3333333333333)
    expect(second.start).toBeCloseTo(1333.3333333333333)
    expect(second.end).toBeCloseTo(1666.6666666666667)
    expect(third.start).toBeCloseTo(1666.6666666666667)
    expect(third.end).toBeCloseTo(2000)
  })
  it('falls back to sequential windows when token timings are collapsed', () => {
    const line = {
      start: 1000,
      end: 2000,
      value: 'A B C',
      tokens: [
        { start: 1000, end: 2000, value: 'A', role: '' },
        { start: 1000, end: 2000, value: 'B', role: '' },
        { start: 1000, end: 2000, value: 'C', role: '' },
      ],
    }
    const first = resolveKaraokeTokenWindow(line, 0)
    const second = resolveKaraokeTokenWindow(line, 1)
    const third = resolveKaraokeTokenWindow(line, 2)
    expect(first.start).toBeCloseTo(1000)
    expect(first.end).toBeCloseTo(1333.3333333333333)
    expect(second.start).toBeCloseTo(1333.3333333333333)
    expect(second.end).toBeCloseTo(1666.6666666666667)
    expect(third.start).toBeCloseTo(1666.6666666666667)
    expect(third.end).toBeCloseTo(2000)
  })
  it('keeps token selection stable near tight token boundaries', () => {
    const state = getActiveKaraokeState(
      [
        {
          index: 0,
          start: 1000,
          end: 2000,
          value: 'A B',
          tokens: [
            { start: 1000, end: 1100, value: 'A', role: '' },
            { start: 1110, end: 1300, value: 'B', role: '' },
          ],
        },
      ],
      1108,
    )
    expect(state).toEqual({ lineIndex: 0, tokenIndex: 0 })
  })
  it('reports structured lyric content when token timing exists', () => {
    expect(
      hasStructuredLyricContent({
        cueLine: [{ cue: [{ start: 100, value: 'a' }] }],
      }),
    ).toBe(true)
  })
  it('detects when built karaoke lines have no usable timing', () => {
    expect(
      hasUsableKaraokeTiming([
        { index: 0, value: 'First line', tokens: [] },
        { index: 1, value: 'Second line', tokens: [] },
      ]),
    ).toBe(false)
    expect(
      hasUsableKaraokeTiming([
        { index: 0, start: 1000, value: 'Timed line', tokens: [] },
      ]),
    ).toBe(true)
  })
 })
--- a/ui/src/audioplayer/lyricsOverlayState.js
+++ b/ui/src/audioplayer/lyricsOverlayState.js
@ -0,0 +1,27 @@
 // Combines the user's overlay preferences with what the current track
 // offers. Each flag is `preference && availability`, so a layer is only
 // shown when it is both wanted and present. A pronunciation preference that
 // was never set (null/undefined) defaults to the availability of the layer.
 export const resolveLyricsOverlayState = ({
  karaokeVisiblePreference,
  translationPreference,
  pronunciationPreference,
  hasKaraokeLyric,
  hasTranslationLyric,
  hasPronunciationLyric,
 }) => {
  const effectivePronunciationPreference =
    pronunciationPreference == null
      ? hasPronunciationLyric
      : pronunciationPreference
  return {
    karaokeVisible: karaokeVisiblePreference && hasKaraokeLyric,
    showTranslation: translationPreference && hasTranslationLyric,
    showPronunciation:
      effectivePronunciationPreference && hasPronunciationLyric,
  }
 }
 // Computes the pronunciation preference after the user toggles it.
 // Without a pronunciation layer the result is always false. Otherwise the
 // effective current preference (an unset preference counts as "layer
 // available") is negated.
 export const togglePronunciationPreference = (
  previousPreference,
  hasPronunciationLyric,
 ) => {
  if (!hasPronunciationLyric) {
    return false
  }
  return !(previousPreference ?? hasPronunciationLyric)
 }
--- a/ui/src/audioplayer/styles.js
+++ b/ui/src/audioplayer/styles.js
@ -62,12 +62,30 @@ const useStyle = makeStyles(
          // Fix cover display when image is not square
          aspectRatio: '1/1',
          display: 'flex',
          position: 'relative',
        },
      '& .react-jinke-music-player-mobile .react-jinke-music-player-mobile-cover.nd-mobile-lyrics-active':
        {
          width: '100%',
          maxWidth: 'none',
          height: 'clamp(280px, 42vh, 460px)',
          aspectRatio: 'auto',
          borderRadius: 12,
          border: 'none',
          boxShadow: 'none',
          background: 'transparent',
          cursor: 'default',
        },
      '& .react-jinke-music-player-mobile .react-jinke-music-player-mobile-cover img.cover':
        {
          animationDuration: (props) => !props.enableCoverAnimation && '0s',
          objectFit: 'contain', // Fix cover display when image is not square
        },
      '& .react-jinke-music-player-mobile .react-jinke-music-player-mobile-cover.nd-mobile-lyrics-active img.cover':
        {
          opacity: 0,
          pointerEvents: 'none',
        },
      // Hide old singer display
      '& .react-jinke-music-player-mobile .react-jinke-music-player-mobile-singer':
        {
--- a/ui/src/reducers/playerReducer.js
+++ b/ui/src/reducers/playerReducer.js
@ -7,6 +7,7 @@ import {
  PLAYER_CURRENT,
  PLAYER_PLAY_NEXT,
  PLAYER_PLAY_TRACKS,
  PLAYER_UPDATE_LYRIC,
  PLAYER_SET_TRACK,
  PLAYER_SET_VOLUME,
  PLAYER_SYNC_QUEUE,
@ -60,21 +61,25 @@ const mapToAudioLists = (item) => {
  let lyricText = ''
  if (lyrics) {
-    const structured = JSON.parse(lyrics)
+    try {
-    for (const structuredLyric of structured) {
+      const structured = JSON.parse(lyrics)
-      if (structuredLyric.synced) {
+      for (const structuredLyric of structured) {
-        for (const line of structuredLyric.line) {
+        if (structuredLyric.synced) {
-          let time = Math.floor(line.start / 10)
+          for (const line of structuredLyric.line) {
-          const ms = time % 100
+            let time = Math.floor(line.start / 10)
-          time = Math.floor(time / 100)
+            const ms = time % 100
-          const sec = time % 60
+            time = Math.floor(time / 100)
-          time = Math.floor(time / 60)
+            const sec = time % 60
-          const min = time % 60
+            time = Math.floor(time / 60)
            const min = time % 60
-          ms.toString()
+            ms.toString()
-          lyricText += `[${pad(min)}:${pad(sec)}.${pad(ms)}] ${line.value}\n`
+            lyricText += `[${pad(min)}:${pad(sec)}.${pad(ms)}] ${line.value}\n`
          }
        }
      }
    } catch {
      lyricText = ''
    }
  }
@ -208,6 +213,45 @@ const reduceMode = (state, { data: { mode } }) => {
  }
 }
 // Stores a newly resolved lyric for `trackId` on every matching queue entry
 // and on `state.current` when it refers to the same track.
 //
 // Returns the very same `state` object when `trackId` is empty or when nothing
 // would actually change, so callers comparing by reference see no update.
 // `state.current` is checked independently of the queue: it may carry a
 // different lyric than the queue entries and must not be left stale just
 // because the queue already holds the new value.
 const reduceUpdateLyric = (state, { data: { trackId, lyric } }) => {
  if (!trackId) {
    return state
  }
  // True for an entry of this track whose lyric differs from the new one.
  const isStale = (item) => item?.trackId === trackId && item.lyric !== lyric
  const queueStale = state.queue.some(isStale)
  const currentStale = isStale(state.current)
  if (!queueStale && !currentStale) {
    return state
  }
  return {
    ...state,
    // Untouched entries keep their identity; only stale ones are copied.
    queue: queueStale
      ? state.queue.map((item) => (isStale(item) ? { ...item, lyric } : item))
      : state.queue,
    current: currentStale ? { ...state.current, lyric } : state.current,
  }
 }
 export const playerReducer = (previousState = initialState, payload) => {
  const { type } = payload
  switch (type) {
@ -245,6 +289,8 @@ export const playerReducer = (previousState = initialState, payload) => {
          previousState.savedPlayIndex >= 0 ? previousState.savedPlayIndex : 0,
      }
    }
    case PLAYER_UPDATE_LYRIC:
      return reduceUpdateLyric(previousState, payload)
    default:
      return previousState
  }
--- a/ui/src/reducers/playerReducer.test.js
+++ b/ui/src/reducers/playerReducer.test.js
@ -1,11 +1,24 @@
-import { describe, it, expect } from 'vitest'
+import { describe, expect, it, vi } from 'vitest'
 import { playerReducer } from './playerReducer'
 import {
  PLAYER_SYNC_QUEUE,
  PLAYER_CURRENT,
  PLAYER_REFRESH_QUEUE,
  PLAYER_SET_TRACK,
  PLAYER_SYNC_QUEUE,
  PLAYER_UPDATE_LYRIC,
 } from '../actions'
 // Stub `uuid` so that `v4()` always returns the fixed string 'test-uuid'.
 vi.mock('uuid', () => ({
  v4: () => 'test-uuid',
 }))
 // Stub the subsonic module's default export with two mock URL builders:
 // `streamUrl` embeds the given id in a stream URL, and `getCoverArtUrl`
 // returns a constant URL regardless of its arguments.
 vi.mock('../subsonic', () => ({
  default: {
    streamUrl: vi.fn((id) => `/rest/stream?id=${id}`),
    getCoverArtUrl: vi.fn(() => '/rest/getCoverArt?id=test'),
  },
 }))
 describe('playerReducer', () => {
  describe('pending track selection survives SYNC_QUEUE and premature CURRENT', () => {
    // Simulates the real sequence when clicking a new song while one is playing:
@ -54,8 +67,6 @@ describe('playerReducer', () => {
    })
    it('CURRENT for old track preserves pending playIndex', () => {
      // After SYNC_QUEUE, queue has new UUIDs. The old track's UUID (zzz)
      // is at index 2, but playIndex is 0. This is a premature callback.
      const stateAfterSync = {
        ...stateAfterPlayTracks,
        queue: [
@ -71,7 +82,7 @@ describe('playerReducer', () => {
      const result = playerReducer(stateAfterSync, action)
      expect(result.playIndex).toBe(0)
      expect(result.clear).toBe(true)
-      expect(result.savedPlayIndex).toBe(2) // preserved from before
+      expect(result.savedPlayIndex).toBe(2)
    })
    it('CURRENT for correct track consumes pending playIndex', () => {
@ -83,7 +94,6 @@ describe('playerReducer', () => {
          { trackId: 's3', uuid: 'zzz', name: 'Song 3' },
        ],
      }
      // Player switched to Song 1 (uuid 'xxx', index 0 == playIndex)
      const action = {
        type: PLAYER_CURRENT,
        data: { uuid: 'xxx', name: 'Song 1', volume: 1 },
@ -224,4 +234,80 @@ describe('playerReducer', () => {
      expect(result.playIndex).toBe(0)
    })
  })
  it('maps embedded synced lyrics to LRC text', () => {
    // One synced entry (start offset 1000) and one unsynced entry; the
    // expectation below is that only the synced line ends up in the LRC text.
    const syncedEntry = {
      lang: 'eng',
      synced: true,
      line: [{ start: 1000, value: 'Line one' }],
    }
    const unsyncedEntry = {
      lang: 'eng',
      synced: false,
      line: [{ value: 'Unsynced line' }],
    }
    const track = {
      id: 'song-1',
      title: 'Test Song',
      artist: 'Test Artist',
      album: 'Test Album',
      duration: 60,
      lyrics: JSON.stringify([syncedEntry, unsyncedEntry]),
    }

    const { queue } = playerReducer(undefined, {
      type: PLAYER_SET_TRACK,
      data: track,
    })

    expect(queue).toHaveLength(1)
    expect(queue[0].lyric).toBe('[00:01.00] Line one\n')
  })
  it('updates queue lyric by track id', () => {
    const newLyric = '[00:01.00] Updated lyric\n'
    // Seed the queue with a single track that carries no lyrics payload.
    const seeded = playerReducer(undefined, {
      type: PLAYER_SET_TRACK,
      data: {
        id: 'song-1',
        title: 'Test Song',
        artist: 'Test Artist',
        album: 'Test Album',
        duration: 60,
      },
    })

    const { queue } = playerReducer(seeded, {
      type: PLAYER_UPDATE_LYRIC,
      data: { trackId: 'song-1', lyric: newLyric },
    })

    expect(queue[0].lyric).toBe(newLyric)
  })
  it('returns same state when lyric update does not match any track', () => {
    const track = {
      id: 'song-1',
      title: 'Test Song',
      artist: 'Test Artist',
      album: 'Test Album',
      duration: 60,
    }
    const before = playerReducer(undefined, {
      type: PLAYER_SET_TRACK,
      data: track,
    })

    // The update targets an id that is not in the queue.
    const lyricUpdate = {
      type: PLAYER_UPDATE_LYRIC,
      data: {
        trackId: 'missing-track',
        lyric: '[00:01.00] Updated lyric\n',
      },
    }
    const after = playerReducer(before, lyricUpdate)

    // Identity check: the reducer must hand back the very same object.
    expect(after).toBe(before)
  })
 })
--- a/ui/src/subsonic/index.js
+++ b/ui/src/subsonic/index.js
@ -1,5 +1,5 @@
 import { baseUrl } from '../utils'
 import { httpClient } from '../dataProvider'
 import { baseUrl } from '../utils'
 const url = (command, id, options) => {
  const username = localStorage.getItem('username')
@ -120,6 +120,10 @@ const getTopSongs = (artist, count = 50) => {
  return httpClient(url('getTopSongs', null, { artist, count }))
 }
 // Requests the lyrics of song `id` from the `getLyricsBySongId` endpoint,
 // passing `enhanced: true`; returns whatever `httpClient` returns.
 const getLyricsBySongId = (id) => {
  const endpoint = url('getLyricsBySongId', id, { enhanced: true })
  return httpClient(endpoint)
 }
 const streamUrl = (id, options) => {
  return baseUrl(
    url('stream', id, {
@ -149,4 +153,5 @@ export default {
  getArtistInfo,
  getTopSongs,
  getSimilarSongs2,
  getLyricsBySongId,
 }
--- a/ui/src/subsonic/index.test.js
+++ b/ui/src/subsonic/index.test.js
@ -1,7 +1,13 @@
 import { vi } from 'vitest'
-import config from '../config'
+import { httpClient } from '../dataProvider'
 import subsonic from './index'
 // Replace `httpClient` with a mock that resolves to an empty object, so the
 // tests can inspect the URL it was called with.
 vi.mock('../dataProvider', () => ({
  httpClient: vi.fn(() => Promise.resolve({})),
 }))
 const COVER_ART_SIZE = 600
 describe('getCoverArtUrl', () => {
  beforeEach(() => {
    // Mock window.location
@ -31,11 +37,7 @@ describe('getCoverArtUrl', () => {
      updatedAt: '2023-01-01T00:00:00Z',
    }
-    const url = subsonic.getCoverArtUrl(
+    const url = subsonic.getCoverArtUrl(playlistRecord, COVER_ART_SIZE, true)
      playlistRecord,
      config.uiCoverArtSize,
      true,
    )
    expect(url).toContain('pl-playlist-123')
    expect(url).toContain('size=600')
@ -49,11 +51,7 @@ describe('getCoverArtUrl', () => {
      sync: true,
    }
-    const url = subsonic.getCoverArtUrl(
+    const url = subsonic.getCoverArtUrl(playlistRecord, COVER_ART_SIZE, true)
      playlistRecord,
      config.uiCoverArtSize,
      true,
    )
    expect(url).toContain('pl-playlist-123')
    expect(url).toContain('size=600')
@ -68,11 +66,7 @@ describe('getCoverArtUrl', () => {
      updatedAt: '2023-01-01T00:00:00Z',
    }
-    const url = subsonic.getCoverArtUrl(
+    const url = subsonic.getCoverArtUrl(albumRecord, COVER_ART_SIZE, true)
      albumRecord,
      config.uiCoverArtSize,
      true,
    )
    expect(url).toContain('al-album-123')
    expect(url).toContain('size=600')
@ -86,7 +80,7 @@ describe('getCoverArtUrl', () => {
      updatedAt: '2023-01-01T00:00:00Z',
    }
-    const url = subsonic.getCoverArtUrl(songRecord, config.uiCoverArtSize, true)
+    const url = subsonic.getCoverArtUrl(songRecord, COVER_ART_SIZE, true)
    expect(url).toContain('mf-song-123')
    expect(url).toContain('size=600')
@ -99,11 +93,7 @@ describe('getCoverArtUrl', () => {
      updatedAt: '2023-01-01T00:00:00Z',
    }
-    const url = subsonic.getCoverArtUrl(
+    const url = subsonic.getCoverArtUrl(artistRecord, COVER_ART_SIZE, true)
      artistRecord,
      config.uiCoverArtSize,
      true,
    )
    expect(url).toContain('ar-artist-123')
    expect(url).toContain('size=600')
@ -194,3 +184,30 @@ describe('getAvatarUrl', () => {
    expect(url).toContain('username=john')
  })
 })
 describe('getLyricsBySongId', () => {
  // Values served by the localStorage stub; any other key reads as null.
  const storedValues = {
    username: 'testuser',
    'subsonic-token': 'testtoken',
    'subsonic-salt': 'testsalt',
  }

  beforeEach(() => {
    vi.clearAllMocks()
    const getItem = vi.fn((key) => storedValues[key] || null)
    Object.defineProperty(window, 'localStorage', { value: { getItem } })
  })

  it('calls the getLyricsBySongId endpoint with enhanced=true', async () => {
    await subsonic.getLyricsBySongId('song-1')

    expect(httpClient).toHaveBeenCalledTimes(1)
    // First argument of the first (and only) recorded call.
    const [[requestedUrl]] = httpClient.mock.calls
    for (const fragment of [
      '/rest/getLyricsBySongId?',
      'id=song-1',
      'enhanced=true',
    ]) {
      expect(requestedUrl).toContain(fragment)
    }
  })
 })
		`@ -0,0 +1,2 @@`
							`<?xml version="1.0" encoding="UTF-8"?>`
							`<tt xmlns="http://www.w3.org/ns/ttml"><body><div xml:lang="eng"><p begin="00:00:00.00">BOM test line</p></div></body></tt>`