From 9dcec350566ce3fc8f3dc2b7d36420ec1527bfab Mon Sep 17 00:00:00 2001 From: ranokay Date: Fri, 20 Feb 2026 16:54:45 +0200 Subject: [PATCH 01/14] feat: add TTML lyrics support with token-level karaoke and translation/pronunciation layers Add a full TTML (Timed Text Markup Language) sidecar lyrics parser that extracts word/syllable-level timing from <span> elements, plus translation and pronunciation (transliteration) tracks from Apple Music TTML metadata sections. Backend changes: - TTML parser (core/lyrics/ttml.go) with support for all TTML time formats, nested timing contexts, and bare decimal second offsets - Translation/pronunciation tracks resolved via key-based metadata linking - Line timing hydration from token-level start/end values - 'kind' field added to Lyrics model and StructuredLyric API response (main/translation/pronunciation) - 'tokenLine' array in API response for word-level timing data - UTF-8 BOM and UTF-16 LE encoding support for TTML files - Fix for ambiguous time resolution in pronunciation spans (pre-1-minute) Frontend changes: - KaraokeLyricsOverlay rewritten with scrollable multi-line layout, word-level wipe highlighting with eased alpha transitions, rAF-driven playback clock with drift correction - Inline translation (above) and pronunciation (below) each main line, with smart filtering to hide redundant lines (same normalized text) - TR/PR toggle buttons and layer selection via selectLyricLayers() - Click-to-seek: click any lyric line to jump to that position - Customization popover with font-size sliders and color presets for each line type (TR/Default/PR), persisted to localStorage - Smooth font-size transition between active and inactive lines - Resizable overlay height via drag handle - lyrics.js: resolveKaraokeTokenWindow, buildSyntheticWordTokens, findLayerLineIndexForMain, token sorting, collapsed timing detection API extension (non-breaking, additive): - tokenLine[].token[] provides per-word start/end timing (ms) - tokenLine[].index maps 
back to the corresponding line[] entry - kind field: 'main', 'translation', 'pronunciation' - Clients ignoring tokenLine/kind continue to work unchanged --- README.md | 1 + conf/configuration.go | 2 +- core/lyrics/lyrics_test.go | 32 +- core/lyrics/sources.go | 28 +- core/lyrics/sources_test.go | 63 + core/lyrics/sources_ttml_test.go | 92 ++ core/lyrics/ttml.go | 886 +++++++++++++ core/lyrics/ttml_test.go | 398 ++++++ model/lyrics.go | 12 +- server/subsonic/helpers.go | 28 + server/subsonic/media_retrieval.go | 39 +- server/subsonic/media_retrieval_test.go | 301 +++++ server/subsonic/opensubsonic.go | 2 +- server/subsonic/opensubsonic_test.go | 2 +- server/subsonic/responses/responses.go | 29 +- tests/fixtures/bom-test.ttml | 2 + tests/fixtures/bom-utf16-test.ttml | Bin 0 -> 414 bytes tests/fixtures/test-metadata.ttml | 25 + tests/fixtures/test.ttml | 12 + ui/src/actions/player.js | 6 + ui/src/audioplayer/KaraokeLyricsOverlay.jsx | 1228 +++++++++++++++++++ ui/src/audioplayer/Player.jsx | 228 +++- ui/src/audioplayer/PlayerToolbar.jsx | 31 +- ui/src/audioplayer/PlayerToolbar.test.jsx | 19 +- ui/src/audioplayer/lyrics.js | 617 ++++++++++ ui/src/audioplayer/lyrics.test.js | 416 +++++++ ui/src/reducers/playerReducer.js | 70 +- ui/src/reducers/playerReducer.test.js | 98 +- ui/src/subsonic/index.js | 5 + ui/src/subsonic/index.test.js | 60 +- 30 files changed, 4651 insertions(+), 81 deletions(-) create mode 100644 core/lyrics/sources_ttml_test.go create mode 100644 core/lyrics/ttml.go create mode 100644 core/lyrics/ttml_test.go create mode 100644 tests/fixtures/bom-test.ttml create mode 100644 tests/fixtures/bom-utf16-test.ttml create mode 100644 tests/fixtures/test-metadata.ttml create mode 100644 tests/fixtures/test.ttml create mode 100644 ui/src/audioplayer/KaraokeLyricsOverlay.jsx create mode 100644 ui/src/audioplayer/lyrics.js create mode 100644 ui/src/audioplayer/lyrics.test.js diff --git a/README.md b/README.md index 0ae5bdfaf..6b9aff799 100644 --- a/README.md +++ 
b/README.md @@ -52,6 +52,7 @@ A share of the revenue helps fund the development of Navidrome at no additional - **Multi-platform**, runs on macOS, Linux and Windows. **Docker** images are also provided - Ready to use binaries for all major platforms, including **Raspberry Pi** - Automatically **monitors your library** for changes, importing new files and reloading new metadata + - Supports synchronized lyrics from sidecar **.lrc** and **.ttml** files (via `lyricspriority`) - **Themeable**, modern and responsive **Web interface** based on [Material UI](https://material-ui.com) - **Compatible** with all Subsonic/Madsonic/Airsonic [clients](https://www.navidrome.org/docs/overview/#apps) - **Transcoding** on the fly. Can be set per user/player. **Opus encoding is supported** diff --git a/conf/configuration.go b/conf/configuration.go index a8b0e4c8a..af9f6c283 100644 --- a/conf/configuration.go +++ b/conf/configuration.go @@ -730,7 +730,7 @@ func setViperDefaults() { viper.SetDefault("artistartpriority", "artist.*, album/artist.*, external") viper.SetDefault("artistimagefolder", "") viper.SetDefault("discartpriority", "disc*.*, cd*.*, cover.*, folder.*, front.*, discsubtitle, embedded") - viper.SetDefault("lyricspriority", ".lrc,.txt,embedded") + viper.SetDefault("lyricspriority", ".lrc,.ttml,.txt,embedded") viper.SetDefault("enablegravatar", false) viper.SetDefault("enablefavourites", true) viper.SetDefault("enablestarrating", true) diff --git a/core/lyrics/lyrics_test.go b/core/lyrics/lyrics_test.go index 2e495a714..d5f79a4d0 100644 --- a/core/lyrics/lyrics_test.go +++ b/core/lyrics/lyrics_test.go @@ -44,6 +44,35 @@ var _ = Describe("sources", func() { }, } + ttmlLyrics := model.LyricList{ + model.Lyrics{ + Kind: "main", + Lang: "eng", + Line: []model.Line{ + { + Start: gg.P(int64(18800)), + Value: "We're no strangers to love", + }, + { + Start: gg.P(int64(22800)), + Value: "You know the rules and so do I", + }, + }, + Synced: true, + }, + model.Lyrics{ + Kind: 
"main", + Lang: "por", + Line: []model.Line{ + { + Start: gg.P(int64(18800)), + Value: "Nao somos estranhos ao amor", + }, + }, + Synced: true, + }, + } + unsyncedLyrics := model.LyricList{ model.Lyrics{ Lang: "xxx", @@ -80,7 +109,8 @@ var _ = Describe("sources", func() { }, Entry("embedded > lrc > txt", "embedded,.lrc,.txt", embeddedLyrics), Entry("lrc > embedded > txt", ".lrc,embedded,.txt", syncedLyrics), - Entry("txt > lrc > embedded", ".txt,.lrc,embedded", unsyncedLyrics)) + Entry("txt > lrc > embedded", ".txt,.lrc,embedded", unsyncedLyrics), + Entry("ttml > lrc > embedded", ".ttml,.lrc,embedded", ttmlLyrics)) Context("Errors", func() { var RegularUserContext = XContext diff --git a/core/lyrics/sources.go b/core/lyrics/sources.go index 82a10ca41..38a71cb8a 100644 --- a/core/lyrics/sources.go +++ b/core/lyrics/sources.go @@ -5,6 +5,7 @@ import ( "errors" "os" "path" + "strings" "github.com/navidrome/navidrome/log" "github.com/navidrome/navidrome/model" @@ -36,18 +37,31 @@ func fromExternalFile(ctx context.Context, mf *model.MediaFile, suffix string) ( return nil, err } - lyrics, err := model.ToLyrics("xxx", string(contents)) - if err != nil { - log.Error(ctx, "error parsing lyric external file", "path", externalLyric, err) - return nil, err - } else if lyrics == nil { + var list model.LyricList + if strings.EqualFold(suffix, ".ttml") { + list, err = parseTTML(contents) + if err != nil { + log.Error(ctx, "error parsing ttml external file", "path", externalLyric, err) + return nil, err + } + } else { + lyrics, err := model.ToLyrics("xxx", string(contents)) + if err != nil { + log.Error(ctx, "error parsing lyric external file", "path", externalLyric, err) + return nil, err + } + if lyrics != nil { + list = model.LyricList{*lyrics} + } + } + + if len(list) == 0 { log.Trace(ctx, "empty lyrics from external file", "path", externalLyric) return nil, nil } log.Trace(ctx, "retrieved lyrics from external file", "path", externalLyric) - - return model.LyricList{*lyrics}, 
nil + return list, nil } // fromPlugin attempts to load lyrics from a plugin with the given name. diff --git a/core/lyrics/sources_test.go b/core/lyrics/sources_test.go index b3d502101..8823a3175 100644 --- a/core/lyrics/sources_test.go +++ b/core/lyrics/sources_test.go @@ -109,6 +109,41 @@ var _ = Describe("sources", func() { })) }) + It("should return synchronized multilingual lyrics from a TTML file", func() { + mf := model.MediaFile{Path: "tests/fixtures/test.mp3"} + lyrics, err := fromExternalFile(ctx, &mf, ".ttml") + + Expect(err).To(BeNil()) + Expect(lyrics).To(Equal(model.LyricList{ + { + Kind: "main", + Lang: "eng", + Line: []model.Line{ + { + Start: gg.P(int64(18800)), + Value: "We're no strangers to love", + }, + { + Start: gg.P(int64(22800)), + Value: "You know the rules and so do I", + }, + }, + Synced: true, + }, + { + Kind: "main", + Lang: "por", + Line: []model.Line{ + { + Start: gg.P(int64(18800)), + Value: "Nao somos estranhos ao amor", + }, + }, + Synced: true, + }, + })) + }) + It("should handle LRC files with UTF-8 BOM marker (issue #4631)", func() { // The function looks for , so we need to pass // a MediaFile with .mp3 path and look for .lrc suffix @@ -142,5 +177,33 @@ var _ = Describe("sources", func() { Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(22801)))) Expect(lyrics[0].Line[1].Value).To(Equal("You know the rules and so do I")) }) + + It("should handle TTML files with UTF-8 BOM marker", func() { + mf := model.MediaFile{Path: "tests/fixtures/bom-test.mp3"} + lyrics, err := fromExternalFile(ctx, &mf, ".ttml") + + Expect(err).To(BeNil()) + Expect(lyrics).To(HaveLen(1)) + Expect(lyrics[0].Kind).To(Equal("main")) + Expect(lyrics[0].Synced).To(BeTrue()) + Expect(lyrics[0].Line).To(HaveLen(1)) + Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(0)))) + Expect(lyrics[0].Line[0].Value).To(Equal("BOM test line")) + }) + + It("should handle UTF-16 LE encoded TTML files", func() { + mf := model.MediaFile{Path: 
"tests/fixtures/bom-utf16-test.mp3"} + lyrics, err := fromExternalFile(ctx, &mf, ".ttml") + + Expect(err).To(BeNil()) + Expect(lyrics).To(HaveLen(1)) + Expect(lyrics[0].Kind).To(Equal("main")) + Expect(lyrics[0].Synced).To(BeTrue()) + Expect(lyrics[0].Line).To(HaveLen(2)) + Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(18800)))) + Expect(lyrics[0].Line[0].Value).To(Equal("UTF16 line one")) + Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(22801)))) + Expect(lyrics[0].Line[1].Value).To(Equal("UTF16 line two")) + }) }) }) diff --git a/core/lyrics/sources_ttml_test.go b/core/lyrics/sources_ttml_test.go new file mode 100644 index 000000000..217bf7b36 --- /dev/null +++ b/core/lyrics/sources_ttml_test.go @@ -0,0 +1,92 @@ +package lyrics + +import ( + "context" + "os" + "path/filepath" + "testing" + + "github.com/navidrome/navidrome/model" +) + +func TestFromExternalFileTTML(t *testing.T) { + ctx := context.Background() + mf := model.MediaFile{Path: fixturePath("test.mp3")} + + lyrics, err := fromExternalFile(ctx, &mf, ".ttml") + if err != nil { + t.Fatalf("fromExternalFile returned error: %v", err) + } + if len(lyrics) != 2 { + t.Fatalf("expected 2 lyric tracks, got %d", len(lyrics)) + } + if lyrics[0].Lang != "eng" { + t.Fatalf("expected first language 'eng', got %q", lyrics[0].Lang) + } + if len(lyrics[0].Line) != 2 { + t.Fatalf("expected 2 english lines, got %d", len(lyrics[0].Line)) + } + if lyrics[0].Line[0].Start == nil || *lyrics[0].Line[0].Start != 18800 { + t.Fatalf("expected first english line start to be 18800, got %v", lyrics[0].Line[0].Start) + } +} + +func TestFromExternalFileTTMLWithUTF8BOM(t *testing.T) { + ctx := context.Background() + mf := model.MediaFile{Path: fixturePath("bom-test.ttml")} + + lyrics, err := fromExternalFile(ctx, &mf, ".ttml") + if err != nil { + t.Fatalf("fromExternalFile returned error: %v", err) + } + if len(lyrics) != 1 { + t.Fatalf("expected 1 lyric track, got %d", len(lyrics)) + } + if !lyrics[0].Synced { + 
t.Fatal("expected BOM TTML lyrics to be synced") + } + if len(lyrics[0].Line) != 1 { + t.Fatalf("expected 1 lyric line, got %d", len(lyrics[0].Line)) + } + if lyrics[0].Line[0].Start == nil || *lyrics[0].Line[0].Start != 0 { + t.Fatalf("expected first line start 0, got %v", lyrics[0].Line[0].Start) + } +} + +func TestFromExternalFileTTMLUTF16(t *testing.T) { + ctx := context.Background() + mf := model.MediaFile{Path: fixturePath("bom-utf16-test.ttml")} + + lyrics, err := fromExternalFile(ctx, &mf, ".ttml") + if err != nil { + t.Fatalf("fromExternalFile returned error: %v", err) + } + if len(lyrics) != 1 { + t.Fatalf("expected 1 lyric track, got %d", len(lyrics)) + } + if !lyrics[0].Synced { + t.Fatal("expected UTF16 TTML lyrics to be synced") + } + if len(lyrics[0].Line) != 2 { + t.Fatalf("expected 2 lyric lines, got %d", len(lyrics[0].Line)) + } + if lyrics[0].Line[0].Start == nil || *lyrics[0].Line[0].Start != 18800 { + t.Fatalf("expected first line start 18800, got %v", lyrics[0].Line[0].Start) + } + if lyrics[0].Line[1].Start == nil || *lyrics[0].Line[1].Start != 22801 { + t.Fatalf("expected second line start 22801, got %v", lyrics[0].Line[1].Start) + } +} + +func fixturePath(name string) string { + candidates := []string{ + filepath.Join("tests", "fixtures", name), + filepath.Join("..", "..", "tests", "fixtures", name), + } + for _, candidate := range candidates { + if _, err := os.Stat(candidate); err == nil { + return candidate + } + } + return filepath.Join("tests", "fixtures", name) +} diff --git a/core/lyrics/ttml.go b/core/lyrics/ttml.go new file mode 100644 index 000000000..3aae53aa0 --- /dev/null +++ b/core/lyrics/ttml.go @@ -0,0 +1,886 @@ +package lyrics + +import ( + "bytes" + "encoding/xml" + "errors" + "io" + "math" + "regexp" + "sort" + "strconv" + "strings" + + "github.com/navidrome/navidrome/log" + "github.com/navidrome/navidrome/model" + "github.com/navidrome/navidrome/utils/str" +) + +const ( + defaultTTMLFrameRate = 30.0 + 
defaultTTMLSubFrameRate = 1.0 + defaultTTMLTickRate = 1.0 + + ttmlLyricKindMain = "main" + ttmlLyricKindTranslation = "translation" + ttmlLyricKindPronunciation = "pronunciation" +) + +var offsetTimeRegex = regexp.MustCompile(`^([0-9]+(?:\.[0-9]+)?)(h|m|s|ms|f|t)$`) +var xmlEncodingRegex = regexp.MustCompile(`(?i)<\?xml([^>]*?)encoding\s*=\s*["'][^"']+["']([^>]*)\?>`) + +type ttmlTimeKind int + +const ( + ttmlTimeAbsolute ttmlTimeKind = iota + ttmlTimeOffset + ttmlTimeAmbiguous +) + +type ttmlTimingParams struct { + frameRate float64 + subFrameRate float64 + tickRate float64 +} + +type ttmlTimingContext struct { + lang string + role string + begin int64 + hasBegin bool + end int64 + hasEnd bool + invalid bool +} + +type ttmlLineRef struct { + order int + line model.Line +} + +type ttmlMetadataEntry struct { + key string + line model.Line + seq int +} + +type ttmlResolvedMetadataLine struct { + order int + seq int + line model.Line +} + +type ttmlParser struct { + decoder *xml.Decoder + params ttmlTimingParams + + mainLangOrder []string + mainLinesByLang map[string][]model.Line + + mainLineRefsByKey map[string]ttmlLineRef + mainLineOrder int + + translationLangOrder []string + translationEntriesByLg map[string][]ttmlMetadataEntry + + pronunciationLangOrder []string + pronunciationEntriesByLg map[string][]ttmlMetadataEntry + + metadataSeq int +} + +func parseTTML(contents []byte) (model.LyricList, error) { + contents = xmlEncodingRegex.ReplaceAll(contents, []byte(``)) + + p := ttmlParser{ + decoder: xml.NewDecoder(bytes.NewReader(contents)), + params: ttmlTimingParams{ + frameRate: defaultTTMLFrameRate, + subFrameRate: defaultTTMLSubFrameRate, + tickRate: defaultTTMLTickRate, + }, + mainLinesByLang: make(map[string][]model.Line), + mainLineRefsByKey: make(map[string]ttmlLineRef), + translationEntriesByLg: make(map[string][]ttmlMetadataEntry), + pronunciationEntriesByLg: make(map[string][]ttmlMetadataEntry), + } + + root := ttmlTimingContext{lang: "xxx"} + + for { + 
token, err := p.decoder.Token() + if errors.Is(err, io.EOF) { + break + } + if err != nil { + return nil, err + } + + start, ok := token.(xml.StartElement) + if !ok { + continue + } + + if err := p.parseElement(start, root); err != nil { + return nil, err + } + } + + return p.toLyricList(), nil +} + +func (p *ttmlParser) parseElement(start xml.StartElement, parent ttmlTimingContext) error { + local := strings.ToLower(start.Name.Local) + if local == "tt" { + p.updateTimingParams(start.Attr) + } + + switch local { + case "translation": + return p.parseMetadataTrack(start, parent, ttmlLyricKindTranslation) + case "transliteration": + return p.parseMetadataTrack(start, parent, ttmlLyricKindPronunciation) + } + + ctx := p.childContext(start.Attr, parent) + if local == "p" { + lineText, tokens, err := p.parseParagraph(ctx) + if err != nil { + return err + } + if ctx.invalid || lineText == "" { + return nil + } + + parsedLine := model.Line{Value: lineText} + if ctx.hasBegin { + startMs := ctx.begin + parsedLine.Start = &startMs + } + if ctx.hasEnd { + endMs := ctx.end + parsedLine.End = &endMs + } + if len(tokens) > 0 { + parsedLine.Token = tokens + } + parsedLine = hydrateLineTimingFromTokens(parsedLine) + + lineKey, _ := attrValue(start.Attr, "key") + p.addMainLine(ctx.lang, lineKey, parsedLine) + return nil + } + + for { + token, err := p.decoder.Token() + if err != nil { + return err + } + + switch t := token.(type) { + case xml.StartElement: + nextParent := ctx + if ctx.invalid { + // Best effort: ignore invalid timing in container elements, and + // continue traversing descendants with parent context. 
+ nextParent = parent + } + if err := p.parseElement(t, nextParent); err != nil { + return err + } + case xml.EndElement: + if strings.EqualFold(t.Name.Local, start.Name.Local) { + return nil + } + } + } +} + +func (p *ttmlParser) parseMetadataTrack(start xml.StartElement, parent ttmlTimingContext, kind string) error { + ctx := p.childContext(start.Attr, parent) + lang := normalizeTTMLLang(ctx.lang) + + for { + token, err := p.decoder.Token() + if err != nil { + return err + } + + switch t := token.(type) { + case xml.StartElement: + if strings.EqualFold(t.Name.Local, "text") { + entry, ok, err := p.parseMetadataText(t, ctx) + if err != nil { + return err + } + if ok { + p.addMetadataEntry(kind, lang, entry) + } + continue + } + + nextParent := ctx + if ctx.invalid { + nextParent = parent + } + if err := p.parseElement(t, nextParent); err != nil { + return err + } + case xml.EndElement: + if strings.EqualFold(t.Name.Local, start.Name.Local) { + return nil + } + } + } +} + +func (p *ttmlParser) parseMetadataText(start xml.StartElement, parent ttmlTimingContext) (ttmlMetadataEntry, bool, error) { + forKey, hasFor := attrValue(start.Attr, "for") + forKey = strings.TrimSpace(forKey) + + value, tokens, err := p.parseInlineElement(start, parent) + if err != nil { + return ttmlMetadataEntry{}, false, err + } + if !hasFor || forKey == "" { + return ttmlMetadataEntry{}, false, nil + } + + ctx := p.childContext(start.Attr, parent) + if ctx.invalid { + return ttmlMetadataEntry{}, false, nil + } + + line := model.Line{Value: sanitizeTTMLText(value)} + if ctx.hasBegin { + startMs := ctx.begin + line.Start = &startMs + } + if ctx.hasEnd { + endMs := ctx.end + line.End = &endMs + } + if len(tokens) > 0 { + line.Token = tokens + } + line = hydrateLineTimingFromTokens(line) + + if line.Value == "" && len(line.Token) == 0 { + return ttmlMetadataEntry{}, false, nil + } + + return ttmlMetadataEntry{key: forKey, line: line}, true, nil +} + +func (p *ttmlParser) parseParagraph(parent 
ttmlTimingContext) (string, []model.Token, error) { + var text strings.Builder + var tokens []model.Token + + for { + token, err := p.decoder.Token() + if err != nil { + return "", nil, err + } + + switch t := token.(type) { + case xml.StartElement: + value, inlineTokens, err := p.parseInlineElement(t, parent) + if err != nil { + return "", nil, err + } + text.WriteString(value) + tokens = append(tokens, inlineTokens...) + case xml.EndElement: + if strings.EqualFold(t.Name.Local, "p") { + return sanitizeTTMLText(text.String()), tokens, nil + } + case xml.CharData: + text.WriteString(string(t)) + } + } +} + +func (p *ttmlParser) parseInlineElement(start xml.StartElement, parent ttmlTimingContext) (string, []model.Token, error) { + local := strings.ToLower(start.Name.Local) + if local == "br" { + return "\n", nil, nil + } + + ctx := p.childContext(start.Attr, parent) + _, hasBegin := attrValue(start.Attr, "begin") + _, hasEnd := attrValue(start.Attr, "end") + _, hasDur := attrValue(start.Attr, "dur") + hasOwnTiming := hasBegin || hasEnd || hasDur + + var text strings.Builder + var tokens []model.Token + + for { + token, err := p.decoder.Token() + if err != nil { + return "", nil, err + } + + switch t := token.(type) { + case xml.StartElement: + value, inlineTokens, err := p.parseInlineElement(t, ctx) + if err != nil { + return "", nil, err + } + text.WriteString(value) + tokens = append(tokens, inlineTokens...) 
+ case xml.EndElement: + if !strings.EqualFold(t.Name.Local, start.Name.Local) { + continue + } + + value := text.String() + tokenText := sanitizeTTMLText(value) + if local == "span" && hasOwnTiming && !ctx.invalid && tokenText != "" && len(tokens) == 0 { + parsedToken := model.Token{ + Value: tokenText, + Role: ctx.role, + } + if ctx.hasBegin { + startMs := ctx.begin + parsedToken.Start = &startMs + } + if ctx.hasEnd { + endMs := ctx.end + parsedToken.End = &endMs + } + tokens = append(tokens, parsedToken) + } + + return value, tokens, nil + case xml.CharData: + text.WriteString(string(t)) + } + } +} + +func (p *ttmlParser) toLyricList() model.LyricList { + res := make(model.LyricList, 0, len(p.mainLangOrder)+len(p.translationLangOrder)+len(p.pronunciationLangOrder)) + for _, lang := range p.mainLangOrder { + lines := p.mainLinesByLang[lang] + if len(lines) == 0 { + continue + } + res = append(res, model.Lyrics{ + Kind: ttmlLyricKindMain, + Lang: lang, + Line: lines, + Synced: linesAreSynced(lines), + }) + } + + res = append(res, p.buildMetadataLyrics(ttmlLyricKindTranslation, p.translationLangOrder, p.translationEntriesByLg)...) + res = append(res, p.buildMetadataLyrics(ttmlLyricKindPronunciation, p.pronunciationLangOrder, p.pronunciationEntriesByLg)...) 
+ return res +} + +func (p *ttmlParser) buildMetadataLyrics(kind string, langOrder []string, entriesByLang map[string][]ttmlMetadataEntry) model.LyricList { + res := make(model.LyricList, 0, len(langOrder)) + + for _, lang := range langOrder { + entries := entriesByLang[lang] + if len(entries) == 0 { + continue + } + + seenKeys := make(map[string]struct{}, len(entries)) + resolved := make([]ttmlResolvedMetadataLine, 0, len(entries)) + for _, entry := range entries { + if _, exists := seenKeys[entry.key]; exists { + continue + } + seenKeys[entry.key] = struct{}{} + + ref, ok := p.mainLineRefsByKey[entry.key] + if !ok { + log.Warn("Skipping TTML metadata line without matching key", "kind", kind, "lang", lang, "key", entry.key) + continue + } + + line := entry.line + if line.Start == nil && ref.line.Start != nil { + startMs := *ref.line.Start + line.Start = &startMs + } + if line.End == nil && ref.line.End != nil { + endMs := *ref.line.End + line.End = &endMs + } + line = hydrateLineTimingFromTokens(line) + + if line.Value == "" && len(line.Token) == 0 { + continue + } + + resolved = append(resolved, ttmlResolvedMetadataLine{ + order: ref.order, + seq: entry.seq, + line: line, + }) + } + + if len(resolved) == 0 { + continue + } + + sort.SliceStable(resolved, func(i, j int) bool { + if resolved[i].order != resolved[j].order { + return resolved[i].order < resolved[j].order + } + return resolved[i].seq < resolved[j].seq + }) + + lines := make([]model.Line, len(resolved)) + for i := range resolved { + lines[i] = resolved[i].line + } + + res = append(res, model.Lyrics{ + Kind: kind, + Lang: lang, + Line: lines, + Synced: linesAreSynced(lines), + }) + } + + return res +} + +func (p *ttmlParser) addMainLine(lang string, lineKey string, line model.Line) { + lang = normalizeTTMLLang(lang) + if _, ok := p.mainLinesByLang[lang]; !ok { + p.mainLangOrder = append(p.mainLangOrder, lang) + } + p.mainLinesByLang[lang] = append(p.mainLinesByLang[lang], line) + + lineKey = 
strings.TrimSpace(lineKey) + if lineKey != "" { + if _, exists := p.mainLineRefsByKey[lineKey]; !exists { + p.mainLineRefsByKey[lineKey] = ttmlLineRef{ + order: p.mainLineOrder, + line: line, + } + } + } + p.mainLineOrder++ +} + +func (p *ttmlParser) addMetadataEntry(kind string, lang string, entry ttmlMetadataEntry) { + lang = normalizeTTMLLang(lang) + entry.seq = p.metadataSeq + p.metadataSeq++ + + switch kind { + case ttmlLyricKindTranslation: + if _, ok := p.translationEntriesByLg[lang]; !ok { + p.translationLangOrder = append(p.translationLangOrder, lang) + } + p.translationEntriesByLg[lang] = append(p.translationEntriesByLg[lang], entry) + case ttmlLyricKindPronunciation: + if _, ok := p.pronunciationEntriesByLg[lang]; !ok { + p.pronunciationLangOrder = append(p.pronunciationLangOrder, lang) + } + p.pronunciationEntriesByLg[lang] = append(p.pronunciationEntriesByLg[lang], entry) + } +} + +func (p *ttmlParser) childContext(attrs []xml.Attr, parent ttmlTimingContext) ttmlTimingContext { + ctx := parent + + if lang, ok := attrValue(attrs, "lang"); ok { + ctx.lang = normalizeTTMLLang(lang) + } + if role, ok := attrValue(attrs, "role"); ok { + role = strings.TrimSpace(role) + if role != "" { + if ctx.role == "" { + ctx.role = role + } else if !strings.Contains(ctx.role, role) { + ctx.role = ctx.role + " " + role + } + } + } + + beginExpr, hasBegin := attrValue(attrs, "begin") + endExpr, hasEnd := attrValue(attrs, "end") + durExpr, hasDur := attrValue(attrs, "dur") + + if hasBegin { + begin, kind, ok := parseTTMLTimeExpression(beginExpr, p.params) + if !ok { + ctx.invalid = true + return ctx + } + + base := int64(0) + if parent.hasBegin { + base = parent.begin + } + ctx.begin = resolveTTMLTime(begin, kind, base, parent) + ctx.hasBegin = true + } else { + ctx.begin = parent.begin + ctx.hasBegin = parent.hasBegin + } + + var calculatedEnd int64 + calculatedHasEnd := false + + if hasEnd { + end, kind, ok := parseTTMLTimeExpression(endExpr, p.params) + if !ok { + 
ctx.invalid = true + return ctx + } + + base := ctx.begin + if !ctx.hasBegin { + base = parent.begin + } + calculatedEnd = resolveTTMLTime(end, kind, base, parent) + calculatedHasEnd = true + } + + if hasDur { + dur, ok := parseTTMLDurationExpression(durExpr, p.params) + if !ok { + ctx.invalid = true + return ctx + } + if ctx.hasBegin { + durEnd := ctx.begin + dur + if !calculatedHasEnd || durEnd < calculatedEnd { + calculatedEnd = durEnd + calculatedHasEnd = true + } + } + } + + if !calculatedHasEnd && parent.hasEnd { + calculatedEnd = parent.end + calculatedHasEnd = true + } + + ctx.end = calculatedEnd + ctx.hasEnd = calculatedHasEnd + return ctx +} + +func (p *ttmlParser) updateTimingParams(attrs []xml.Attr) { + frameRate := p.params.frameRate + if value, ok := attrValue(attrs, "frameRate"); ok { + if parsed, err := strconv.ParseFloat(value, 64); err == nil && parsed > 0 { + frameRate = parsed + } + } + + if value, ok := attrValue(attrs, "frameRateMultiplier"); ok { + parts := strings.Fields(value) + if len(parts) == 2 { + numerator, errA := strconv.ParseFloat(parts[0], 64) + denominator, errB := strconv.ParseFloat(parts[1], 64) + if errA == nil && errB == nil && denominator > 0 { + frameRate = frameRate * (numerator / denominator) + } + } + } + + subFrameRate := p.params.subFrameRate + if value, ok := attrValue(attrs, "subFrameRate"); ok { + if parsed, err := strconv.ParseFloat(value, 64); err == nil && parsed > 0 { + subFrameRate = parsed + } + } + + tickRate := p.params.tickRate + if value, ok := attrValue(attrs, "tickRate"); ok { + if parsed, err := strconv.ParseFloat(value, 64); err == nil && parsed > 0 { + tickRate = parsed + } + } + + p.params.frameRate = max(frameRate, defaultTTMLFrameRate) + p.params.subFrameRate = max(subFrameRate, defaultTTMLSubFrameRate) + p.params.tickRate = max(tickRate, defaultTTMLTickRate) +} + +func parseTTMLDurationExpression(expr string, params ttmlTimingParams) (int64, bool) { + value, _, ok := parseTTMLTimeExpression(expr, 
params) + return value, ok +} + +func resolveTTMLTime(value int64, kind ttmlTimeKind, base int64, parent ttmlTimingContext) int64 { + switch kind { + case ttmlTimeAbsolute: + return value + case ttmlTimeOffset: + return base + value + case ttmlTimeAmbiguous: + absolute := value + offset := base + value + + // No parent timing context → no reference frame for offsets. + // Prefer absolute when offset differs (i.e., base > 0). + if !parent.hasBegin && !parent.hasEnd && base != 0 { + return absolute + } + + if parent.hasBegin && parent.hasEnd { + absoluteInParent := absolute >= parent.begin && absolute <= parent.end + offsetInParent := offset >= parent.begin && offset <= parent.end + if absoluteInParent && !offsetInParent { + return absolute + } + if offsetInParent && !absoluteInParent { + return offset + } + } + + if parent.hasBegin { + if absolute < parent.begin && offset >= parent.begin { + return offset + } + if absolute >= parent.begin && offset > absolute { + return absolute + } + } + return offset + default: + return base + value + } +} + +func parseTTMLTimeExpression(expr string, params ttmlTimingParams) (int64, ttmlTimeKind, bool) { + expr = strings.TrimSpace(expr) + if expr == "" { + return 0, ttmlTimeOffset, false + } + + lower := strings.ToLower(expr) + if strings.Contains(lower, "wallclock(") || + strings.Contains(lower, ".begin") || + strings.Contains(lower, ".end") { + log.Warn("Unsupported TTML time expression", "value", expr) + return 0, ttmlTimeOffset, false + } + + // Best-effort support for non-standard TTML seen in the wild where a + // bare decimal value is used (implicitly seconds), e.g. "0.170". 
+ if value, err := strconv.ParseFloat(lower, 64); err == nil && value >= 0 { + return int64(math.Round(value * 1000)), ttmlTimeAmbiguous, true + } + + if matches := offsetTimeRegex.FindStringSubmatch(lower); len(matches) == 3 { + value, err := strconv.ParseFloat(matches[1], 64) + if err != nil { + return 0, ttmlTimeOffset, false + } + + unit := matches[2] + seconds := 0.0 + switch unit { + case "h": + seconds = value * 60 * 60 + case "m": + seconds = value * 60 + case "s": + seconds = value + case "ms": + seconds = value / 1000 + case "f": + seconds = value / params.frameRate + case "t": + seconds = value / params.tickRate + default: + return 0, ttmlTimeOffset, false + } + + return int64(math.Round(seconds * 1000)), ttmlTimeOffset, true + } + + colonCount := strings.Count(expr, ":") + switch colonCount { + case 1, 2: + clockMs, ok := parseTTMLClockTime(expr) + if !ok { + return 0, ttmlTimeAbsolute, false + } + return clockMs, ttmlTimeAbsolute, true + case 3: + framesMs, ok := parseTTMLFrameTime(expr, params) + if !ok { + return 0, ttmlTimeAbsolute, false + } + return framesMs, ttmlTimeAbsolute, true + default: + log.Warn("Unsupported TTML time expression", "value", expr) + return 0, ttmlTimeOffset, false + } +} + +func parseTTMLClockTime(value string) (int64, bool) { + parts := strings.Split(value, ":") + if len(parts) != 2 && len(parts) != 3 { + return 0, false + } + + hours := int64(0) + minutesIdx := 0 + if len(parts) == 3 { + h, err := strconv.ParseInt(parts[0], 10, 64) + if err != nil { + return 0, false + } + hours = h + minutesIdx = 1 + } + + minutes, err := strconv.ParseInt(parts[minutesIdx], 10, 64) + if err != nil { + return 0, false + } + + seconds, err := strconv.ParseFloat(parts[minutesIdx+1], 64) + if err != nil { + return 0, false + } + + totalSeconds := float64(hours*60*60+minutes*60) + seconds + return int64(math.Round(totalSeconds * 1000)), true +} + +func parseTTMLFrameTime(value string, params ttmlTimingParams) (int64, bool) { + parts := 
strings.Split(value, ":") + if len(parts) != 4 { + return 0, false + } + + hours, err := strconv.ParseInt(parts[0], 10, 64) + if err != nil { + return 0, false + } + + minutes, err := strconv.ParseInt(parts[1], 10, 64) + if err != nil { + return 0, false + } + + seconds, err := strconv.ParseInt(parts[2], 10, 64) + if err != nil { + return 0, false + } + + frameParts := strings.SplitN(parts[3], ".", 2) + frames, err := strconv.ParseFloat(frameParts[0], 64) + if err != nil { + return 0, false + } + + subFrames := 0.0 + if len(frameParts) == 2 { + subFrames, err = strconv.ParseFloat(frameParts[1], 64) + if err != nil { + return 0, false + } + } + + totalSeconds := float64(hours*60*60 + minutes*60 + seconds) + totalSeconds += frames / params.frameRate + totalSeconds += subFrames / (params.subFrameRate * params.frameRate) + + return int64(math.Round(totalSeconds * 1000)), true +} + +func attrValue(attrs []xml.Attr, key string) (string, bool) { + for _, attr := range attrs { + if strings.EqualFold(attr.Name.Local, key) { + return strings.TrimSpace(attr.Value), true + } + } + return "", false +} + +func normalizeTTMLLang(lang string) string { + lang = strings.ToLower(strings.TrimSpace(lang)) + if lang == "" { + return "xxx" + } + return lang +} + +func sanitizeTTMLText(raw string) string { + raw = str.SanitizeText(raw) + raw = strings.ReplaceAll(raw, "\r\n", "\n") + raw = strings.ReplaceAll(raw, "\r", "\n") + + lines := strings.Split(raw, "\n") + for i := range lines { + lines[i] = strings.TrimSpace(lines[i]) + } + return strings.TrimSpace(strings.Join(lines, "\n")) +} + +func linesAreSynced(lines []model.Line) bool { + for i := range lines { + if lines[i].Start != nil { + return true + } + for j := range lines[i].Token { + if lines[i].Token[j].Start != nil { + return true + } + } + } + return false +} + +func hydrateLineTimingFromTokens(line model.Line) model.Line { + if len(line.Token) == 0 { + return line + } + + var earliestStart *int64 + var latestEnd *int64 + for i 
:= range line.Token { + token := line.Token[i] + if token.Start != nil { + if earliestStart == nil || *token.Start < *earliestStart { + v := *token.Start + earliestStart = &v + } + } + + candidateEnd := token.End + if candidateEnd == nil { + candidateEnd = token.Start + } + if candidateEnd != nil { + if latestEnd == nil || *candidateEnd > *latestEnd { + v := *candidateEnd + latestEnd = &v + } + } + } + + if line.Start == nil && earliestStart != nil { + v := *earliestStart + line.Start = &v + } + if line.End == nil && latestEnd != nil { + v := *latestEnd + line.End = &v + } + return line +} + +func max(v float64, fallback float64) float64 { + if v <= 0 { + return fallback + } + return v +} diff --git a/core/lyrics/ttml_test.go b/core/lyrics/ttml_test.go new file mode 100644 index 000000000..12270c27d --- /dev/null +++ b/core/lyrics/ttml_test.go @@ -0,0 +1,398 @@ +package lyrics + +import ( + "testing" + + "github.com/navidrome/navidrome/model" +) + +func TestParseTTML_MultiLanguageAndTiming(t *testing.T) { + content := []byte(` + + +
+

Line one

+

Line two
with break

+
+
+

Linha

+
+ +
`) + + list, err := parseTTML(content) + if err != nil { + t.Fatalf("parseTTML returned error: %v", err) + } + if len(list) != 2 { + t.Fatalf("expected 2 lyric tracks, got %d", len(list)) + } + + eng := list[0] + if eng.Lang != "eng" { + t.Fatalf("expected first track language 'eng', got %q", eng.Lang) + } + if !eng.Synced { + t.Fatal("expected first track to be synced") + } + assertTimedLine(t, eng.Line[0], 3000, "Line one") + assertTimedLine(t, eng.Line[1], 4517, "Line two\nwith break") + + por := list[1] + if por.Lang != "por" { + t.Fatalf("expected second track language 'por', got %q", por.Lang) + } + assertTimedLine(t, por.Line[0], 4500, "Linha") +} + +func TestParseTTML_UnsupportedCueSkipped(t *testing.T) { + content := []byte(` + + +
+

Skip me

+

Keep me

+
+ +
`) + + list, err := parseTTML(content) + if err != nil { + t.Fatalf("parseTTML returned error: %v", err) + } + if len(list) != 1 { + t.Fatalf("expected 1 lyric track, got %d", len(list)) + } + if len(list[0].Line) != 1 { + t.Fatalf("expected 1 line in lyric track, got %d", len(list[0].Line)) + } + assertTimedLine(t, list[0].Line[0], 1000, "Keep me") +} + +func TestParseTTML_BeginEndDurWithInheritance(t *testing.T) { + content := []byte(` + + +
+

First line

+

Second line

+
+ +
`) + + list, err := parseTTML(content) + if err != nil { + t.Fatalf("parseTTML returned error: %v", err) + } + if len(list) != 1 { + t.Fatalf("expected 1 lyric track, got %d", len(list)) + } + if list[0].Lang != "eng" { + t.Fatalf("expected language 'eng', got %q", list[0].Lang) + } + if len(list[0].Line) != 2 { + t.Fatalf("expected 2 lines, got %d", len(list[0].Line)) + } + assertTimedLine(t, list[0].Line[0], 16000, "First line") + assertTimedLine(t, list[0].Line[1], 18000, "Second line") +} + +func TestParseTTML_NonStandardBareSecondOffsets(t *testing.T) { + content := []byte(` + + +
+

First line

+

Second line

+
+ +
`) + + list, err := parseTTML(content) + if err != nil { + t.Fatalf("parseTTML returned error: %v", err) + } + if len(list) != 1 { + t.Fatalf("expected 1 lyric track, got %d", len(list)) + } + if len(list[0].Line) != 2 { + t.Fatalf("expected 2 lines, got %d", len(list[0].Line)) + } + assertTimedLine(t, list[0].Line[0], 10170, "First line") + assertTimedLine(t, list[0].Line[1], 13710, "Second line") +} + +func TestParseTTML_WordTimingTokens(t *testing.T) { + content := []byte(` + + +
+

+ Hello + echo +

+
+ +
`) + + list, err := parseTTML(content) + if err != nil { + t.Fatalf("parseTTML returned error: %v", err) + } + if len(list) != 1 { + t.Fatalf("expected 1 lyric track, got %d", len(list)) + } + if len(list[0].Line) != 1 { + t.Fatalf("expected 1 line, got %d", len(list[0].Line)) + } + + line := list[0].Line[0] + assertTimedLine(t, line, 1000, "Hello\necho") + if line.End == nil || *line.End != 3000 { + t.Fatalf("expected line end 3000, got %v", line.End) + } + if len(line.Token) != 3 { + t.Fatalf("expected 3 timed tokens, got %d", len(line.Token)) + } + + assertToken(t, line.Token[0], 1000, 1400, "He", "") + assertToken(t, line.Token[1], 1400, 1800, "llo", "") + assertToken(t, line.Token[2], 2000, 2500, "echo", "x-bg") +} + +func TestParseTTML_AmbiguousDecimalTimingPrefersAbsoluteWhenInsideParentWindow(t *testing.T) { + content := []byte(` + + +
+

+ go + go +

+
+ +
`) + + list, err := parseTTML(content) + if err != nil { + t.Fatalf("parseTTML returned error: %v", err) + } + if len(list) != 1 || len(list[0].Line) != 1 { + t.Fatalf("expected one parsed lyric line, got %#v", list) + } + + line := list[0].Line[0] + assertTimedLine(t, line, 43444, "go\ngo") + if line.End == nil || *line.End != 45570 { + t.Fatalf("expected line end 45570, got %v", line.End) + } + if len(line.Token) != 2 { + t.Fatalf("expected 2 timed tokens, got %d", len(line.Token)) + } + assertToken(t, line.Token[0], 43444, 43716, "go", "") + assertToken(t, line.Token[1], 43716, 43887, "go", "") +} + +func TestParseTTML_UnsyncedFallback(t *testing.T) { + content := []byte(` + + +
+

No timing here

+
+ +
`) + + list, err := parseTTML(content) + if err != nil { + t.Fatalf("parseTTML returned error: %v", err) + } + if len(list) != 1 { + t.Fatalf("expected 1 lyric track, got %d", len(list)) + } + if list[0].Lang != "xxx" { + t.Fatalf("expected default language 'xxx', got %q", list[0].Lang) + } + if list[0].Synced { + t.Fatal("expected lyric track to be unsynced") + } + if len(list[0].Line) != 1 { + t.Fatalf("expected 1 line, got %d", len(list[0].Line)) + } + if list[0].Line[0].Start != nil { + t.Fatalf("expected line start to be nil, got %v", *list[0].Line[0].Start) + } + if list[0].Line[0].Value != "No timing here" { + t.Fatalf("expected line value %q, got %q", "No timing here", list[0].Line[0].Value) + } +} + +func TestParseTTML_MetadataTracksByKey(t *testing.T) { + content := []byte(` + + + + + + + Hola + Skip me + + + + + konni + + + + + + +
+

こんにちは

+

こんばんは

+
+ +
`) + + list, err := parseTTML(content) + if err != nil { + t.Fatalf("parseTTML returned error: %v", err) + } + if len(list) != 3 { + t.Fatalf("expected 3 lyric tracks, got %d", len(list)) + } + + main := list[0] + if main.Kind != "main" { + t.Fatalf("expected main track kind %q, got %q", "main", main.Kind) + } + if main.Lang != "ja" { + t.Fatalf("expected main track language %q, got %q", "ja", main.Lang) + } + if len(main.Line) != 2 { + t.Fatalf("expected 2 lines in main track, got %d", len(main.Line)) + } + + translation := list[1] + if translation.Kind != "translation" { + t.Fatalf("expected translation kind %q, got %q", "translation", translation.Kind) + } + if translation.Lang != "es" { + t.Fatalf("expected translation language %q, got %q", "es", translation.Lang) + } + if len(translation.Line) != 1 { + t.Fatalf("expected 1 translation line, got %d", len(translation.Line)) + } + assertTimedLine(t, translation.Line[0], 1000, "Hola") + if translation.Line[0].End == nil || *translation.Line[0].End != 1500 { + t.Fatalf("expected translation line end %d, got %v", 1500, translation.Line[0].End) + } + + pronunciation := list[2] + if pronunciation.Kind != "pronunciation" { + t.Fatalf("expected pronunciation kind %q, got %q", "pronunciation", pronunciation.Kind) + } + if pronunciation.Lang != "ja-latn" { + t.Fatalf("expected pronunciation language %q, got %q", "ja-latn", pronunciation.Lang) + } + if len(pronunciation.Line) != 1 { + t.Fatalf("expected 1 pronunciation line, got %d", len(pronunciation.Line)) + } + assertTimedLine(t, pronunciation.Line[0], 2000, "konni") + if pronunciation.Line[0].End == nil || *pronunciation.Line[0].End != 2600 { + t.Fatalf("expected pronunciation line end %d, got %v", 2600, pronunciation.Line[0].End) + } + if len(pronunciation.Line[0].Token) != 2 { + t.Fatalf("expected 2 pronunciation tokens, got %d", len(pronunciation.Line[0].Token)) + } + assertToken(t, pronunciation.Line[0].Token[0], 2000, 2300, "ko", "") + assertToken(t, 
pronunciation.Line[0].Token[1], 2300, 2600, "nni", "") +} + +func TestParseTTML_PronunciationBareDecimalEndTimes(t *testing.T) { + content := []byte(` + + + + + + + I woke up + + + + + + +
+

起きた

+
+ +
`) + + list, err := parseTTML(content) + if err != nil { + t.Fatalf("parseTTML returned error: %v", err) + } + + var pronunciation *model.Lyrics + for i := range list { + if list[i].Kind == "pronunciation" { + pronunciation = &list[i] + break + } + } + if pronunciation == nil { + t.Fatal("expected a pronunciation track") + } + if len(pronunciation.Line) != 1 { + t.Fatalf("expected 1 pronunciation line, got %d", len(pronunciation.Line)) + } + + line := pronunciation.Line[0] + assertTimedLine(t, line, 2747, "I woke up") + if len(line.Token) != 3 { + t.Fatalf("expected 3 tokens, got %d", len(line.Token)) + } + assertToken(t, line.Token[0], 2747, 3018, "I", "") + assertToken(t, line.Token[1], 3018, 3179, "woke", "") + assertToken(t, line.Token[2], 3179, 3582, "up", "") +} + +func assertTimedLine(t *testing.T, line model.Line, expectedStart int64, expectedValue string) { + t.Helper() + + if line.Start == nil { + t.Fatal("expected line start to be set, got nil") + } + if *line.Start != expectedStart { + t.Fatalf("expected line start %d, got %d", expectedStart, *line.Start) + } + if line.Value != expectedValue { + t.Fatalf("expected line value %q, got %q", expectedValue, line.Value) + } +} + +func assertToken(t *testing.T, token model.Token, expectedStart int64, expectedEnd int64, expectedValue string, expectedRole string) { + t.Helper() + + if token.Start == nil { + t.Fatal("expected token start to be set, got nil") + } + if *token.Start != expectedStart { + t.Fatalf("expected token start %d, got %d", expectedStart, *token.Start) + } + if token.End == nil { + t.Fatal("expected token end to be set, got nil") + } + if *token.End != expectedEnd { + t.Fatalf("expected token end %d, got %d", expectedEnd, *token.End) + } + if token.Value != expectedValue { + t.Fatalf("expected token value %q, got %q", expectedValue, token.Value) + } + if token.Role != expectedRole { + t.Fatalf("expected token role %q, got %q", expectedRole, token.Role) + } +} diff --git a/model/lyrics.go 
b/model/lyrics.go index f75f3b11b..220eec7b5 100644 --- a/model/lyrics.go +++ b/model/lyrics.go @@ -11,14 +11,24 @@ import ( "github.com/navidrome/navidrome/utils/str" ) -type Line struct { +type Token struct { Start *int64 `structs:"start,omitempty" json:"start,omitempty"` + End *int64 `structs:"end,omitempty" json:"end,omitempty"` Value string `structs:"value" json:"value"` + Role string `structs:"role,omitempty" json:"role,omitempty"` +} + +type Line struct { + Start *int64 `structs:"start,omitempty" json:"start,omitempty"` + End *int64 `structs:"end,omitempty" json:"end,omitempty"` + Value string `structs:"value" json:"value"` + Token []Token `structs:"token,omitempty" json:"token,omitempty"` } type Lyrics struct { DisplayArtist string `structs:"displayArtist,omitempty" json:"displayArtist,omitempty"` DisplayTitle string `structs:"displayTitle,omitempty" json:"displayTitle,omitempty"` + Kind string `structs:"kind,omitempty" json:"kind,omitempty"` Lang string `structs:"lang" json:"lang"` Line []Line `structs:"line" json:"line"` Offset *int64 `structs:"offset,omitempty" json:"offset,omitempty"` diff --git a/server/subsonic/helpers.go b/server/subsonic/helpers.go index 74d57ade4..3b9412fb1 100644 --- a/server/subsonic/helpers.go +++ b/server/subsonic/helpers.go @@ -495,19 +495,47 @@ func mapExplicitStatus(explicitStatus string) string { func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics) responses.StructuredLyric { lines := make([]responses.Line, len(lyrics.Line)) + tokenLines := make([]responses.TokenLine, 0, len(lyrics.Line)) for i, line := range lyrics.Line { lines[i] = responses.Line{ Start: line.Start, Value: line.Value, } + if len(line.Token) == 0 { + continue + } + + tokens := make([]responses.LyricToken, len(line.Token)) + for j, token := range line.Token { + tokens[j] = responses.LyricToken{ + Start: token.Start, + End: token.End, + Value: token.Value, + Role: token.Role, + } + } + tokenLines = append(tokenLines, responses.TokenLine{ + 
Index: int32(i), + Start: line.Start, + End: line.End, + Value: line.Value, + Token: tokens, + }) + } + + kind := strings.TrimSpace(lyrics.Kind) + if kind == "" { + kind = "main" } structured := responses.StructuredLyric{ DisplayArtist: lyrics.DisplayArtist, DisplayTitle: lyrics.DisplayTitle, + Kind: kind, Lang: lyrics.Lang, Line: lines, + TokenLine: tokenLines, Offset: lyrics.Offset, Synced: lyrics.Synced, } diff --git a/server/subsonic/media_retrieval.go b/server/subsonic/media_retrieval.go index 3faae1650..963db067c 100644 --- a/server/subsonic/media_retrieval.go +++ b/server/subsonic/media_retrieval.go @@ -98,7 +98,9 @@ func (api *Router) GetLyrics(r *http.Request) (*responses.Subsonic, error) { response := newResponse() lyricsResponse := responses.Lyrics{} response.Lyrics = &lyricsResponse - mediaFiles, err := api.ds.MediaFile(r.Context()).GetAll(filter.SongsByArtistTitleWithLyricsFirst(artist, title)) + opts := filter.SongsByArtistTitleWithLyricsFirst(artist, title) + opts.Max = 0 + mediaFiles, err := api.ds.MediaFile(r.Context()).GetAll(opts) if err != nil { return nil, err @@ -108,25 +110,26 @@ func (api *Router) GetLyrics(r *http.Request) (*responses.Subsonic, error) { return response, nil } - structuredLyrics, err := api.lyrics.GetLyrics(r.Context(), &mediaFiles[0]) - if err != nil { - return nil, err + for i := range mediaFiles { + structuredLyrics, err := api.lyrics.GetLyrics(r.Context(), &mediaFiles[i]) + if err != nil { + return nil, err + } + if len(structuredLyrics) == 0 { + continue + } + + lyricsResponse.Artist = artist + lyricsResponse.Title = title + + var lyricsText strings.Builder + for _, line := range structuredLyrics[0].Line { + lyricsText.WriteString(line.Value + "\n") + } + lyricsResponse.Value = lyricsText.String() + break } - if len(structuredLyrics) == 0 { - return response, nil - } - - lyricsResponse.Artist = artist - lyricsResponse.Title = title - - var lyricsText strings.Builder - for _, line := range structuredLyrics[0].Line { - 
lyricsText.WriteString(line.Value + "\n") - } - - lyricsResponse.Value = lyricsText.String() - return response, nil } diff --git a/server/subsonic/media_retrieval_test.go b/server/subsonic/media_retrieval_test.go index 7f64fb47f..6c52d38bc 100644 --- a/server/subsonic/media_retrieval_test.go +++ b/server/subsonic/media_retrieval_test.go @@ -186,6 +186,36 @@ var _ = Describe("MediaRetrievalController", func() { Expect(response.Lyrics.Title).To(Equal("Never Gonna Give You Up")) Expect(response.Lyrics.Value).To(Equal("We're no strangers to love\nYou know the rules and so do I\n")) }) + + It("should continue searching candidates for sidecar lyrics", func() { + conf.Server.LyricsPriority = ".ttml,embedded" + r := newGetRequest("artist=Rick+Astley", "title=Never+Gonna+Give+You+Up") + baseTime := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC) + mockRepo.SetData(model.MediaFiles{ + { + ID: "1", + Path: "tests/fixtures/01 Invisible (RED) Edit Version.mp3", + Artist: "Rick Astley", + Title: "Never Gonna Give You Up", + Lyrics: "[]", + UpdatedAt: baseTime.Add(2 * time.Hour), // Newer, but no TTML sidecar + }, + { + ID: "2", + Path: "tests/fixtures/test.mp3", + Artist: "Rick Astley", + Title: "Never Gonna Give You Up", + Lyrics: "[]", + UpdatedAt: baseTime.Add(1 * time.Hour), // Older, but has TTML sidecar + }, + }) + + response, err := router.GetLyrics(r) + Expect(err).ToNot(HaveOccurred()) + Expect(response.Lyrics.Artist).To(Equal("Rick Astley")) + Expect(response.Lyrics.Title).To(Equal("Never Gonna Give You Up")) + Expect(response.Lyrics.Value).To(Equal("We're no strangers to love\nYou know the rules and so do I\n")) + }) }) Describe("GetLyricsBySongId", func() { @@ -202,6 +232,11 @@ var _ = Describe("MediaRetrievalController", func() { Expect(realLyric.DisplayArtist).To(Equal(expectedLyric.DisplayArtist)) Expect(realLyric.DisplayTitle).To(Equal(expectedLyric.DisplayTitle)) + expectedKind := expectedLyric.Kind + if expectedKind == "" { + expectedKind = "main" + } + 
Expect(realLyric.Kind).To(Equal(expectedKind)) Expect(realLyric.Lang).To(Equal(expectedLyric.Lang)) Expect(realLyric.Synced).To(Equal(expectedLyric.Synced)) @@ -222,6 +257,40 @@ var _ = Describe("MediaRetrievalController", func() { Expect(*realLine.Start).To(Equal(*expectedLine.Start)) } } + + Expect(realLyric.TokenLine).To(HaveLen(len(expectedLyric.TokenLine))) + for j, realTokenLine := range realLyric.TokenLine { + expectedTokenLine := expectedLyric.TokenLine[j] + Expect(realTokenLine.Index).To(Equal(expectedTokenLine.Index)) + Expect(realTokenLine.Value).To(Equal(expectedTokenLine.Value)) + if expectedTokenLine.Start == nil { + Expect(realTokenLine.Start).To(BeNil()) + } else { + Expect(*realTokenLine.Start).To(Equal(*expectedTokenLine.Start)) + } + if expectedTokenLine.End == nil { + Expect(realTokenLine.End).To(BeNil()) + } else { + Expect(*realTokenLine.End).To(Equal(*expectedTokenLine.End)) + } + + Expect(realTokenLine.Token).To(HaveLen(len(expectedTokenLine.Token))) + for k, realToken := range realTokenLine.Token { + expectedToken := expectedTokenLine.Token[k] + Expect(realToken.Value).To(Equal(expectedToken.Value)) + Expect(realToken.Role).To(Equal(expectedToken.Role)) + if expectedToken.Start == nil { + Expect(realToken.Start).To(BeNil()) + } else { + Expect(*realToken.Start).To(Equal(*expectedToken.Start)) + } + if expectedToken.End == nil { + Expect(realToken.End).To(BeNil()) + } else { + Expect(*realToken.End).To(Equal(*expectedToken.End)) + } + } + } } } @@ -323,6 +392,238 @@ var _ = Describe("MediaRetrievalController", func() { }, }) }) + + It("should return multilingual TTML sidecar lyrics", func() { + conf.Server.LyricsPriority = ".ttml,embedded" + r := newGetRequest("id=1") + + mockRepo.SetData(model.MediaFiles{ + { + ID: "1", + Path: "tests/fixtures/test.mp3", + Artist: "Rick Astley", + Title: "Never Gonna Give You Up", + Lyrics: "[]", + }, + }) + + response, err := router.GetLyricsBySongId(r) + Expect(err).ToNot(HaveOccurred()) + + porTime := 
int64(18800) + ttmlTime := int64(22800) + compareResponses(response.LyricsList, responses.LyricsList{ + StructuredLyrics: responses.StructuredLyrics{ + { + DisplayArtist: "Rick Astley", + DisplayTitle: "Never Gonna Give You Up", + Lang: "eng", + Synced: true, + Line: []responses.Line{ + { + Start: &times[0], + Value: "We're no strangers to love", + }, + { + Start: &ttmlTime, + Value: "You know the rules and so do I", + }, + }, + }, + { + DisplayArtist: "Rick Astley", + DisplayTitle: "Never Gonna Give You Up", + Lang: "por", + Synced: true, + Line: []responses.Line{ + { + Start: &porTime, + Value: "Nao somos estranhos ao amor", + }, + }, + }, + }, + }) + }) + + It("should return metadata-linked translation and pronunciation tracks from TTML", func() { + conf.Server.LyricsPriority = ".ttml,embedded" + r := newGetRequest("id=1") + + mockRepo.SetData(model.MediaFiles{ + { + ID: "1", + Path: "tests/fixtures/test-metadata.mp3", + Artist: "Rick Astley", + Title: "Never Gonna Give You Up", + Lyrics: "[]", + }, + }) + + response, err := router.GetLyricsBySongId(r) + Expect(err).ToNot(HaveOccurred()) + + mainStartA := int64(1000) + mainStartB := int64(2000) + tokenStartA := int64(2000) + tokenEndA := int64(2300) + tokenStartB := int64(2300) + tokenEndB := int64(2600) + compareResponses(response.LyricsList, responses.LyricsList{ + StructuredLyrics: responses.StructuredLyrics{ + { + DisplayArtist: "Rick Astley", + DisplayTitle: "Never Gonna Give You Up", + Kind: "main", + Lang: "ja", + Synced: true, + Line: []responses.Line{ + { + Start: &mainStartA, + Value: "こんにちは", + }, + { + Start: &mainStartB, + Value: "こんばんは", + }, + }, + }, + { + DisplayArtist: "Rick Astley", + DisplayTitle: "Never Gonna Give You Up", + Kind: "translation", + Lang: "es", + Synced: true, + Line: []responses.Line{ + { + Start: &mainStartA, + Value: "Hola", + }, + }, + }, + { + DisplayArtist: "Rick Astley", + DisplayTitle: "Never Gonna Give You Up", + Kind: "pronunciation", + Lang: "ja-latn", + Synced: true, +
Line: []responses.Line{ + { + Start: &mainStartB, + Value: "konni", + }, + }, + TokenLine: []responses.TokenLine{ + { + Index: 0, + Start: &mainStartB, + End: &tokenEndB, + Value: "konni", + Token: []responses.LyricToken{ + { + Start: &tokenStartA, + End: &tokenEndA, + Value: "ko", + }, + { + Start: &tokenStartB, + End: &tokenEndB, + Value: "nni", + }, + }, + }, + }, + }, + }, + }) + }) + + It("should return tokenized lines for songLyrics v2 clients", func() { + r := newGetRequest("id=1") + + lineStart := int64(1000) + lineEnd := int64(3000) + tokenStartA := int64(1000) + tokenEndA := int64(1400) + tokenStartB := int64(2000) + tokenEndB := int64(2500) + lyricsJson, err := json.Marshal(model.LyricList{ + { + Lang: "eng", + Synced: true, + Line: []model.Line{ + { + Start: &lineStart, + End: &lineEnd, + Value: "Hello echo", + Token: []model.Token{ + { + Start: &tokenStartA, + End: &tokenEndA, + Value: "Hello", + }, + { + Start: &tokenStartB, + End: &tokenEndB, + Value: "echo", + Role: "x-bg", + }, + }, + }, + }, + }, + }) + Expect(err).ToNot(HaveOccurred()) + + mockRepo.SetData(model.MediaFiles{ + { + ID: "1", + Artist: "Rick Astley", + Title: "Never Gonna Give You Up", + Lyrics: string(lyricsJson), + }, + }) + + response, err := router.GetLyricsBySongId(r) + Expect(err).ToNot(HaveOccurred()) + compareResponses(response.LyricsList, responses.LyricsList{ + StructuredLyrics: responses.StructuredLyrics{ + { + DisplayArtist: "Rick Astley", + DisplayTitle: "Never Gonna Give You Up", + Lang: "eng", + Synced: true, + Line: []responses.Line{ + { + Start: &lineStart, + Value: "Hello echo", + }, + }, + TokenLine: []responses.TokenLine{ + { + Index: 0, + Start: &lineStart, + End: &lineEnd, + Value: "Hello echo", + Token: []responses.LyricToken{ + { + Start: &tokenStartA, + End: &tokenEndA, + Value: "Hello", + }, + { + Start: &tokenStartB, + End: &tokenEndB, + Value: "echo", + Role: "x-bg", + }, + }, + }, + }, + }, + }, + }) + }) }) }) diff --git a/server/subsonic/opensubsonic.go 
b/server/subsonic/opensubsonic.go index 353cf1077..f0917baa2 100644 --- a/server/subsonic/opensubsonic.go +++ b/server/subsonic/opensubsonic.go @@ -11,7 +11,7 @@ func (api *Router) GetOpenSubsonicExtensions(_ *http.Request) (*responses.Subson response.OpenSubsonicExtensions = &responses.OpenSubsonicExtensions{ {Name: "transcodeOffset", Versions: []int32{1}}, {Name: "formPost", Versions: []int32{1}}, - {Name: "songLyrics", Versions: []int32{1}}, + {Name: "songLyrics", Versions: []int32{1, 2}}, {Name: "indexBasedQueue", Versions: []int32{1}}, {Name: "transcoding", Versions: []int32{1}}, } diff --git a/server/subsonic/opensubsonic_test.go b/server/subsonic/opensubsonic_test.go index 92d1c3e84..068030ec8 100644 --- a/server/subsonic/opensubsonic_test.go +++ b/server/subsonic/opensubsonic_test.go @@ -38,7 +38,7 @@ var _ = Describe("GetOpenSubsonicExtensions", func() { HaveLen(5), ContainElement(responses.OpenSubsonicExtension{Name: "transcodeOffset", Versions: []int32{1}}), ContainElement(responses.OpenSubsonicExtension{Name: "formPost", Versions: []int32{1}}), - ContainElement(responses.OpenSubsonicExtension{Name: "songLyrics", Versions: []int32{1}}), + ContainElement(responses.OpenSubsonicExtension{Name: "songLyrics", Versions: []int32{1, 2}}), ContainElement(responses.OpenSubsonicExtension{Name: "indexBasedQueue", Versions: []int32{1}}), ContainElement(responses.OpenSubsonicExtension{Name: "transcoding", Versions: []int32{1}}), )) diff --git a/server/subsonic/responses/responses.go b/server/subsonic/responses/responses.go index f0bb26f66..ff5ae0d3b 100644 --- a/server/subsonic/responses/responses.go +++ b/server/subsonic/responses/responses.go @@ -537,13 +537,30 @@ type Line struct { Value string `xml:",chardata" json:"value"` } +type LyricToken struct { + Start *int64 `xml:"start,attr,omitempty" json:"start,omitempty"` + End *int64 `xml:"end,attr,omitempty" json:"end,omitempty"` + Value string `xml:"value,attr" json:"value"` + Role string `xml:"role,attr,omitempty" 
json:"role,omitempty"` +} + +type TokenLine struct { + Index int32 `xml:"index,attr" json:"index"` + Start *int64 `xml:"start,attr,omitempty" json:"start,omitempty"` + End *int64 `xml:"end,attr,omitempty" json:"end,omitempty"` + Value string `xml:"value,attr,omitempty" json:"value,omitempty"` + Token []LyricToken `xml:"token,omitempty" json:"token,omitempty"` +} + type StructuredLyric struct { - DisplayArtist string `xml:"displayArtist,attr,omitempty" json:"displayArtist,omitempty"` - DisplayTitle string `xml:"displayTitle,attr,omitempty" json:"displayTitle,omitempty"` - Lang string `xml:"lang,attr" json:"lang"` - Line []Line `xml:"line" json:"line"` - Offset *int64 `xml:"offset,attr,omitempty" json:"offset,omitempty"` - Synced bool `xml:"synced,attr" json:"synced"` + DisplayArtist string `xml:"displayArtist,attr,omitempty" json:"displayArtist,omitempty"` + DisplayTitle string `xml:"displayTitle,attr,omitempty" json:"displayTitle,omitempty"` + Kind string `xml:"kind,attr,omitempty" json:"kind,omitempty"` + Lang string `xml:"lang,attr" json:"lang"` + Line []Line `xml:"line" json:"line"` + TokenLine []TokenLine `xml:"tokenLine,omitempty" json:"tokenLine,omitempty"` + Offset *int64 `xml:"offset,attr,omitempty" json:"offset,omitempty"` + Synced bool `xml:"synced,attr" json:"synced"` } type StructuredLyrics []StructuredLyric diff --git a/tests/fixtures/bom-test.ttml b/tests/fixtures/bom-test.ttml new file mode 100644 index 000000000..319ab1f07 --- /dev/null +++ b/tests/fixtures/bom-test.ttml @@ -0,0 +1,2 @@ + +

BOM test line

diff --git a/tests/fixtures/bom-utf16-test.ttml b/tests/fixtures/bom-utf16-test.ttml new file mode 100644 index 0000000000000000000000000000000000000000..a5621ef5d54ddd1a6a748046809f0ac7cf81ead1 GIT binary patch literal 414 zcmaKo;R=F45QOJjrDAFy+*oGX-)?$ZJ_4%3VF`Ruc_(Sm07EE;wB(%fl?J8dKcwxBxju2j!wj#8~$lHQ_` + + + + + + + Hola + + + + + konni + + + + + + +
+

こんにちは

+

こんばんは

+
+ +
diff --git a/tests/fixtures/test.ttml b/tests/fixtures/test.ttml new file mode 100644 index 000000000..a85673a1b --- /dev/null +++ b/tests/fixtures/test.ttml @@ -0,0 +1,12 @@ + + + +
+

We're no strangers to love

+

You know the rules and so do I

+
+
+

Nao somos estranhos ao amor

+
+ +
diff --git a/ui/src/actions/player.js b/ui/src/actions/player.js index 9056abeb6..f55102207 100644 --- a/ui/src/actions/player.js +++ b/ui/src/actions/player.js @@ -9,6 +9,7 @@ export const PLAYER_SET_VOLUME = 'PLAYER_SET_VOLUME' export const PLAYER_SET_MODE = 'PLAYER_SET_MODE' export const TRANSCODING_SET_PROFILE = 'TRANSCODING_SET_PROFILE' export const PLAYER_REFRESH_QUEUE = 'PLAYER_REFRESH_QUEUE' +export const PLAYER_UPDATE_LYRIC = 'PLAYER_UPDATE_LYRIC' export const setTrack = (data) => ({ type: PLAYER_SET_TRACK, @@ -114,3 +115,8 @@ export const refreshQueue = (resolvedUrls) => ({ type: PLAYER_REFRESH_QUEUE, data: resolvedUrls, }) + +export const updateQueueLyric = (trackId, lyric) => ({ + type: PLAYER_UPDATE_LYRIC, + data: { trackId, lyric }, +}) diff --git a/ui/src/audioplayer/KaraokeLyricsOverlay.jsx b/ui/src/audioplayer/KaraokeLyricsOverlay.jsx new file mode 100644 index 000000000..3814cbee6 --- /dev/null +++ b/ui/src/audioplayer/KaraokeLyricsOverlay.jsx @@ -0,0 +1,1228 @@ +import React, { + memo, + useCallback, + useEffect, + useMemo, + useRef, + useState, +} from 'react' +import clsx from 'clsx' +import Button from '@material-ui/core/Button' +import IconButton from '@material-ui/core/IconButton' +import Popover from '@material-ui/core/Popover' +import Slider from '@material-ui/core/Slider' +import Typography from '@material-ui/core/Typography' +import CloseIcon from '@material-ui/icons/Close' +import TuneIcon from '@material-ui/icons/Tune' +import { makeStyles } from '@material-ui/core/styles' +import { + buildKaraokeLines, + getActiveKaraokeState, + hasStructuredLyricContent, + resolveLayerLineForMain, + resolveKaraokeTokenWindow, +} from './lyrics' + +const KARAOKE_RENDER_LEAD_MS = 24 +const KARAOKE_CLOCK_DRIFT_RESET_MS = 140 +const KARAOKE_CLOCK_RESET_THRESHOLD_MS = 320 +const KARAOKE_MONOTONIC_JITTER_MS = 60 +const KARAOKE_RENDER_UPDATE_EPSILON_MS = 6 +const KARAOKE_WORD_SETTLE_MS = 96 +const KARAOKE_ANIMATION_MS = 150 +const KARAOKE_DEFAULT_HEIGHT_PX 
= 300 +const KARAOKE_MIN_HEIGHT_PX = 150 +const KARAOKE_MAX_HEIGHT_RATIO = 0.72 +const KARAOKE_MAX_HEIGHT_PX = 760 +const KARAOKE_CENTER_SPACER_RATIO = 0.5 +const KARAOKE_CENTER_SPACER_MIN_PX = 132 + +const TOKEN_DONE_ALPHA = 1 +const TOKEN_FUTURE_ALPHA = 0.34 +const TOKEN_ACTIVE_ALPHA = 1 +const TOKEN_WIPE_EDGE_PCT = 8 +const TOKEN_WIPE_GLOW_PCT = 16 + +const COLOR_PRESETS = [ + { key: 'white', label: 'White', value: 'rgba(255, 255, 255, 0.92)' }, + { key: 'blue', label: 'Blue', value: 'rgba(120, 160, 220, 0.75)' }, + { key: 'green', label: 'Green', value: 'rgba(100, 200, 130, 0.7)' }, + { key: 'pink', label: 'Pink', value: 'rgba(240, 140, 170, 0.75)' }, + { key: 'purple', label: 'Purple', value: 'rgba(180, 140, 240, 0.75)' }, + { key: 'orange', label: 'Orange', value: 'rgba(240, 180, 100, 0.75)' }, + { key: 'cyan', label: 'Cyan', value: 'rgba(100, 210, 220, 0.75)' }, + { key: 'yellow', label: 'Yellow', value: 'rgba(240, 230, 110, 0.75)' }, +] + +const DEFAULT_LYRICS_SETTINGS = { + tr: { fontSize: 14, colorKey: 'blue' }, + main: { fontSize: 24, colorKey: 'white' }, + pr: { fontSize: 14, colorKey: 'green' }, +} + +const SETTINGS_STORAGE_KEY = 'karaoke-lyrics-settings' + +const loadLyricsSettings = () => { + try { + const raw = localStorage.getItem(SETTINGS_STORAGE_KEY) + if (raw) { + const parsed = JSON.parse(raw) + return { + tr: { ...DEFAULT_LYRICS_SETTINGS.tr, ...parsed.tr }, + main: { ...DEFAULT_LYRICS_SETTINGS.main, ...parsed.main }, + pr: { ...DEFAULT_LYRICS_SETTINGS.pr, ...parsed.pr }, + } + } + } catch { + /* ignore */ + } + return { ...DEFAULT_LYRICS_SETTINGS } +} + +const saveLyricsSettings = (settings) => { + try { + localStorage.setItem(SETTINGS_STORAGE_KEY, JSON.stringify(settings)) + } catch { + /* ignore */ + } +} + +const getColorValue = (colorKey) => + COLOR_PRESETS.find((c) => c.key === colorKey)?.value || COLOR_PRESETS[0].value + +const useStyles = makeStyles((theme) => ({ + overlay: { + position: 'fixed', + left: '50%', + bottom: 100, + 
transform: 'translateX(-50%)', + zIndex: 1400, + width: 'min(900px, calc(100vw - 32px))', + minHeight: KARAOKE_MIN_HEIGHT_PX, + background: 'rgba(6, 8, 12, 0.9)', + borderRadius: 12, + border: '1px solid rgba(255, 255, 255, 0.12)', + boxShadow: '0 18px 48px rgba(0, 0, 0, 0.42)', + backdropFilter: 'blur(10px)', + color: theme.palette.common.white, + display: 'flex', + flexDirection: 'column', + overflow: 'hidden', + '@media (max-width:810px)': { + bottom: 78, + width: 'calc(100vw - 12px)', + borderRadius: 8, + minHeight: 180, + maxHeight: '65vh', + }, + }, + resizeHandle: { + height: 14, + cursor: 'ns-resize', + flexShrink: 0, + position: 'relative', + '&::after': { + content: '""', + position: 'absolute', + left: '50%', + top: 4, + transform: 'translateX(-50%)', + width: 56, + height: 3, + borderRadius: 999, + background: 'rgba(255, 255, 255, 0.22)', + }, + '@media (max-width:810px)': { + display: 'none', + }, + }, + header: { + display: 'flex', + alignItems: 'center', + justifyContent: 'space-between', + gap: theme.spacing(1), + padding: theme.spacing(0.3, 1.3, 0.4, 1.3), + }, + headerLeft: { + display: 'flex', + alignItems: 'center', + gap: theme.spacing(1), + minWidth: 0, + }, + language: { + fontSize: 11, + letterSpacing: '0.08em', + opacity: 0.72, + textTransform: 'uppercase', + whiteSpace: 'nowrap', + }, + layerControls: { + display: 'flex', + alignItems: 'center', + gap: theme.spacing(0.5), + }, + layerToggle: { + minWidth: 34, + minHeight: 24, + padding: theme.spacing(0, 0.8), + fontSize: 10, + letterSpacing: '0.08em', + borderRadius: 999, + color: 'rgba(203, 213, 225, 0.95)', + background: 'rgba(100, 116, 139, 0.26)', + border: '1px solid rgba(148, 163, 184, 0.45)', + transition: `all ${KARAOKE_ANIMATION_MS}ms ease-in-out`, + '&.Mui-disabled': { + color: 'rgba(148, 163, 184, 0.45)', + borderColor: 'rgba(100, 116, 139, 0.3)', + background: 'rgba(71, 85, 105, 0.2)', + }, + }, + layerToggleActive: { + color: 'rgba(220, 252, 231, 0.98)', + borderColor: 
'rgba(34, 197, 94, 0.96)', + background: 'rgba(34, 197, 94, 0.28)', + }, + closeButton: { + color: 'rgba(255, 255, 255, 0.72)', + }, + inlineTr: { + margin: '0 0 2px 0', + textAlign: 'center', + fontWeight: 400, + lineHeight: 1.2, + letterSpacing: '0.01em', + transition: `opacity ${KARAOKE_ANIMATION_MS}ms ease-in-out, font-size ${KARAOKE_ANIMATION_MS}ms ease-in-out`, + }, + inlinePr: { + margin: '2px 0 0 0', + textAlign: 'center', + fontWeight: 400, + lineHeight: 1.2, + letterSpacing: '0.01em', + transition: `opacity ${KARAOKE_ANIMATION_MS}ms ease-in-out, font-size ${KARAOKE_ANIMATION_MS}ms ease-in-out`, + }, + body: { + padding: theme.spacing(0.5, 2, 1.4, 2), + overflowY: 'auto', + overflowX: 'hidden', + scrollBehavior: 'smooth', + flex: 1, + overscrollBehavior: 'contain', + scrollbarWidth: 'none', + msOverflowStyle: 'none', + '&::-webkit-scrollbar': { + display: 'none', + width: 0, + height: 0, + }, + '@media (max-width:810px)': { + padding: theme.spacing(0.35, 1.2, 1.2, 1.2), + }, + }, + lines: { + display: 'flex', + flexDirection: 'column', + gap: theme.spacing(1.24), + paddingBottom: theme.spacing(1), + }, + line: { + margin: 0, + fontWeight: 600, + lineHeight: 1.24, + letterSpacing: '0.01em', + textAlign: 'center', + color: 'rgba(255, 255, 255, 0.62)', + transition: `opacity ${KARAOKE_ANIMATION_MS}ms ease-in-out, color ${KARAOKE_ANIMATION_MS}ms ease-in-out, font-size 280ms ease-in-out`, + }, + token: { + display: 'inline-block', + whiteSpace: 'pre-wrap', + transition: `color ${KARAOKE_ANIMATION_MS}ms ease-in-out, text-shadow ${KARAOKE_ANIMATION_MS}ms ease-in-out`, + }, + settingsButton: { + color: 'rgba(255, 255, 255, 0.55)', + padding: 4, + '&:hover': { + color: 'rgba(255, 255, 255, 0.85)', + }, + }, + settingsPanel: { + background: 'rgba(12, 14, 20, 0.96)', + border: '1px solid rgba(255, 255, 255, 0.12)', + borderRadius: 10, + padding: theme.spacing(1.5, 2), + width: 260, + backdropFilter: 'blur(12px)', + }, + settingsSection: { + marginBottom: 
theme.spacing(1.2), + '&:last-child': { + marginBottom: 0, + }, + }, + settingsLabel: { + fontSize: 10, + fontWeight: 600, + letterSpacing: '0.1em', + textTransform: 'uppercase', + color: 'rgba(255, 255, 255, 0.55)', + marginBottom: 4, + }, + settingsRow: { + display: 'flex', + alignItems: 'center', + gap: theme.spacing(1), + }, + settingsSlider: { + flex: 1, + color: 'rgba(255, 255, 255, 0.6)', + '& .MuiSlider-thumb': { + width: 12, + height: 12, + }, + '& .MuiSlider-rail': { + opacity: 0.3, + }, + }, + settingsSliderValue: { + fontSize: 11, + color: 'rgba(255, 255, 255, 0.5)', + minWidth: 22, + textAlign: 'right', + }, + colorDots: { + display: 'flex', + gap: 5, + marginTop: 4, + }, + colorDot: { + width: 16, + height: 16, + borderRadius: '50%', + border: '2px solid transparent', + cursor: 'pointer', + transition: 'border-color 120ms ease, transform 120ms ease', + '&:hover': { + transform: 'scale(1.2)', + }, + }, + colorDotActive: { + borderColor: 'rgba(255, 255, 255, 0.85)', + }, +})) + +const clamp = (v, min, max) => Math.max(min, Math.min(max, v)) +const lerp = (from, to, t) => from + (to - from) * t + +const normalizeForComparison = (text) => + (text || '').replace(/[\s\p{P}]/gu, '').toLowerCase() + +const shouldShowAuxLine = (mainLine, auxLine) => { + if (!auxLine || !auxLine.value) return false + return ( + normalizeForComparison(auxLine.value) !== + normalizeForComparison(mainLine.value) + ) +} + +const SettingsSection = ({ label, layer, settings, onChange, classes }) => { + const s = settings[layer] + return ( +
+
{label}
+
+ + onChange({ ...settings, [layer]: { ...s, fontSize: val } }) + } + /> + {s.fontSize} +
+
+ {COLOR_PRESETS.map((preset) => ( +
+ onChange({ ...settings, [layer]: { ...s, colorKey: preset.key } }) + } + /> + ))} +
+
+ ) +} + +const LyricsSettingsPopover = ({ settings, onChange }) => { + const classes = useStyles() + const [anchorEl, setAnchorEl] = useState(null) + + const handleToggle = useCallback((e) => { + e.stopPropagation() + setAnchorEl((prev) => (prev ? null : e.currentTarget)) + }, []) + + const handleClose = useCallback(() => setAnchorEl(null), []) + + return ( + <> + + + + + + + + + + ) +} + +const easeInOut = (v) => { + const clamped = clamp(v, 0, 1) + return clamped < 0.5 + ? 2 * clamped * clamped + : 1 - Math.pow(-2 * clamped + 2, 2) / 2 +} + +const getMaxHeightPx = () => { + if (typeof window === 'undefined') { + return KARAOKE_MAX_HEIGHT_PX + } + return Math.min( + Math.floor(window.innerHeight * KARAOKE_MAX_HEIGHT_RATIO), + KARAOKE_MAX_HEIGHT_PX, + ) +} + +const buildSegmentsFromLine = (line) => { + if (!line || !Array.isArray(line.tokens) || line.tokens.length === 0) { + return [{ text: line?.value || '', token: null, tokenIndex: -1 }] + } + + const text = line.value || '' + const matchedSegments = [] + const fallbackSegments = [] + let cursor = 0 + let allMatched = text.length > 0 + let anyMatched = false + + const pushFallbackSeparatorIfNeeded = (nextTokenText) => { + if (fallbackSegments.length === 0) { + return + } + const prevText = fallbackSegments[fallbackSegments.length - 1].text || '' + if (!prevText || !nextTokenText) { + return + } + if (/\s$/.test(prevText) || /^\s/.test(nextTokenText)) { + return + } + if (/[A-Za-z0-9]$/.test(prevText) && /^[A-Za-z0-9]/.test(nextTokenText)) { + fallbackSegments.push({ text: ' ', token: null, tokenIndex: -1 }) + } + } + + for (let tokenIndex = 0; tokenIndex < line.tokens.length; tokenIndex += 1) { + const token = line.tokens[tokenIndex] + const tokenText = token.value || '' + if (!tokenText) { + continue + } + + pushFallbackSeparatorIfNeeded(tokenText) + fallbackSegments.push({ text: tokenText, token, tokenIndex }) + + if (!text) { + allMatched = false + continue + } + + const foundAt = text.indexOf(tokenText, 
cursor) + const normalizedFoundAt = + foundAt >= 0 + ? foundAt + : text.toLowerCase().indexOf(tokenText.toLowerCase(), cursor) + + if (normalizedFoundAt >= 0) { + anyMatched = true + if (normalizedFoundAt > cursor) { + matchedSegments.push({ + text: text.slice(cursor, normalizedFoundAt), + token: null, + tokenIndex: -1, + }) + } + const matchedTokenText = text.slice( + normalizedFoundAt, + normalizedFoundAt + tokenText.length, + ) + matchedSegments.push({ + text: matchedTokenText || tokenText, + token, + tokenIndex, + }) + cursor = normalizedFoundAt + tokenText.length + } else { + allMatched = false + } + } + + if (allMatched && anyMatched) { + if (cursor < text.length) { + matchedSegments.push({ + text: text.slice(cursor), + token: null, + tokenIndex: -1, + }) + } + return matchedSegments + } + + if (fallbackSegments.length > 0) { + return fallbackSegments + } + + return [{ text, token: null, tokenIndex: -1 }] +} + +const getLineRenderWindow = (line, nextLineStart) => { + let start = Number.isFinite(Number(line?.start)) ? Number(line.start) : null + let end = Number.isFinite(Number(line?.end)) ? Number(line.end) : null + const fallbackEnd = Number.isFinite(Number(nextLineStart)) + ? Number(nextLineStart) + : null + + if (end == null) { + end = fallbackEnd + } + + const tokens = Array.isArray(line?.tokens) ? 
line.tokens : [] + if (tokens.length > 0) { + const firstWindow = resolveKaraokeTokenWindow(line, 0, nextLineStart) + const lastWindow = resolveKaraokeTokenWindow( + line, + tokens.length - 1, + nextLineStart, + ) + + if ( + firstWindow.start != null && + (start == null || firstWindow.start < start) + ) { + start = firstWindow.start + } + if (lastWindow.end != null && (end == null || lastWindow.end > end)) { + end = lastWindow.end + } + } + + return { start, end } +} + +const shouldSkipLineFrame = ( + prevPlaybackMs, + nextPlaybackMs, + line, + nextLineStart, +) => { + if (prevPlaybackMs === nextPlaybackMs) { + return true + } + + const { start, end } = getLineRenderWindow(line, nextLineStart) + + if (start != null) { + const activationStart = start - 220 + if (prevPlaybackMs < activationStart && nextPlaybackMs < activationStart) { + return true + } + } + + if (end != null) { + const settleEnd = end + KARAOKE_WORD_SETTLE_MS + 160 + if (prevPlaybackMs > settleEnd && nextPlaybackMs > settleEnd) { + return true + } + } + + return false +} + +const areLineStylesEqual = (prevStyle, nextStyle) => { + const a = prevStyle || {} + const b = nextStyle || {} + return ( + a.opacity === b.opacity && + a.color === b.color && + a.fontSize === b.fontSize && + a.fontWeight === b.fontWeight + ) +} + +const parseColorRGB = (rgba) => { + const m = (rgba || '').match(/rgba?\((\d+),\s*(\d+),\s*(\d+)/) + return m ? 
[parseInt(m[1]), parseInt(m[2]), parseInt(m[3])] : [255, 255, 255] +} + +const buildTokenWipeStyle = ({ + fillProgress, + highlightAlpha, + futureAlpha, + rgb, +}) => { + const [r, g, b] = rgb || [255, 255, 255] + const fillPct = clamp(fillProgress, 0, 1) * 100 + const doneColor = `rgba(${r}, ${g}, ${b}, ${clamp(highlightAlpha, TOKEN_DONE_ALPHA, TOKEN_ACTIVE_ALPHA)})` + const futureColor = `rgba(${r}, ${g}, ${b}, ${futureAlpha})` + const activeShadow = `0 0 8px rgba(${r}, ${g}, ${b}, 0.34)` + + if (fillPct <= 0) { + return { color: futureColor, textShadow: 'none' } + } + + const edgeStart = clamp(fillPct - TOKEN_WIPE_EDGE_PCT, 0, 100) + const glowStop = clamp(fillPct + TOKEN_WIPE_GLOW_PCT, 0, 100) + const glowColor = `rgba(${r}, ${g}, ${b}, ${clamp(highlightAlpha + 0.18, TOKEN_DONE_ALPHA, TOKEN_ACTIVE_ALPHA)})` + return { + color: 'transparent', + WebkitTextFillColor: 'transparent', + backgroundImage: `linear-gradient(90deg, ${doneColor} 0%, ${doneColor} ${edgeStart}%, ${glowColor} ${fillPct}%, ${futureColor} ${glowStop}%, ${futureColor} 100%)`, + backgroundClip: 'text', + WebkitBackgroundClip: 'text', + textShadow: activeShadow, + } +} + +const KaraokeLineRow = memo( + ({ + line, + nextLineStart, + renderPlaybackMs, + className, + style, + tokenClassName, + highlightTokens = true, + }) => { + const segments = buildSegmentsFromLine(line) + const tokenRGB = useMemo( + () => (style?.color ? parseColorRGB(style.color) : [255, 255, 255]), + [style?.color], + ) + + return ( + + {segments.map((segment, idx) => { + if (!segment.token) { + return {segment.text} + } + + if (!highlightTokens) { + return {segment.text} + } + + const { start: tokenStart, end: tokenEnd } = + resolveKaraokeTokenWindow(line, segment.tokenIndex, nextLineStart) + + const isDone = tokenEnd != null ? 
renderPlaybackMs >= tokenEnd : false + const isActive = + !isDone && tokenStart != null && renderPlaybackMs >= tokenStart + + const progress = + isDone || + tokenStart == null || + tokenEnd == null || + tokenEnd <= tokenStart + ? isDone + ? 1 + : 0 + : clamp( + (renderPlaybackMs - tokenStart) / (tokenEnd - tokenStart), + 0, + 1, + ) + + const justEnded = + tokenEnd != null && + renderPlaybackMs > tokenEnd && + renderPlaybackMs <= tokenEnd + KARAOKE_WORD_SETTLE_MS + + const settleProgress = + justEnded && tokenEnd != null + ? clamp( + (renderPlaybackMs - tokenEnd) / KARAOKE_WORD_SETTLE_MS, + 0, + 1, + ) + : 0 + + let alpha = TOKEN_FUTURE_ALPHA + if (isDone) { + alpha = TOKEN_DONE_ALPHA + } else if (isActive) { + alpha = lerp( + TOKEN_FUTURE_ALPHA, + TOKEN_ACTIVE_ALPHA, + easeInOut(progress), + ) + } + if (justEnded) { + alpha = lerp( + TOKEN_ACTIVE_ALPHA, + TOKEN_DONE_ALPHA, + easeInOut(settleProgress), + ) + } + alpha = clamp(alpha, TOKEN_FUTURE_ALPHA, TOKEN_ACTIVE_ALPHA) + const fillProgress = isDone ? 1 : isActive ? 
progress : 0 + + return ( + + {segment.text} + + ) + })} + + ) + }, + (prevProps, nextProps) => { + if ( + prevProps.line !== nextProps.line || + prevProps.nextLineStart !== nextProps.nextLineStart || + prevProps.className !== nextProps.className || + prevProps.tokenClassName !== nextProps.tokenClassName || + prevProps.highlightTokens !== nextProps.highlightTokens || + !areLineStylesEqual(prevProps.style, nextProps.style) + ) { + return false + } + + return shouldSkipLineFrame( + prevProps.renderPlaybackMs, + nextProps.renderPlaybackMs, + nextProps.line, + nextProps.nextLineStart, + ) + }, +) + +KaraokeLineRow.displayName = 'KaraokeLineRow' + +const KaraokeLyricsOverlay = ({ + visible, + mainLyric, + translationLyric, + pronunciationLyric, + showTranslation, + showPronunciation, + translationEnabled, + pronunciationEnabled, + onToggleTranslation, + onTogglePronunciation, + audioInstance, + onClose, +}) => { + const classes = useStyles() + const [playbackMs, setPlaybackMs] = useState(0) + const [overlayHeight, setOverlayHeight] = useState(KARAOKE_DEFAULT_HEIGHT_PX) + const [maxHeightPx, setMaxHeightPx] = useState(getMaxHeightPx()) + const [bodyViewportHeight, setBodyViewportHeight] = useState(0) + const [isCompact, setIsCompact] = useState( + typeof window !== 'undefined' ? 
window.innerWidth <= 810 : false, + ) + const [lyricsSettings, setLyricsSettings] = useState(loadLyricsSettings) + + const handleSettingsChange = useCallback((next) => { + setLyricsSettings(next) + saveLyricsSettings(next) + }, []) + + const bodyRef = useRef(null) + const activeLineRef = useRef(null) + + const mainLines = useMemo(() => buildKaraokeLines(mainLyric), [mainLyric]) + const translationLines = useMemo( + () => buildKaraokeLines(translationLyric), + [translationLyric], + ) + const pronunciationLines = useMemo( + () => buildKaraokeLines(pronunciationLyric), + [pronunciationLyric], + ) + + useEffect(() => { + const onResize = () => { + const nextMaxHeight = getMaxHeightPx() + setIsCompact(window.innerWidth <= 810) + setMaxHeightPx(nextMaxHeight) + setOverlayHeight((previous) => + clamp(previous, KARAOKE_MIN_HEIGHT_PX, nextMaxHeight), + ) + } + + onResize() + window.addEventListener('resize', onResize) + return () => window.removeEventListener('resize', onResize) + }, []) + + useEffect(() => { + const body = bodyRef.current + if (!body) { + return undefined + } + + const updateViewportHeight = () => { + setBodyViewportHeight(body.clientHeight || 0) + } + + updateViewportHeight() + + if (typeof ResizeObserver !== 'undefined') { + const observer = new ResizeObserver(updateViewportHeight) + observer.observe(body) + return () => observer.disconnect() + } + + window.addEventListener('resize', updateViewportHeight) + return () => window.removeEventListener('resize', updateViewportHeight) + }, [overlayHeight, isCompact, showTranslation, showPronunciation, visible]) + + const onResizeStart = useCallback( + (event) => { + if (isCompact) { + return + } + + event.preventDefault() + const startY = event.clientY + const startHeight = overlayHeight + + const onMove = (moveEvent) => { + const delta = startY - moveEvent.clientY + setOverlayHeight( + clamp(startHeight + delta, KARAOKE_MIN_HEIGHT_PX, maxHeightPx), + ) + } + + const onUp = () => { + 
window.removeEventListener('mousemove', onMove) + window.removeEventListener('mouseup', onUp) + } + + window.addEventListener('mousemove', onMove) + window.addEventListener('mouseup', onUp) + }, + [isCompact, maxHeightPx, overlayHeight], + ) + + useEffect(() => { + if (!visible || !audioInstance) { + setPlaybackMs(0) + return + } + + let rafId = 0 + let cancelled = false + let anchorAudioMs = 0 + let anchorPerfMs = 0 + let lastRenderMs = 0 + + const readPlaybackMs = () => { + const seconds = Number(audioInstance.currentTime) + if (!Number.isFinite(seconds) || seconds < 0) { + return 0 + } + return seconds * 1000 + } + + const resetAnchor = (perfNow, observedMs) => { + anchorAudioMs = observedMs + anchorPerfMs = perfNow + } + + const tick = () => { + if (cancelled) { + return + } + + const observedMs = readPlaybackMs() + const perfNow = performance.now() + const playbackRate = Number(audioInstance.playbackRate) + const canInterpolate = + !audioInstance.paused && + !audioInstance.seeking && + Number.isFinite(playbackRate) && + playbackRate > 0 + + let nowMs = observedMs + + if (!canInterpolate) { + resetAnchor(perfNow, observedMs) + } else if (anchorPerfMs === 0) { + resetAnchor(perfNow, observedMs) + } else { + const predicted = + anchorAudioMs + (perfNow - anchorPerfMs) * playbackRate + const drift = observedMs - predicted + if (Math.abs(drift) > KARAOKE_CLOCK_DRIFT_RESET_MS) { + nowMs = observedMs + resetAnchor(perfNow, observedMs) + } else { + nowMs = predicted + } + } + + const backwardsDrift = lastRenderMs - nowMs + if (canInterpolate && backwardsDrift > 0) { + nowMs = lastRenderMs + } + + if (canInterpolate && backwardsDrift > KARAOKE_CLOCK_RESET_THRESHOLD_MS) { + resetAnchor(perfNow, observedMs) + } else if ( + !canInterpolate && + backwardsDrift > 0 && + backwardsDrift <= KARAOKE_MONOTONIC_JITTER_MS + ) { + nowMs = lastRenderMs + } + + nowMs = Math.max(0, nowMs) + lastRenderMs = nowMs + + setPlaybackMs((prev) => + Math.abs(prev - nowMs) >= 
KARAOKE_RENDER_UPDATE_EPSILON_MS + ? nowMs + : prev, + ) + rafId = window.requestAnimationFrame(tick) + } + + const initialMs = readPlaybackMs() + resetAnchor(performance.now(), initialMs) + lastRenderMs = initialMs + setPlaybackMs(initialMs) + rafId = window.requestAnimationFrame(tick) + + return () => { + cancelled = true + if (rafId) { + window.cancelAnimationFrame(rafId) + } + } + }, [audioInstance, visible]) + + const renderPlaybackMs = playbackMs + KARAOKE_RENDER_LEAD_MS + + const { lineIndex } = useMemo( + () => getActiveKaraokeState(mainLines, renderPlaybackMs), + [mainLines, renderPlaybackMs], + ) + + const activeIndex = lineIndex >= 0 ? lineIndex : 0 + + const trByMainIndex = useMemo(() => { + if (!showTranslation || translationLines.length === 0) return {} + const map = {} + for (let i = 0; i < mainLines.length; i++) { + const { line } = resolveLayerLineForMain(mainLines, translationLines, i) + if (line) map[i] = line + } + return map + }, [mainLines, translationLines, showTranslation]) + + const prByMainIndex = useMemo(() => { + if (!showPronunciation || pronunciationLines.length === 0) return {} + const map = {} + for (let i = 0; i < mainLines.length; i++) { + const { line } = resolveLayerLineForMain(mainLines, pronunciationLines, i) + if (line) map[i] = line + } + return map + }, [mainLines, pronunciationLines, showPronunciation]) + + const hasTranslationLine = showTranslation && translationLines.length > 0 + const hasPronunciationLine = + showPronunciation && pronunciationLines.length > 0 + const measuredViewportHeight = bodyRef.current?.clientHeight || 0 + const estimatedViewportHeight = + measuredViewportHeight > 0 + ? measuredViewportHeight + : bodyViewportHeight > 0 + ? bodyViewportHeight + : isCompact + ? 
260 + : Math.max(220, overlayHeight - 170) + const centerSpacerPx = Math.max( + KARAOKE_CENTER_SPACER_MIN_PX, + Math.floor(estimatedViewportHeight * KARAOKE_CENTER_SPACER_RATIO), + ) + + useEffect(() => { + if (!visible) { + return + } + + const rafId = window.requestAnimationFrame(() => { + const body = bodyRef.current + const activeNode = activeLineRef.current + if (!body || !activeNode) { + return + } + + const bodyRect = body.getBoundingClientRect() + const activeRect = activeNode.getBoundingClientRect() + const deltaWithinBody = + activeRect.top - + bodyRect.top - + (body.clientHeight - activeRect.height) / 2 + const maxTop = Math.max(0, body.scrollHeight - body.clientHeight) + const centeredTop = clamp(body.scrollTop + deltaWithinBody, 0, maxTop) + + if (Math.abs(body.scrollTop - centeredTop) < 2) { + return + } + + if (typeof body.scrollTo === 'function') { + body.scrollTo({ + top: centeredTop, + behavior: 'smooth', + }) + } else { + body.scrollTop = centeredTop + } + }) + + return () => window.cancelAnimationFrame(rafId) + }, [ + centerSpacerPx, + hasPronunciationLine, + hasTranslationLine, + lineIndex, + overlayHeight, + visible, + ]) + + if ( + !visible || + !hasStructuredLyricContent(mainLyric) || + mainLines.length === 0 + ) { + return null + } + + const getMainLineStyle = (idx) => { + const delta = idx - activeIndex + const isActive = delta === 0 + let opacity = isActive ? 1 : delta < 0 ? 0.6 : 0.72 + const [r, g, b] = parseColorRGB(getColorValue(lyricsSettings.main.colorKey)) + let color = isActive + ? `rgba(${r}, ${g}, ${b}, 0.98)` + : delta < 0 + ? `rgba(${r}, ${g}, ${b}, 0.4)` + : `rgba(${r}, ${g}, ${b}, 0.54)` + + if (delta > 1) { + const level = clamp(delta, 1, 6) + opacity = Math.max(0.36, 0.74 - level * 0.08) + } + + if (delta < -1) { + const level = clamp(Math.abs(delta), 1, 6) + opacity = Math.max(0.28, 0.62 - level * 0.08) + } + + const baseFontSize = lyricsSettings.main.fontSize + const fontSize = isActive ? 
baseFontSize : Math.round(baseFontSize * 0.8) + + return { + opacity, + color, + fontSize, + } + } + + const overlayStyle = isCompact + ? undefined + : { + height: overlayHeight, + maxHeight: maxHeightPx, + } + + return ( +
+
+ +
+
+ + {mainLyric?.lang || 'xxx'} + +
+ + +
+
+ +
+ + + + +
+
+ +
+
+
+ {mainLines.map((line, idx) => { + const trLine = trByMainIndex[idx] + const prLine = prByMainIndex[idx] + const showTr = shouldShowAuxLine(line, trLine) + const showPr = shouldShowAuxLine(line, prLine) + const lineStyle = getMainLineStyle(idx) + const auxOpacity = + lineStyle.opacity != null ? lineStyle.opacity * 0.85 : 1 + const trStyle = { + opacity: auxOpacity, + fontSize: lyricsSettings.tr.fontSize, + color: getColorValue(lyricsSettings.tr.colorKey), + } + const prStyle = { + opacity: auxOpacity, + fontSize: lyricsSettings.pr.fontSize, + color: getColorValue(lyricsSettings.pr.colorKey), + } + return ( +
{ + if (audioInstance && line.start != null) { + audioInstance.currentTime = line.start / 1000 + } + }} + > + {showTr && ( + + )} + + {showPr && ( + + )} +
+ ) + })} +
+
+
+
+ ) +} + +export default KaraokeLyricsOverlay diff --git a/ui/src/audioplayer/Player.jsx b/ui/src/audioplayer/Player.jsx index eba3b82d7..b8b33b6d5 100644 --- a/ui/src/audioplayer/Player.jsx +++ b/ui/src/audioplayer/Player.jsx @@ -22,6 +22,7 @@ import { refreshQueue, setPlayMode, setTranscodingProfile, + updateQueueLyric, setVolume, syncQueue, } from '../actions' @@ -33,6 +34,25 @@ import { keyMap } from '../hotkeys' import keyHandlers from './keyHandlers' import { calculateGain } from '../utils/calculateReplayGain' import { detectBrowserProfile, decisionService } from '../transcode' +import { + getPreferredLyricLanguage, + hasStructuredLyricContent, + selectLyricLayers, + structuredLyricToLrc, +} from './lyrics' +import KaraokeLyricsOverlay from './KaraokeLyricsOverlay' + +const emptyLyricLayers = { + main: null, + translation: null, + pronunciation: null, +} + +const normalizeLyricLayers = (layers) => ({ + main: layers?.main || null, + translation: layers?.translation || null, + pronunciation: layers?.pronunciation || null, +}) const Player = () => { const theme = useCurrentTheme() @@ -120,6 +140,72 @@ const Player = () => { const gainInfo = useSelector((state) => state.replayGain) const [context, setContext] = useState(null) const [gainNode, setGainNode] = useState(null) + const lyricCacheRef = useRef(new Map()) + const lyricRequestIdRef = useRef(0) + const playerRef = useRef(null) + const [karaokeVisible, setKaraokeVisible] = useState(false) + const [selectedLyricLayers, setSelectedLyricLayers] = + useState(emptyLyricLayers) + const [showTranslation, setShowTranslation] = useState(false) + const [showPronunciation, setShowPronunciation] = useState(false) + const currentTrackId = playerState.current?.trackId + const currentTrackIsRadio = playerState.current?.isRadio + const selectedStructuredLyric = selectedLyricLayers.main + const hasKaraokeLyric = hasStructuredLyricContent(selectedStructuredLyric) + const hasTranslationLyric = hasStructuredLyricContent( + 
selectedLyricLayers.translation, + ) + const hasPronunciationLyric = hasStructuredLyricContent( + selectedLyricLayers.pronunciation, + ) + + const applyLyricToRuntimePlayer = useCallback((trackId, lyric) => { + if (!trackId) { + return + } + + const player = playerRef.current + if (!player || typeof player.setState !== 'function') { + return + } + + player.setState((prevState) => { + const prevLists = Array.isArray(prevState.audioLists) + ? prevState.audioLists + : [] + let changed = false + const audioLists = prevLists.map((item) => { + if (item.trackId !== trackId) { + return item + } + if (item.lyric === lyric) { + return item + } + changed = true + return { + ...item, + lyric, + } + }) + + const currentItem = audioLists.find( + (item) => item.musicSrc === prevState.musicSrc, + ) + const currentLyric = + typeof currentItem?.lyric === 'string' + ? currentItem.lyric + : prevState.lyric + + if (!changed && currentLyric === prevState.lyric) { + return null + } + + return { + audioLists, + lyric: currentLyric, + } + }) + }, []) useEffect(() => { if ( @@ -166,6 +252,107 @@ const Player = () => { return () => window.removeEventListener('beforeunload', handleBeforeUnload) }, [playerState, audioInstance]) + useEffect(() => { + if (!currentTrackId || currentTrackIsRadio) { + setSelectedLyricLayers(emptyLyricLayers) + setShowTranslation(false) + setShowPronunciation(false) + setKaraokeVisible(false) + return + } + + const cached = lyricCacheRef.current.get(currentTrackId) + let layers = emptyLyricLayers + if (cached && typeof cached !== 'string') { + if (cached.layers) { + layers = normalizeLyricLayers(cached.layers) + } else if (cached.structuredLyric) { + layers = normalizeLyricLayers({ + main: cached.structuredLyric, + }) + } + } + setSelectedLyricLayers(layers) + setShowTranslation(false) + setShowPronunciation(hasStructuredLyricContent(layers.pronunciation)) + }, [currentTrackId, currentTrackIsRadio]) + + useEffect(() => { + lyricRequestIdRef.current += 1 + const 
requestId = lyricRequestIdRef.current + + if (!currentTrackId || currentTrackIsRadio) { + return + } + + const cached = lyricCacheRef.current.get(currentTrackId) + if (cached !== undefined) { + const cachedLyric = + typeof cached === 'string' ? cached : cached?.lrc || '' + const cachedLayers = + typeof cached === 'string' + ? emptyLyricLayers + : cached?.layers + ? normalizeLyricLayers(cached.layers) + : normalizeLyricLayers({ main: cached?.structuredLyric }) + + setSelectedLyricLayers(cachedLayers) + setShowTranslation(false) + setShowPronunciation( + hasStructuredLyricContent(cachedLayers.pronunciation), + ) + if (cachedLyric) { + dispatch(updateQueueLyric(currentTrackId, cachedLyric)) + applyLyricToRuntimePlayer(currentTrackId, cachedLyric) + } + return + } + + subsonic + .getLyricsBySongId(currentTrackId) + .then((resp) => { + if (lyricRequestIdRef.current !== requestId) { + return + } + + const structuredLyrics = + resp?.json?.['subsonic-response']?.lyricsList?.structuredLyrics || [] + const layers = selectLyricLayers( + structuredLyrics, + getPreferredLyricLanguage(), + ) + const lyric = layers.main ? structuredLyricToLrc(layers.main) : '' + lyricCacheRef.current.set(currentTrackId, { + lrc: lyric, + layers, + }) + setSelectedLyricLayers(layers) + setShowTranslation(false) + setShowPronunciation(hasStructuredLyricContent(layers.pronunciation)) + + if (lyric !== '') { + dispatch(updateQueueLyric(currentTrackId, lyric)) + applyLyricToRuntimePlayer(currentTrackId, lyric) + } + }) + .catch(() => { + if (lyricRequestIdRef.current !== requestId) { + return + } + setSelectedLyricLayers(emptyLyricLayers) + setShowTranslation(false) + setShowPronunciation(false) + // Do not cache network/request failures as empty lyrics, so we can retry. 
+ lyricCacheRef.current.delete(currentTrackId) + }) + }, [dispatch, currentTrackId, currentTrackIsRadio, applyLyricToRuntimePlayer]) + + useEffect(() => { + if (!hasKaraokeLyric && karaokeVisible) { + setKaraokeVisible(false) + } + }, [hasKaraokeLyric, karaokeVisible]) + const defaultOptions = useMemo( () => ({ theme: playerTheme, @@ -177,7 +364,7 @@ const Player = () => { clearPriorAudioLists: false, showDestroy: true, showDownload: false, - showLyric: true, + showLyric: false, showReload: false, toggleMode: !isDesktop, glassBg: false, @@ -214,12 +401,24 @@ const Player = () => { (playerState.clear || playerState.playIndex === 0), clearPriorAudioLists: playerState.clear, extendsContent: ( - + setKaraokeVisible((visible) => !visible)} + lyricsActive={karaokeVisible} + lyricsDisabled={!hasKaraokeLyric} + /> ), defaultVolume: isMobilePlayer ? 1 : playerState.volume, showMediaSession: !current.isRadio, } - }, [playerState, defaultOptions, isMobilePlayer]) + }, [ + playerState, + defaultOptions, + isMobilePlayer, + karaokeVisible, + hasKaraokeLyric, + ]) const onAudioListsChange = useCallback( (_, audioLists, audioInfo) => dispatch(syncQueue(audioInfo, audioLists)), @@ -391,6 +590,7 @@ const Player = () => { return ( { onBeforeDestroy={onBeforeDestroy} getAudioInstance={setAudioInstance} /> + + setShowTranslation((previous) => + hasTranslationLyric ? !previous : false, + ) + } + onTogglePronunciation={() => + setShowPronunciation((previous) => + hasPronunciationLyric ? 
!previous : false, + ) + } + audioInstance={audioInstance} + onClose={() => setKaraokeVisible(false)} + /> ) diff --git a/ui/src/audioplayer/PlayerToolbar.jsx b/ui/src/audioplayer/PlayerToolbar.jsx index 4812141ab..869df475d 100644 --- a/ui/src/audioplayer/PlayerToolbar.jsx +++ b/ui/src/audioplayer/PlayerToolbar.jsx @@ -4,7 +4,9 @@ import { useGetOne } from 'react-admin' import { GlobalHotKeys } from 'react-hotkeys' import IconButton from '@material-ui/core/IconButton' import { useMediaQuery } from '@material-ui/core' +import Tooltip from '@material-ui/core/Tooltip' import { RiSaveLine } from 'react-icons/ri' +import { RiFileMusicLine } from 'react-icons/ri' import { LoveButton, useToggleLove } from '../common' import { openSaveQueueDialog } from '../actions' import { keyMap } from '../hotkeys' @@ -55,7 +57,13 @@ const useStyles = makeStyles((theme) => ({ }, })) -const PlayerToolbar = ({ id, isRadio }) => { +const PlayerToolbar = ({ + id, + isRadio, + onToggleLyrics, + lyricsActive = false, + lyricsDisabled = false, +}) => { const dispatch = useDispatch() const { data, loading } = useGetOne('song', id, { enabled: !!id && !isRadio }) const [toggleLove, toggling] = useToggleLove('song', data) @@ -99,6 +107,25 @@ const PlayerToolbar = ({ id, isRadio }) => { /> ) + const toggleLyricsButton = ( + + + + + + + + ) + return ( <> @@ -106,11 +133,13 @@ const PlayerToolbar = ({ id, isRadio }) => {
  • {saveQueueButton} {loveButton} + {toggleLyricsButton}
  • ) : ( <>
  • {saveQueueButton}
  • {loveButton}
  • +
  • {toggleLyricsButton}
  • )} diff --git a/ui/src/audioplayer/PlayerToolbar.test.jsx b/ui/src/audioplayer/PlayerToolbar.test.jsx index d0368b0f0..3041001eb 100644 --- a/ui/src/audioplayer/PlayerToolbar.test.jsx +++ b/ui/src/audioplayer/PlayerToolbar.test.jsx @@ -71,6 +71,7 @@ describe('', () => { // Verify both buttons are rendered expect(screen.getByTestId('save-queue-button')).toBeInTheDocument() expect(screen.getByTestId('love-button')).toBeInTheDocument() + expect(screen.getByTestId('toggle-lyrics-button')).toBeInTheDocument() // Verify desktop classes are applied expect(listItems[0].className).toContain('toolbar') @@ -102,6 +103,14 @@ describe('', () => { type: 'OPEN_SAVE_QUEUE_DIALOG', }) }) + + it('triggers lyric toggle callback when lyrics button is clicked', () => { + const onToggleLyrics = vi.fn() + render() + + fireEvent.click(screen.getByTestId('toggle-lyrics-button')) + expect(onToggleLyrics).toHaveBeenCalledTimes(1) + }) }) describe('Mobile layout', () => { @@ -114,11 +123,12 @@ describe('', () => { // Each button should be in its own list item const listItems = screen.getAllByRole('listitem') - expect(listItems).toHaveLength(2) + expect(listItems).toHaveLength(3) // Verify both buttons are rendered expect(screen.getByTestId('save-queue-button')).toBeInTheDocument() expect(screen.getByTestId('love-button')).toBeInTheDocument() + expect(screen.getByTestId('toggle-lyrics-button')).toBeInTheDocument() // Verify mobile classes are applied expect(listItems[0].className).toContain('mobileListItem') @@ -140,6 +150,13 @@ describe('', () => { const loveButton = screen.getByTestId('love-button') expect(loveButton).toBeDisabled() }) + + it('disables lyrics button when lyrics are unavailable', () => { + render() + + const lyricsButton = screen.getByTestId('toggle-lyrics-button') + expect(lyricsButton).toBeDisabled() + }) }) describe('Common behavior', () => { diff --git a/ui/src/audioplayer/lyrics.js b/ui/src/audioplayer/lyrics.js new file mode 100644 index 000000000..3dcf9b0f9 --- 
/**
 * Helpers for selecting structured lyric tracks, converting them to LRC and
 * deriving line/token ("karaoke") timing from a structured lyric payload.
 * All times handled here are in milliseconds.
 */

// Tolerance (ms) applied when comparing the playback position against line and
// token boundaries in getActiveKaraokeState.
const KARAOKE_SWITCH_EPSILON_MS = 18
const LYRIC_KIND_MAIN = 'main'
const LYRIC_KIND_TRANSLATION = 'translation'
const LYRIC_KIND_PRONUNCIATION = 'pronunciation'

// Common ISO 639-2 three-letter codes mapped to their ISO 639-1 equivalents so
// that a track tagged `eng` can match a locale such as `en` / `en-US`.
// Not exhaustive: unknown codes are left untouched.
const ISO_639_2_TO_1 = new Map([
  ['eng', 'en'],
  ['spa', 'es'],
  ['fra', 'fr'],
  ['fre', 'fr'],
  ['deu', 'de'],
  ['ger', 'de'],
  ['ita', 'it'],
  ['por', 'pt'],
  ['jpn', 'ja'],
  ['kor', 'ko'],
  ['zho', 'zh'],
  ['chi', 'zh'],
  ['rus', 'ru'],
  ['nld', 'nl'],
  ['dut', 'nl'],
  ['swe', 'sv'],
  ['pol', 'pl'],
  ['tur', 'tr'],
  ['ara', 'ar'],
  ['hin', 'hi'],
  ['ukr', 'uk'],
])

/**
 * Lower-cases a language tag, converts every `_` separator to `-` and maps a
 * known three-letter primary subtag to its two-letter form.
 * Non-string input yields an empty string.
 */
const normalizeLanguageTag = (language) => {
  if (typeof language !== 'string') {
    return ''
  }
  const tag = language.trim().toLowerCase().replace(/_/g, '-')
  const [primary, ...rest] = tag.split('-')
  const canonical = ISO_639_2_TO_1.get(primary) ?? primary
  return [canonical, ...rest].join('-')
}

/** Left-pads a value to at least two characters with `0`. */
const padTime = (value) => {
  const str = value.toString()
  return str.length === 1 ? `0${str}` : str
}

/**
 * Converts a raw time value to a finite number, or `null` when it is missing.
 * Only numbers and non-blank strings are accepted: `Number(null)`,
 * `Number('')` and `Number([])` all evaluate to 0, which would otherwise turn
 * an absent timestamp into "time zero".
 */
const toTime = (value) => {
  if (typeof value === 'string') {
    if (value.trim() === '') {
      return null
    }
  } else if (typeof value !== 'number') {
    return null
  }
  const numeric = Number(value)
  return Number.isFinite(numeric) ? numeric : null
}

/** Ascending comparator for times where `null`/`undefined` sort last. */
const compareNullableTime = (a, b) => {
  if (a == null && b == null) {
    return 0
  }
  if (a == null) {
    return 1
  }
  if (b == null) {
    return -1
  }
  return a - b
}

/**
 * Returns a new array of tokens ordered by start, then end, then original
 * position. The input array is not mutated.
 */
const sortTokensByStart = (tokens) =>
  tokens
    .map((token, order) => ({ ...token, order }))
    .sort((a, b) => {
      const byStart = compareNullableTime(a.start, b.start)
      if (byStart !== 0) {
        return byStart
      }
      const byEnd = compareNullableTime(a.end, b.end)
      if (byEnd !== 0) {
        return byEnd
      }
      return a.order - b.order
    })
    .map(({ order, ...token }) => token)

/**
 * True when two normalized tags are equal or one is a `-`-delimited prefix of
 * the other (e.g. `pt` and `pt-br`).
 */
const languageMatch = (candidate, preferred) => {
  if (!candidate || !preferred) {
    return false
  }
  return (
    candidate === preferred ||
    candidate.startsWith(`${preferred}-`) ||
    preferred.startsWith(`${candidate}-`)
  )
}

/** True when the lyric is flagged synced and at least one line has a start time. */
const hasTimedLines = (lyric) =>
  Boolean(
    lyric &&
      lyric.synced &&
      Array.isArray(lyric.line) &&
      lyric.line.some((line) => toTime(line?.start) != null),
  )

/**
 * Normalizes a raw token to `{ start, end, value, role }`.
 * Returns `null` for missing tokens or tokens whose text is blank.
 */
const normalizeToken = (token) => {
  if (!token) {
    return null
  }
  const value = typeof token.value === 'string' ? token.value : ''
  if (!value.trim()) {
    return null
  }
  return {
    start: toTime(token.start),
    end: toTime(token.end),
    value,
    role: typeof token.role === 'string' ? token.role : '',
  }
}

/**
 * Normalizes a raw token line. `fallbackIndex` is used when the payload does
 * not carry a usable numeric `index`.
 */
const normalizeTokenLine = (tokenLine, fallbackIndex) => {
  const index = toTime(tokenLine?.index) ?? fallbackIndex
  const tokens = sortTokensByStart(
    Array.isArray(tokenLine?.token)
      ? tokenLine.token.map(normalizeToken).filter(Boolean)
      : [],
  )

  return {
    index,
    start: toTime(tokenLine?.start),
    end: toTime(tokenLine?.end),
    value: typeof tokenLine?.value === 'string' ? tokenLine.value : '',
    tokens,
  }
}

/**
 * Maps a raw `kind` to one of the known kinds. Anything that is not
 * `translation` or `pronunciation` (including a missing kind) is `main`.
 */
const normalizeLyricKind = (kind) => {
  const normalized = typeof kind === 'string' ? kind.toLowerCase().trim() : ''
  switch (normalized) {
    case LYRIC_KIND_TRANSLATION:
      return LYRIC_KIND_TRANSLATION
    case LYRIC_KIND_PRONUNCIATION:
      return LYRIC_KIND_PRONUNCIATION
    default:
      return LYRIC_KIND_MAIN
  }
}

/**
 * Picks one lyric from the list: exact/prefix match on the preferred tag,
 * then on its base language, then English, then the first entry.
 * Returns `null` for an empty or non-array list.
 */
const pickLyricByLanguage = (lyrics, preferredLanguage) => {
  if (!Array.isArray(lyrics) || lyrics.length === 0) {
    return null
  }

  const preferred = normalizeLanguageTag(preferredLanguage)
  const preferredBase = preferred.split('-')[0]
  const findFor = (wanted) =>
    lyrics.find((lyric) =>
      languageMatch(normalizeLanguageTag(lyric?.lang), wanted),
    )

  return (
    findFor(preferred) || findFor(preferredBase) || findFor('en') || lyrics[0]
  )
}

/**
 * Returns `{ start, end }` for `lines[index]`; a missing end falls back to the
 * start of the following line.
 */
const lineTimeWindow = (lines, index) => {
  const line = lines[index]
  if (!line) {
    return { start: null, end: null }
  }

  const start = toTime(line.start)
  const end = toTime(line.end) ?? toTime(lines[index + 1]?.start)
  return { start, end }
}

/**
 * When a line carries a single token that spans (roughly) the whole line text,
 * splits the line into whitespace-delimited chunks and distributes the token's
 * time window evenly over them. Returns `null` when the split does not apply.
 */
const buildSyntheticWordTokens = (line, token) => {
  const text = typeof line?.value === 'string' ? line.value : ''
  if (!text.trim()) {
    return null
  }

  const chunks = text.match(/\S+\s*/g) || []
  if (chunks.length < 2) {
    return null
  }

  const normalizedLine = text.replace(/\s+/g, ' ').trim().toLowerCase()
  const normalizedTokenValue = (token?.value || '')
    .replace(/\s+/g, ' ')
    .trim()
    .toLowerCase()
  if (!normalizedTokenValue || !normalizedLine) {
    return null
  }

  // Compare without whitespace; a token covering >= 80% of the line's
  // characters is treated as "the whole line".
  const compressedLine = normalizedLine.replace(/\s+/g, '')
  const compressedToken = normalizedTokenValue.replace(/\s+/g, '')
  const tokenLooksLikeWholeLine =
    compressedToken === compressedLine ||
    compressedToken.length >= Math.floor(compressedLine.length * 0.8)
  if (!tokenLooksLikeWholeLine) {
    return null
  }

  const baseStart = toTime(token?.start) ?? toTime(line?.start)
  const baseEnd = toTime(token?.end) ?? toTime(line?.end)
  if (baseStart == null || baseEnd == null || baseEnd <= baseStart) {
    return null
  }

  const duration = baseEnd - baseStart
  return chunks.map((chunk, idx) => ({
    start: baseStart + (duration * idx) / chunks.length,
    end: baseStart + (duration * (idx + 1)) / chunks.length,
    value: chunk,
    role: typeof token?.role === 'string' ? token.role : '',
  }))
}

/** True when any token in any `tokenLine` has a usable start time. */
export const hasTokenTiming = (structuredLyric) =>
  Boolean(
    structuredLyric &&
      Array.isArray(structuredLyric.tokenLine) &&
      structuredLyric.tokenLine.some(
        (tokenLine) =>
          Array.isArray(tokenLine?.token) &&
          tokenLine.token.some((token) => toTime(token?.start) != null),
      ),
  )

/** True when the lyric has at least one non-blank line or any token timing. */
export const hasStructuredLyricContent = (structuredLyric) =>
  Boolean(
    structuredLyric &&
      ((Array.isArray(structuredLyric.line) &&
        structuredLyric.line.some(
          (line) => typeof line?.value === 'string' && line.value.trim() !== '',
        )) ||
        hasTokenTiming(structuredLyric)),
  )

/**
 * Returns the preferred lyric language: the `locale` entry in localStorage,
 * else `navigator.language`, else `'en'`.
 */
export const getPreferredLyricLanguage = () => {
  try {
    if (typeof window !== 'undefined' && window.localStorage) {
      const stored = window.localStorage.getItem('locale')
      if (stored) {
        return stored
      }
    }
  } catch {
    // Reading localStorage can throw (e.g. SecurityError when storage is
    // blocked); fall through to the navigator/default language.
  }
  if (typeof navigator !== 'undefined' && navigator.language) {
    return navigator.language
  }
  return 'en'
}

/**
 * Groups the synced (timed) lyrics by kind and picks one track per kind for
 * the preferred language.
 *
 * @param {Array} structuredLyrics - structured lyric tracks
 * @param {string} preferredLanguage - preferred language tag
 * @returns {{main: ?object, translation: ?object, pronunciation: ?object}}
 *   each entry is `null` when no suitable track exists
 */
export const selectLyricLayers = (structuredLyrics, preferredLanguage) => {
  const empty = { main: null, translation: null, pronunciation: null }
  if (!Array.isArray(structuredLyrics)) {
    return empty
  }

  const synced = structuredLyrics.filter(hasTimedLines)
  if (synced.length === 0) {
    return empty
  }

  const grouped = {
    [LYRIC_KIND_MAIN]: [],
    [LYRIC_KIND_TRANSLATION]: [],
    [LYRIC_KIND_PRONUNCIATION]: [],
  }
  for (const lyric of synced) {
    grouped[normalizeLyricKind(lyric?.kind)].push(lyric)
  }

  // With no track of kind "main", any synced track may serve as main.
  const mainCandidates = grouped[LYRIC_KIND_MAIN].length
    ? grouped[LYRIC_KIND_MAIN]
    : synced

  return {
    main: pickLyricByLanguage(mainCandidates, preferredLanguage),
    translation: pickLyricByLanguage(
      grouped[LYRIC_KIND_TRANSLATION],
      preferredLanguage,
    ),
    pronunciation: pickLyricByLanguage(
      grouped[LYRIC_KIND_PRONUNCIATION],
      preferredLanguage,
    ),
  }
}

/** Returns the main synced track for the preferred language, or `null`. */
export const pickStructuredLyric = (structuredLyrics, preferredLanguage) =>
  selectLyricLayers(structuredLyrics, preferredLanguage).main

/**
 * Serializes a structured lyric to LRC text (`[mm:ss.xx] text` per line).
 * Lines without a usable, non-negative start are skipped. Minutes are not
 * wrapped, so positions at or beyond one hour render as e.g. `[61:01.00]`.
 *
 * @returns {string} LRC text, or `''` when there is nothing to serialize
 */
export const structuredLyricToLrc = (structuredLyric) => {
  if (!structuredLyric || !Array.isArray(structuredLyric.line)) {
    return ''
  }

  let lyricText = ''
  for (const line of structuredLyric.line) {
    const start = toTime(line?.start)
    if (start == null || start < 0) {
      continue
    }

    const totalCentis = Math.floor(start / 10)
    const centis = totalCentis % 100
    const totalSeconds = Math.floor(totalCentis / 100)
    const sec = totalSeconds % 60
    const min = Math.floor(totalSeconds / 60)

    lyricText += `[${padTime(min)}:${padTime(sec)}.${padTime(centis)}] ${line.value || ''}\n`
  }
  return lyricText
}

/** Picks the main synced track and serializes it to LRC; `''` when none. */
export const structuredLyricsToLrc = (structuredLyrics, preferredLanguage) => {
  const selected = pickStructuredLyric(structuredLyrics, preferredLanguage)
  if (!selected) {
    return ''
  }
  return structuredLyricToLrc(selected)
}

/**
 * Builds the list of karaoke lines `{ index, start, end, value, tokens }` from
 * a structured lyric. Token lines are used when present, otherwise the plain
 * lines. The result is sorted by start (untimed lines last), empty lines are
 * dropped, single whole-line tokens are split into synthetic word tokens, and
 * a missing end is filled from the next line's start.
 */
export const buildKaraokeLines = (structuredLyric) => {
  if (!structuredLyric) {
    return []
  }

  const baseLines = Array.isArray(structuredLyric.line)
    ? structuredLyric.line
    : []
  const rawTokenLines = Array.isArray(structuredLyric.tokenLine)
    ? structuredLyric.tokenLine
    : []

  const fromTokenLine = (tokenLine, fallbackIndex) => {
    const normalized = normalizeTokenLine(tokenLine, fallbackIndex)
    const baseLine = baseLines[normalized.index] || {}
    const tokens = normalized.tokens
    const fallbackStart =
      tokens.find((token) => token.start != null)?.start ?? null
    const fallbackEnd =
      [...tokens].reverse().find((token) => token.end != null)?.end ?? null
    const value =
      normalized.value ||
      (typeof baseLine.value === 'string' ? baseLine.value : '') ||
      tokens.map((token) => token.value).join('')

    return {
      index: normalized.index,
      start: normalized.start ?? toTime(baseLine.start) ?? fallbackStart,
      end: normalized.end ?? toTime(baseLine.end) ?? fallbackEnd,
      value,
      tokens,
    }
  }

  const fromBaseLine = (line, index) => ({
    index,
    start: toTime(line?.start),
    end: toTime(line?.end),
    value: typeof line?.value === 'string' ? line.value : '',
    tokens: [],
  })

  const lines =
    rawTokenLines.length > 0
      ? rawTokenLines.map(fromTokenLine)
      : baseLines.map(fromBaseLine)

  const normalized = lines
    .filter((line) => line.value || line.tokens.length > 0)
    .sort(
      (a, b) => compareNullableTime(a.start, b.start) || a.index - b.index,
    )
    .map((line) => {
      const nextLine = { ...line }
      if (nextLine.tokens.length === 1) {
        const syntheticTokens = buildSyntheticWordTokens(
          nextLine,
          nextLine.tokens[0],
        )
        if (syntheticTokens) {
          nextLine.tokens = syntheticTokens
        }
      }
      return nextLine
    })

  for (let i = 0; i < normalized.length; i += 1) {
    if (normalized[i].end == null) {
      const nextStart = normalized[i + 1]?.start
      if (nextStart != null) {
        normalized[i].end = nextStart
      }
    }
  }

  return normalized
}

/**
 * Resolves the `{ start, end }` window of `line.tokens[tokenIndex]`.
 *
 * Explicit token times are used when available. Missing values fall back to
 * neighbouring tokens, then to an even split of the line window. When the
 * explicit starts or ends are "collapsed" (very few distinct values across the
 * tokens) the even split is used for every token.
 *
 * @param {object} line - karaoke line with `start`, `end` and `tokens`
 * @param {number} tokenIndex - index into `line.tokens`
 * @param {?number} lineEndFallback - line end to use when `line.end` is missing
 * @returns {{start: ?number, end: ?number}} both `null` when the token is absent
 */
export const resolveKaraokeTokenWindow = (
  line,
  tokenIndex,
  lineEndFallback = null,
) => {
  const tokens = Array.isArray(line?.tokens) ? line.tokens : []
  const token = tokens[tokenIndex]
  if (!token) {
    return { start: null, end: null }
  }

  const tokenCount = tokens.length
  const prevToken = tokenIndex > 0 ? tokens[tokenIndex - 1] : null
  const nextToken = tokenIndex + 1 < tokenCount ? tokens[tokenIndex + 1] : null

  const lineStart = toTime(line?.start)
  const lineEnd = toTime(line?.end) ?? toTime(lineEndFallback)
  const hasLineWindow =
    lineStart != null && lineEnd != null && lineEnd > lineStart

  // Boundary `position` of an even split of the line window into tokenCount parts.
  const estimateAt = (position) =>
    hasLineWindow
      ? lineStart + ((lineEnd - lineStart) * position) / tokenCount
      : null
  const estimatedStart = estimateAt(tokenIndex)
  const estimatedEnd = estimateAt(tokenIndex + 1)

  let explicitStartCount = 0
  let explicitEndCount = 0
  const uniqueStarts = new Set()
  const uniqueEnds = new Set()
  for (const candidate of tokens) {
    const explicitStart = toTime(candidate?.start)
    if (explicitStart != null) {
      explicitStartCount += 1
      uniqueStarts.add(explicitStart)
    }
    const explicitEnd = toTime(candidate?.end)
    if (explicitEnd != null) {
      explicitEndCount += 1
      uniqueEnds.add(explicitEnd)
    }
  }

  const collapseLimit = Math.max(1, tokenCount / 4)
  const collapsedStarts =
    explicitStartCount > 1 && uniqueStarts.size <= collapseLimit
  const collapsedEnds = explicitEndCount > 1 && uniqueEnds.size <= collapseLimit
  if (hasLineWindow && tokenCount > 1 && (collapsedStarts || collapsedEnds)) {
    return { start: estimatedStart, end: estimatedEnd }
  }

  let start = toTime(token.start)
  if (start == null) {
    const prevEnd = toTime(prevToken?.end) ?? toTime(prevToken?.start)
    start = prevEnd ?? estimatedStart ?? lineStart
  }

  let end = toTime(token.end)
  if (end == null) {
    end = toTime(nextToken?.start) ?? estimatedEnd ?? lineEnd
  }

  // A lone token with no usable window of its own takes the whole line window.
  if (
    tokenCount === 1 &&
    hasLineWindow &&
    (start == null || end == null || end <= start + 1)
  ) {
    start = lineStart
    end = lineEnd
  }

  if (start != null && end != null && end < start) {
    end = start
  }

  return { start, end }
}

/**
 * Finds the active line and token for the given playback position.
 *
 * @param {Array} lines - karaoke lines as produced by buildKaraokeLines
 * @param {number} currentTimeMs - playback position in ms (non-numeric → 0)
 * @returns {{lineIndex: number, tokenIndex: number}} `-1` for "none"; with a
 *   non-empty list `lineIndex` is always a valid index
 */
export const getActiveKaraokeState = (lines, currentTimeMs) => {
  if (!Array.isArray(lines) || lines.length === 0) {
    return { lineIndex: -1, tokenIndex: -1 }
  }

  const current = toTime(currentTimeMs) ?? 0

  // Last line whose start has been reached (untimed lines count as reached).
  let lineIndex = 0
  for (let i = 0; i < lines.length; i += 1) {
    const lineStart = toTime(lines[i]?.start)
    if (lineStart == null || lineStart <= current + KARAOKE_SWITCH_EPSILON_MS) {
      lineIndex = i
      continue
    }
    break
  }

  // Walk back to the nearest line whose window still contains the position.
  for (let i = lineIndex; i >= 0; i -= 1) {
    const lineStart = toTime(lines[i]?.start)
    const lineEnd = toTime(lines[i]?.end) ?? toTime(lines[i + 1]?.start)
    if (lineStart != null && current + KARAOKE_SWITCH_EPSILON_MS < lineStart) {
      continue
    }
    if (lineEnd == null || current <= lineEnd + KARAOKE_SWITCH_EPSILON_MS) {
      lineIndex = i
      break
    }
  }

  const activeLine = lines[lineIndex] || null
  const tokens = Array.isArray(activeLine?.tokens) ? activeLine.tokens : []
  let tokenIndex = -1
  for (let i = 0; i < tokens.length; i += 1) {
    const { start: tokenStart, end: tokenEnd } = resolveKaraokeTokenWindow(
      activeLine,
      i,
      lines[lineIndex + 1]?.start,
    )
    if (
      tokenStart == null ||
      tokenStart <= current + KARAOKE_SWITCH_EPSILON_MS
    ) {
      tokenIndex = i
      if (tokenEnd != null && current <= tokenEnd + KARAOKE_SWITCH_EPSILON_MS) {
        break
      }
      continue
    }
    break
  }

  return { lineIndex, tokenIndex }
}

/**
 * Finds the line in `layerLines` (e.g. a translation track) that best matches
 * `mainLines[mainIndex]` by timing. Overlapping windows are scored by start
 * distance plus a small index-distance penalty; non-overlapping lines are only
 * considered when their start is within a bounded distance of the main start.
 *
 * @returns {number} index into `layerLines`, or `-1` when nothing matches
 */
export const findLayerLineIndexForMain = (mainLines, layerLines, mainIndex) => {
  if (
    !Array.isArray(mainLines) ||
    !Array.isArray(layerLines) ||
    mainLines.length === 0 ||
    layerLines.length === 0 ||
    mainIndex < 0 ||
    mainIndex >= mainLines.length
  ) {
    return -1
  }

  const { start: mainStart, end: mainEnd } = lineTimeWindow(
    mainLines,
    mainIndex,
  )
  if (mainStart == null) {
    return -1
  }

  const mainWindowEnd = mainEnd ?? mainStart
  const mainWindowDuration = Math.max(0, mainWindowEnd - mainStart)
  const maxDelta = Math.max(550, Math.min(1400, mainWindowDuration + 420))

  let bestIdx = -1
  let bestScore = Number.POSITIVE_INFINITY

  for (let i = 0; i < layerLines.length; i += 1) {
    const { start, end } = lineTimeWindow(layerLines, i)

    if (start != null && end != null) {
      const overlap = Math.min(end, mainEnd ?? end) - Math.max(start, mainStart)
      if (overlap >= 0) {
        const score = Math.abs(start - mainStart) + Math.abs(i - mainIndex) * 30
        if (score < bestScore) {
          bestScore = score
          bestIdx = i
        }
        continue
      }
    }

    if (start != null) {
      if (Math.abs(start - mainStart) > maxDelta) {
        continue
      }
      const score = Math.abs(start - mainStart) + Math.abs(i - mainIndex) * 45
      if (score < bestScore) {
        bestScore = score
        bestIdx = i
      }
    }
  }

  return bestIdx
}

/**
 * Like findLayerLineIndexForMain, but also returns the matched line.
 *
 * @returns {{index: number, line: ?object}} `index` is `-1` and `line` is
 *   `null` when nothing matches
 */
export const resolveLayerLineForMain = (mainLines, layerLines, mainIndex) => {
  const index = findLayerLineIndexForMain(mainLines, layerLines, mainIndex)
  return {
    index,
    line: index >= 0 ? layerLines[index] : null,
  }
}
expect(selected.lang).toBe('eng') + }) + + it('falls back to first synced track when english is missing', () => { + const selected = pickStructuredLyric( + [ + { + lang: 'jpn', + synced: true, + line: [{ start: 1000, value: 'Nihongo' }], + }, + { + lang: 'deu', + synced: true, + line: [{ start: 1000, value: 'Deutsch' }], + }, + ], + 'pt-BR', + ) + + expect(selected.lang).toBe('jpn') + }) + + it('selects translation and pronunciation layers by kind', () => { + const layers = selectLyricLayers( + [ + { + kind: 'main', + lang: 'ja', + synced: true, + line: [{ start: 1000, value: 'こんにちは' }], + }, + { + kind: 'translation', + lang: 'es', + synced: true, + line: [{ start: 1000, value: 'Hola' }], + }, + { + kind: 'pronunciation', + lang: 'ja-Latn', + synced: true, + line: [{ start: 1000, value: 'konnichiwa' }], + }, + ], + 'es-MX', + ) + + expect(layers.main.lang).toBe('ja') + expect(layers.translation.lang).toBe('es') + expect(layers.pronunciation.lang).toBe('ja-Latn') + }) + + it('treats missing kind as main for backward compatibility', () => { + const layers = selectLyricLayers( + [ + { + lang: 'eng', + synced: true, + line: [{ start: 1000, value: 'Main' }], + }, + ], + 'eng', + ) + + expect(layers.main.lang).toBe('eng') + expect(layers.translation).toBeNull() + expect(layers.pronunciation).toBeNull() + }) + + it('matches layer line by timing for the active main line', () => { + const mainLines = [ + { index: 0, start: 1000, end: 1800, value: 'Line A', tokens: [] }, + { index: 1, start: 2000, end: 2800, value: 'Line B', tokens: [] }, + ] + const layerLines = [ + { index: 0, start: 900, end: 1750, value: 'A2', tokens: [] }, + { index: 1, start: 2050, end: 2900, value: 'B2', tokens: [] }, + ] + + expect(findLayerLineIndexForMain(mainLines, layerLines, 1)).toBe(1) + expect(resolveLayerLineForMain(mainLines, layerLines, 0).line.value).toBe( + 'A2', + ) + }) + + it('matches metadata layers by nearest timing even when indexes differ', () => { + const mainLines = [ + { index: 
0, start: 1000, end: 1800, value: 'Line A', tokens: [] }, + { index: 1, start: 2000, end: 2800, value: 'Line B', tokens: [] }, + { index: 2, start: 3000, end: 3800, value: 'Line C', tokens: [] }, + ] + const layerLines = [ + { index: 2, start: 3020, end: 3820, value: 'C2', tokens: [] }, + { index: 0, start: 980, end: 1760, value: 'A2', tokens: [] }, + { index: 1, start: 2010, end: 2810, value: 'B2', tokens: [] }, + ] + + expect(findLayerLineIndexForMain(mainLines, layerLines, 1)).toBe(2) + expect(resolveLayerLineForMain(mainLines, layerLines, 2).line.value).toBe( + 'C2', + ) + }) + + it('returns no layer match when the nearest line is too far in time', () => { + const mainLines = [ + { index: 0, start: 1000, end: 1800, value: 'Line A', tokens: [] }, + { index: 1, start: 2000, end: 2800, value: 'Line B', tokens: [] }, + ] + const layerLines = [ + { index: 0, start: 60000, end: 60800, value: 'Far line', tokens: [] }, + ] + + expect(findLayerLineIndexForMain(mainLines, layerLines, 1)).toBe(-1) + expect(resolveLayerLineForMain(mainLines, layerLines, 1).line).toBeNull() + }) + + it('converts a structured lyric track to LRC', () => { + const lrc = structuredLyricToLrc({ + lang: 'eng', + synced: true, + line: [ + { start: 18800, value: "We're no strangers to love" }, + { start: 22801, value: 'You know the rules and so do I' }, + ], + }) + + expect(lrc).toBe( + "[00:18.80] We're no strangers to love\n[00:22.80] You know the rules and so do I\n", + ) + }) + + it('returns empty text when no synced lyrics are available', () => { + const lrc = structuredLyricsToLrc( + [{ lang: 'eng', synced: false, line: [{ value: 'Unsynced line' }] }], + 'eng', + ) + + expect(lrc).toBe('') + }) + + it('reads preferred language from localStorage first', () => { + localStorage.setItem('locale', 'pt-BR') + expect(getPreferredLyricLanguage()).toBe('pt-BR') + }) + + it('builds karaoke lines from tokenLine payload', () => { + const lines = buildKaraokeLines({ + lang: 'eng', + synced: true, + line: 
[{ start: 1000, end: 3000, value: 'Hello world' }], + tokenLine: [ + { + index: 0, + start: 1000, + end: 3000, + value: 'Hello world', + token: [ + { start: 1000, end: 1500, value: 'Hello' }, + { start: 2000, end: 2500, value: 'world', role: 'x-bg' }, + ], + }, + ], + }) + + expect(lines).toEqual([ + { + index: 0, + start: 1000, + end: 3000, + value: 'Hello world', + tokens: [ + { start: 1000, end: 1500, value: 'Hello', role: '' }, + { start: 2000, end: 2500, value: 'world', role: 'x-bg' }, + ], + }, + ]) + }) + + it('sorts token timing by start to keep playback stable', () => { + const lines = buildKaraokeLines({ + lang: 'eng', + synced: true, + line: [{ start: 1000, end: 3000, value: 'Hello world' }], + tokenLine: [ + { + index: 0, + start: 1000, + end: 3000, + value: 'Hello world', + token: [ + { start: 2000, end: 2500, value: 'world', role: '' }, + { start: 1000, end: 1500, value: 'Hello', role: '' }, + ], + }, + ], + }) + + expect(lines[0].tokens.map((token) => token.value)).toEqual([ + 'Hello', + 'world', + ]) + }) + + it('splits a single full-line token into synthetic word tokens', () => { + const lines = buildKaraokeLines({ + lang: 'ko-Latn', + synced: true, + line: [{ start: 1000, end: 2000, value: 'Da-la-lun, dun' }], + tokenLine: [ + { + index: 0, + start: 1000, + end: 2000, + value: 'Da-la-lun, dun', + token: [{ start: 1000, end: 2000, value: 'Da-la-lun, dun' }], + }, + ], + }) + + expect(lines).toHaveLength(1) + expect(lines[0].tokens).toHaveLength(2) + expect(lines[0].tokens[0].value).toBe('Da-la-lun, ') + expect(lines[0].tokens[1].value).toBe('dun') + + const firstWindow = resolveKaraokeTokenWindow(lines[0], 0) + const secondWindow = resolveKaraokeTokenWindow(lines[0], 1) + + expect(firstWindow.start).toBeCloseTo(1000) + expect(firstWindow.end).toBeCloseTo(1500) + expect(secondWindow.start).toBeCloseTo(1500) + expect(secondWindow.end).toBeCloseTo(2000) + }) + + it('detects active line and token for karaoke timing', () => { + const state = 
getActiveKaraokeState( + [ + { + index: 0, + start: 1000, + end: 3000, + value: 'Hello world', + tokens: [ + { start: 1000, end: 1500, value: 'Hello', role: '' }, + { start: 2000, end: 2500, value: 'world', role: '' }, + ], + }, + { + index: 1, + start: 3500, + end: 5000, + value: 'Second line', + tokens: [], + }, + ], + 2200, + ) + + expect(state).toEqual({ lineIndex: 0, tokenIndex: 1 }) + }) + + it('resolves token window fallback boundaries from neighboring tokens', () => { + const line = { + start: 1000, + end: 3000, + value: 'Hello world', + tokens: [ + { start: 1200, value: 'Hello', role: '' }, + { start: 1800, value: 'world', role: '' }, + ], + } + + expect(resolveKaraokeTokenWindow(line, 0)).toEqual({ + start: 1200, + end: 1800, + }) + expect(resolveKaraokeTokenWindow(line, 1)).toEqual({ + start: 1800, + end: 3000, + }) + }) + + it('infers sequential token windows when token timings are missing', () => { + const line = { + start: 1000, + end: 2000, + value: 'A B C', + tokens: [ + { value: 'A', role: '' }, + { value: 'B', role: '' }, + { value: 'C', role: '' }, + ], + } + + const first = resolveKaraokeTokenWindow(line, 0) + const second = resolveKaraokeTokenWindow(line, 1) + const third = resolveKaraokeTokenWindow(line, 2) + + expect(first.start).toBeCloseTo(1000) + expect(first.end).toBeCloseTo(1333.3333333333333) + + expect(second.start).toBeCloseTo(1333.3333333333333) + expect(second.end).toBeCloseTo(1666.6666666666667) + + expect(third.start).toBeCloseTo(1666.6666666666667) + expect(third.end).toBeCloseTo(2000) + }) + + it('falls back to sequential windows when token timings are collapsed', () => { + const line = { + start: 1000, + end: 2000, + value: 'A B C', + tokens: [ + { start: 1000, end: 2000, value: 'A', role: '' }, + { start: 1000, end: 2000, value: 'B', role: '' }, + { start: 1000, end: 2000, value: 'C', role: '' }, + ], + } + + const first = resolveKaraokeTokenWindow(line, 0) + const second = resolveKaraokeTokenWindow(line, 1) + const third = 
resolveKaraokeTokenWindow(line, 2) + + expect(first.start).toBeCloseTo(1000) + expect(first.end).toBeCloseTo(1333.3333333333333) + expect(second.start).toBeCloseTo(1333.3333333333333) + expect(second.end).toBeCloseTo(1666.6666666666667) + expect(third.start).toBeCloseTo(1666.6666666666667) + expect(third.end).toBeCloseTo(2000) + }) + + it('keeps token selection stable near tight token boundaries', () => { + const state = getActiveKaraokeState( + [ + { + index: 0, + start: 1000, + end: 2000, + value: 'A B', + tokens: [ + { start: 1000, end: 1100, value: 'A', role: '' }, + { start: 1110, end: 1300, value: 'B', role: '' }, + ], + }, + ], + 1108, + ) + + expect(state).toEqual({ lineIndex: 0, tokenIndex: 0 }) + }) + + it('reports structured lyric content when token timing exists', () => { + expect( + hasStructuredLyricContent({ + tokenLine: [{ token: [{ start: 100, value: 'a' }] }], + }), + ).toBe(true) + }) +}) diff --git a/ui/src/reducers/playerReducer.js b/ui/src/reducers/playerReducer.js index 466a3ec87..2c3b2ba7a 100644 --- a/ui/src/reducers/playerReducer.js +++ b/ui/src/reducers/playerReducer.js @@ -7,6 +7,7 @@ import { PLAYER_CURRENT, PLAYER_PLAY_NEXT, PLAYER_PLAY_TRACKS, + PLAYER_UPDATE_LYRIC, PLAYER_SET_TRACK, PLAYER_SET_VOLUME, PLAYER_SYNC_QUEUE, @@ -60,21 +61,25 @@ const mapToAudioLists = (item) => { let lyricText = '' if (lyrics) { - const structured = JSON.parse(lyrics) - for (const structuredLyric of structured) { - if (structuredLyric.synced) { - for (const line of structuredLyric.line) { - let time = Math.floor(line.start / 10) - const ms = time % 100 - time = Math.floor(time / 100) - const sec = time % 60 - time = Math.floor(time / 60) - const min = time % 60 + try { + const structured = JSON.parse(lyrics) + for (const structuredLyric of structured) { + if (structuredLyric.synced) { + for (const line of structuredLyric.line) { + let time = Math.floor(line.start / 10) + const ms = time % 100 + time = Math.floor(time / 100) + const sec = time % 60 + time 
= Math.floor(time / 60) + const min = time % 60 - ms.toString() - lyricText += `[${pad(min)}:${pad(sec)}.${pad(ms)}] ${line.value}\n` + ms.toString() + lyricText += `[${pad(min)}:${pad(sec)}.${pad(ms)}] ${line.value}\n` + } } } + } catch { + lyricText = '' } } @@ -206,6 +211,45 @@ const reduceMode = (state, { data: { mode } }) => { } } +const reduceUpdateLyric = (state, { data: { trackId, lyric } }) => { + if (!trackId) { + return state + } + + let changed = false + const queue = state.queue.map((item) => { + if (item.trackId !== trackId) { + return item + } + if (item.lyric === lyric) { + return item + } + changed = true + return { + ...item, + lyric, + } + }) + + if (!changed) { + return state + } + + const current = + state.current?.trackId === trackId + ? { + ...state.current, + lyric, + } + : state.current + + return { + ...state, + queue, + current, + } +} + export const playerReducer = (previousState = initialState, payload) => { const { type } = payload switch (type) { @@ -243,6 +287,8 @@ export const playerReducer = (previousState = initialState, payload) => { previousState.savedPlayIndex >= 0 ? 
previousState.savedPlayIndex : 0, } } + case PLAYER_UPDATE_LYRIC: + return reduceUpdateLyric(previousState, payload) default: return previousState } diff --git a/ui/src/reducers/playerReducer.test.js b/ui/src/reducers/playerReducer.test.js index 10e9512d7..1c399859d 100644 --- a/ui/src/reducers/playerReducer.test.js +++ b/ui/src/reducers/playerReducer.test.js @@ -1,11 +1,24 @@ -import { describe, it, expect } from 'vitest' +import { describe, expect, it, vi } from 'vitest' import { playerReducer } from './playerReducer' import { - PLAYER_SYNC_QUEUE, PLAYER_CURRENT, PLAYER_REFRESH_QUEUE, + PLAYER_SET_TRACK, + PLAYER_SYNC_QUEUE, + PLAYER_UPDATE_LYRIC, } from '../actions' +vi.mock('uuid', () => ({ + v4: () => 'test-uuid', +})) + +vi.mock('../subsonic', () => ({ + default: { + streamUrl: vi.fn((id) => `/rest/stream?id=${id}`), + getCoverArtUrl: vi.fn(() => '/rest/getCoverArt?id=test'), + }, +})) + describe('playerReducer', () => { describe('pending track selection survives SYNC_QUEUE and premature CURRENT', () => { // Simulates the real sequence when clicking a new song while one is playing: @@ -54,8 +67,6 @@ describe('playerReducer', () => { }) it('CURRENT for old track preserves pending playIndex', () => { - // After SYNC_QUEUE, queue has new UUIDs. The old track's UUID (zzz) - // is at index 2, but playIndex is 0. This is a premature callback. 
const stateAfterSync = { ...stateAfterPlayTracks, queue: [ @@ -71,7 +82,7 @@ describe('playerReducer', () => { const result = playerReducer(stateAfterSync, action) expect(result.playIndex).toBe(0) expect(result.clear).toBe(true) - expect(result.savedPlayIndex).toBe(2) // preserved from before + expect(result.savedPlayIndex).toBe(2) }) it('CURRENT for correct track consumes pending playIndex', () => { @@ -83,7 +94,6 @@ describe('playerReducer', () => { { trackId: 's3', uuid: 'zzz', name: 'Song 3' }, ], } - // Player switched to Song 1 (uuid 'xxx', index 0 == playIndex) const action = { type: PLAYER_CURRENT, data: { uuid: 'xxx', name: 'Song 1', volume: 1 }, @@ -142,4 +152,80 @@ describe('playerReducer', () => { expect(result.playIndex).toBe(0) }) }) + + it('maps embedded synced lyrics to LRC text', () => { + const lyrics = JSON.stringify([ + { + lang: 'eng', + synced: true, + line: [{ start: 1000, value: 'Line one' }], + }, + { + lang: 'eng', + synced: false, + line: [{ value: 'Unsynced line' }], + }, + ]) + + const state = playerReducer(undefined, { + type: PLAYER_SET_TRACK, + data: { + id: 'song-1', + title: 'Test Song', + artist: 'Test Artist', + album: 'Test Album', + duration: 60, + lyrics, + }, + }) + + expect(state.queue).toHaveLength(1) + expect(state.queue[0].lyric).toBe('[00:01.00] Line one\n') + }) + + it('updates queue lyric by track id', () => { + const initial = playerReducer(undefined, { + type: PLAYER_SET_TRACK, + data: { + id: 'song-1', + title: 'Test Song', + artist: 'Test Artist', + album: 'Test Album', + duration: 60, + }, + }) + + const updated = playerReducer(initial, { + type: PLAYER_UPDATE_LYRIC, + data: { + trackId: 'song-1', + lyric: '[00:01.00] Updated lyric\n', + }, + }) + + expect(updated.queue[0].lyric).toBe('[00:01.00] Updated lyric\n') + }) + + it('returns same state when lyric update does not match any track', () => { + const initial = playerReducer(undefined, { + type: PLAYER_SET_TRACK, + data: { + id: 'song-1', + title: 'Test Song', 
+ artist: 'Test Artist', + album: 'Test Album', + duration: 60, + }, + }) + + const updated = playerReducer(initial, { + type: PLAYER_UPDATE_LYRIC, + data: { + trackId: 'missing-track', + lyric: '[00:01.00] Updated lyric\n', + }, + }) + + expect(updated).toBe(initial) + }) }) diff --git a/ui/src/subsonic/index.js b/ui/src/subsonic/index.js index 3579619aa..b311d5e14 100644 --- a/ui/src/subsonic/index.js +++ b/ui/src/subsonic/index.js @@ -120,6 +120,10 @@ const getTopSongs = (artist, count = 50) => { return httpClient(url('getTopSongs', null, { artist, count })) } +const getLyricsBySongId = (id) => { + return httpClient(url('getLyricsBySongId', id)) +} + const streamUrl = (id, options) => { return baseUrl( url('stream', id, { @@ -149,4 +153,5 @@ export default { getArtistInfo, getTopSongs, getSimilarSongs2, + getLyricsBySongId, } diff --git a/ui/src/subsonic/index.test.js b/ui/src/subsonic/index.test.js index a750694f4..6910fdc8d 100644 --- a/ui/src/subsonic/index.test.js +++ b/ui/src/subsonic/index.test.js @@ -1,7 +1,12 @@ import { vi } from 'vitest' -import config from '../config' +import { COVER_ART_SIZE } from '../consts' +import { httpClient } from '../dataProvider' import subsonic from './index' +vi.mock('../dataProvider', () => ({ + httpClient: vi.fn(() => Promise.resolve({})), +})) + describe('getCoverArtUrl', () => { beforeEach(() => { // Mock window.location @@ -31,11 +36,7 @@ describe('getCoverArtUrl', () => { updatedAt: '2023-01-01T00:00:00Z', } - const url = subsonic.getCoverArtUrl( - playlistRecord, - config.uiCoverArtSize, - true, - ) + const url = subsonic.getCoverArtUrl(playlistRecord, COVER_ART_SIZE, true) expect(url).toContain('pl-playlist-123') expect(url).toContain('size=600') @@ -49,11 +50,7 @@ describe('getCoverArtUrl', () => { sync: true, } - const url = subsonic.getCoverArtUrl( - playlistRecord, - config.uiCoverArtSize, - true, - ) + const url = subsonic.getCoverArtUrl(playlistRecord, COVER_ART_SIZE, true) 
expect(url).toContain('pl-playlist-123') expect(url).toContain('size=600') @@ -68,11 +65,7 @@ describe('getCoverArtUrl', () => { updatedAt: '2023-01-01T00:00:00Z', } - const url = subsonic.getCoverArtUrl( - albumRecord, - config.uiCoverArtSize, - true, - ) + const url = subsonic.getCoverArtUrl(albumRecord, COVER_ART_SIZE, true) expect(url).toContain('al-album-123') expect(url).toContain('size=600') @@ -86,7 +79,7 @@ describe('getCoverArtUrl', () => { updatedAt: '2023-01-01T00:00:00Z', } - const url = subsonic.getCoverArtUrl(songRecord, config.uiCoverArtSize, true) + const url = subsonic.getCoverArtUrl(songRecord, COVER_ART_SIZE, true) expect(url).toContain('mf-song-123') expect(url).toContain('size=600') @@ -99,11 +92,7 @@ describe('getCoverArtUrl', () => { updatedAt: '2023-01-01T00:00:00Z', } - const url = subsonic.getCoverArtUrl( - artistRecord, - config.uiCoverArtSize, - true, - ) + const url = subsonic.getCoverArtUrl(artistRecord, COVER_ART_SIZE, true) expect(url).toContain('ar-artist-123') expect(url).toContain('size=600') @@ -194,3 +183,30 @@ describe('getAvatarUrl', () => { expect(url).toContain('username=john') }) }) + +describe('getLyricsBySongId', () => { + beforeEach(() => { + vi.clearAllMocks() + const localStorageMock = { + getItem: vi.fn((key) => { + const values = { + username: 'testuser', + 'subsonic-token': 'testtoken', + 'subsonic-salt': 'testsalt', + } + return values[key] || null + }), + } + Object.defineProperty(window, 'localStorage', { value: localStorageMock }) + }) + + it('calls the getLyricsBySongId endpoint with enhanced=true', async () => { + await subsonic.getLyricsBySongId('song-1') + + expect(httpClient).toHaveBeenCalledTimes(1) + const calledUrl = httpClient.mock.calls[0][0] + expect(calledUrl).toContain('/rest/getLyricsBySongId?') + expect(calledUrl).toContain('id=song-1') + expect(calledUrl).toContain('enhanced=true') + }) +}) From 2f6f0bca797843a4fad08a43c7e1a0d3810b1555 Mon Sep 17 00:00:00 2001 From: ranokay Date: Sun, 22 Feb 
2026 14:53:27 +0200 Subject: [PATCH 02/14] test: rewrite TTML tests to Ginkgo/Gomega framework - Convert ttml_test.go from testing.T/Fatalf to Ginkgo Describe/It with Gomega matchers - Remove sources_ttml_test.go (duplicate tests already exist in sources_test.go using Ginkgo) - All 26 lyrics specs pass --- core/lyrics/sources_ttml_test.go | 92 ------- core/lyrics/ttml_test.go | 425 ++++++++++++------------------- 2 files changed, 165 insertions(+), 352 deletions(-) delete mode 100644 core/lyrics/sources_ttml_test.go diff --git a/core/lyrics/sources_ttml_test.go b/core/lyrics/sources_ttml_test.go deleted file mode 100644 index 217bf7b36..000000000 --- a/core/lyrics/sources_ttml_test.go +++ /dev/null @@ -1,92 +0,0 @@ -package lyrics - -import ( - "context" - "os" - "path/filepath" - "testing" - - "github.com/navidrome/navidrome/model" -) - -func TestFromExternalFileTTML(t *testing.T) { - ctx := context.Background() - mf := model.MediaFile{Path: fixturePath("test.mp3")} - - lyrics, err := fromExternalFile(ctx, &mf, ".ttml") - if err != nil { - t.Fatalf("fromExternalFile returned error: %v", err) - } - if len(lyrics) != 2 { - t.Fatalf("expected 2 lyric tracks, got %d", len(lyrics)) - } - if lyrics[0].Lang != "eng" { - t.Fatalf("expected first language 'eng', got %q", lyrics[0].Lang) - } - if len(lyrics[0].Line) != 2 { - t.Fatalf("expected 2 english lines, got %d", len(lyrics[0].Line)) - } - if lyrics[0].Line[0].Start == nil || *lyrics[0].Line[0].Start != 18800 { - t.Fatalf("expected first english line start to be 18800, got %v", lyrics[0].Line[0].Start) - } -} - -func TestFromExternalFileTTMLWithUTF8BOM(t *testing.T) { - ctx := context.Background() - mf := model.MediaFile{Path: fixturePath("bom-test.ttml")} - - lyrics, err := fromExternalFile(ctx, &mf, ".ttml") - if err != nil { - t.Fatalf("fromExternalFile returned error: %v", err) - } - if len(lyrics) != 1 { - t.Fatalf("expected 1 lyric track, got %d", len(lyrics)) - } - if !lyrics[0].Synced { - t.Fatal("expected BOM 
TTML lyrics to be synced") - } - if len(lyrics[0].Line) != 1 { - t.Fatalf("expected 1 lyric line, got %d", len(lyrics[0].Line)) - } - if lyrics[0].Line[0].Start == nil || *lyrics[0].Line[0].Start != 0 { - t.Fatalf("expected first line start 0, got %v", lyrics[0].Line[0].Start) - } -} - -func TestFromExternalFileTTMLUTF16(t *testing.T) { - ctx := context.Background() - mf := model.MediaFile{Path: fixturePath("bom-utf16-test.ttml")} - - lyrics, err := fromExternalFile(ctx, &mf, ".ttml") - if err != nil { - t.Fatalf("fromExternalFile returned error: %v", err) - } - if len(lyrics) != 1 { - t.Fatalf("expected 1 lyric track, got %d", len(lyrics)) - } - if !lyrics[0].Synced { - t.Fatal("expected UTF16 TTML lyrics to be synced") - } - if len(lyrics[0].Line) != 2 { - t.Fatalf("expected 2 lyric lines, got %d", len(lyrics[0].Line)) - } - if lyrics[0].Line[0].Start == nil || *lyrics[0].Line[0].Start != 18800 { - t.Fatalf("expected first line start 18800, got %v", lyrics[0].Line[0].Start) - } - if lyrics[0].Line[1].Start == nil || *lyrics[0].Line[1].Start != 22801 { - t.Fatalf("expected second line start 22801, got %v", lyrics[0].Line[1].Start) - } -} - -func fixturePath(name string) string { - candidates := []string{ - filepath.Join("tests", "fixtures", name), - filepath.Join("..", "..", "tests", "fixtures", name), - } - for _, candidate := range candidates { - if _, err := os.Stat(candidate); err == nil { - return candidate - } - } - return filepath.Join("tests", "fixtures", name) -} diff --git a/core/lyrics/ttml_test.go b/core/lyrics/ttml_test.go index 12270c27d..c8596243b 100644 --- a/core/lyrics/ttml_test.go +++ b/core/lyrics/ttml_test.go @@ -1,13 +1,16 @@ package lyrics import ( - "testing" - "github.com/navidrome/navidrome/model" + "github.com/navidrome/navidrome/utils/gg" + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" ) -func TestParseTTML_MultiLanguageAndTiming(t *testing.T) { - content := []byte(` +var _ = Describe("parseTTML", func() { + Describe("Multi-language and timing", func() { + It("should parse multiple language divs with inherited offsets and frame/tick timing", func() { + content := []byte(`
    @@ -20,33 +23,30 @@ func TestParseTTML_MultiLanguageAndTiming(t *testing.T) { `) - list, err := parseTTML(content) - if err != nil { - t.Fatalf("parseTTML returned error: %v", err) - } - if len(list) != 2 { - t.Fatalf("expected 2 lyric tracks, got %d", len(list)) - } + list, err := parseTTML(content) + Expect(err).ToNot(HaveOccurred()) + Expect(list).To(HaveLen(2)) - eng := list[0] - if eng.Lang != "eng" { - t.Fatalf("expected first track language 'eng', got %q", eng.Lang) - } - if !eng.Synced { - t.Fatal("expected first track to be synced") - } - assertTimedLine(t, eng.Line[0], 3000, "Line one") - assertTimedLine(t, eng.Line[1], 4517, "Line two\nwith break") + By("parsing the English track") + eng := list[0] + Expect(eng.Lang).To(Equal("eng")) + Expect(eng.Synced).To(BeTrue()) + Expect(eng.Line[0].Start).To(Equal(gg.P(int64(3000)))) + Expect(eng.Line[0].Value).To(Equal("Line one")) + Expect(eng.Line[1].Start).To(Equal(gg.P(int64(4517)))) + Expect(eng.Line[1].Value).To(Equal("Line two\nwith break")) - por := list[1] - if por.Lang != "por" { - t.Fatalf("expected second track language 'por', got %q", por.Lang) - } - assertTimedLine(t, por.Line[0], 4500, "Linha") -} + By("parsing the Portuguese track") + por := list[1] + Expect(por.Lang).To(Equal("por")) + Expect(por.Line[0].Start).To(Equal(gg.P(int64(4500)))) + Expect(por.Line[0].Value).To(Equal("Linha")) + }) + }) -func TestParseTTML_UnsupportedCueSkipped(t *testing.T) { - content := []byte(` + Describe("Unsupported cue handling", func() { + It("should skip wallclock cues and keep valid ones", func() { + content := []byte(`
    @@ -56,21 +56,18 @@ func TestParseTTML_UnsupportedCueSkipped(t *testing.T) { `) - list, err := parseTTML(content) - if err != nil { - t.Fatalf("parseTTML returned error: %v", err) - } - if len(list) != 1 { - t.Fatalf("expected 1 lyric track, got %d", len(list)) - } - if len(list[0].Line) != 1 { - t.Fatalf("expected 1 line in lyric track, got %d", len(list[0].Line)) - } - assertTimedLine(t, list[0].Line[0], 1000, "Keep me") -} + list, err := parseTTML(content) + Expect(err).ToNot(HaveOccurred()) + Expect(list).To(HaveLen(1)) + Expect(list[0].Line).To(HaveLen(1)) + Expect(list[0].Line[0].Start).To(Equal(gg.P(int64(1000)))) + Expect(list[0].Line[0].Value).To(Equal("Keep me")) + }) + }) -func TestParseTTML_BeginEndDurWithInheritance(t *testing.T) { - content := []byte(` + Describe("Begin/End/Dur with inheritance", func() { + It("should correctly accumulate nested timing from body, div, and p elements", func() { + content := []byte(`
    @@ -80,25 +77,21 @@ func TestParseTTML_BeginEndDurWithInheritance(t *testing.T) { `) - list, err := parseTTML(content) - if err != nil { - t.Fatalf("parseTTML returned error: %v", err) - } - if len(list) != 1 { - t.Fatalf("expected 1 lyric track, got %d", len(list)) - } - if list[0].Lang != "eng" { - t.Fatalf("expected language 'eng', got %q", list[0].Lang) - } - if len(list[0].Line) != 2 { - t.Fatalf("expected 2 lines, got %d", len(list[0].Line)) - } - assertTimedLine(t, list[0].Line[0], 16000, "First line") - assertTimedLine(t, list[0].Line[1], 18000, "Second line") -} + list, err := parseTTML(content) + Expect(err).ToNot(HaveOccurred()) + Expect(list).To(HaveLen(1)) + Expect(list[0].Lang).To(Equal("eng")) + Expect(list[0].Line).To(HaveLen(2)) + Expect(list[0].Line[0].Start).To(Equal(gg.P(int64(16000)))) + Expect(list[0].Line[0].Value).To(Equal("First line")) + Expect(list[0].Line[1].Start).To(Equal(gg.P(int64(18000)))) + Expect(list[0].Line[1].Value).To(Equal("Second line")) + }) + }) -func TestParseTTML_NonStandardBareSecondOffsets(t *testing.T) { - content := []byte(` + Describe("Non-standard bare second offsets", func() { + It("should parse bare decimal numbers as seconds", func() { + content := []byte(`
    @@ -108,22 +101,20 @@ func TestParseTTML_NonStandardBareSecondOffsets(t *testing.T) { `) - list, err := parseTTML(content) - if err != nil { - t.Fatalf("parseTTML returned error: %v", err) - } - if len(list) != 1 { - t.Fatalf("expected 1 lyric track, got %d", len(list)) - } - if len(list[0].Line) != 2 { - t.Fatalf("expected 2 lines, got %d", len(list[0].Line)) - } - assertTimedLine(t, list[0].Line[0], 10170, "First line") - assertTimedLine(t, list[0].Line[1], 13710, "Second line") -} + list, err := parseTTML(content) + Expect(err).ToNot(HaveOccurred()) + Expect(list).To(HaveLen(1)) + Expect(list[0].Line).To(HaveLen(2)) + Expect(list[0].Line[0].Start).To(Equal(gg.P(int64(10170)))) + Expect(list[0].Line[0].Value).To(Equal("First line")) + Expect(list[0].Line[1].Start).To(Equal(gg.P(int64(13710)))) + Expect(list[0].Line[1].Value).To(Equal("Second line")) + }) + }) -func TestParseTTML_WordTimingTokens(t *testing.T) { - content := []byte(` + Describe("Word timing tokens", func() { + It("should extract timed tokens from spans including background role", func() { + content := []byte(`
    @@ -135,33 +126,26 @@ func TestParseTTML_WordTimingTokens(t *testing.T) { `) - list, err := parseTTML(content) - if err != nil { - t.Fatalf("parseTTML returned error: %v", err) - } - if len(list) != 1 { - t.Fatalf("expected 1 lyric track, got %d", len(list)) - } - if len(list[0].Line) != 1 { - t.Fatalf("expected 1 line, got %d", len(list[0].Line)) - } + list, err := parseTTML(content) + Expect(err).ToNot(HaveOccurred()) + Expect(list).To(HaveLen(1)) + Expect(list[0].Line).To(HaveLen(1)) - line := list[0].Line[0] - assertTimedLine(t, line, 1000, "Hello\necho") - if line.End == nil || *line.End != 3000 { - t.Fatalf("expected line end 3000, got %v", line.End) - } - if len(line.Token) != 3 { - t.Fatalf("expected 3 timed tokens, got %d", len(line.Token)) - } + line := list[0].Line[0] + Expect(line.Start).To(Equal(gg.P(int64(1000)))) + Expect(line.Value).To(Equal("Hello\necho")) + Expect(line.End).To(Equal(gg.P(int64(3000)))) + Expect(line.Token).To(HaveLen(3)) - assertToken(t, line.Token[0], 1000, 1400, "He", "") - assertToken(t, line.Token[1], 1400, 1800, "llo", "") - assertToken(t, line.Token[2], 2000, 2500, "echo", "x-bg") -} + Expect(line.Token[0]).To(Equal(model.Token{Start: gg.P(int64(1000)), End: gg.P(int64(1400)), Value: "He"})) + Expect(line.Token[1]).To(Equal(model.Token{Start: gg.P(int64(1400)), End: gg.P(int64(1800)), Value: "llo"})) + Expect(line.Token[2]).To(Equal(model.Token{Start: gg.P(int64(2000)), End: gg.P(int64(2500)), Value: "echo", Role: "x-bg"})) + }) + }) -func TestParseTTML_AmbiguousDecimalTimingPrefersAbsoluteWhenInsideParentWindow(t *testing.T) { - content := []byte(` + Describe("Ambiguous decimal timing", func() { + It("should prefer absolute timing when values fall inside parent window", func() { + content := []byte(`
    @@ -173,28 +157,24 @@ func TestParseTTML_AmbiguousDecimalTimingPrefersAbsoluteWhenInsideParentWindow(t `) - list, err := parseTTML(content) - if err != nil { - t.Fatalf("parseTTML returned error: %v", err) - } - if len(list) != 1 || len(list[0].Line) != 1 { - t.Fatalf("expected one parsed lyric line, got %#v", list) - } + list, err := parseTTML(content) + Expect(err).ToNot(HaveOccurred()) + Expect(list).To(HaveLen(1)) + Expect(list[0].Line).To(HaveLen(1)) - line := list[0].Line[0] - assertTimedLine(t, line, 43444, "go\ngo") - if line.End == nil || *line.End != 45570 { - t.Fatalf("expected line end 45570, got %v", line.End) - } - if len(line.Token) != 2 { - t.Fatalf("expected 2 timed tokens, got %d", len(line.Token)) - } - assertToken(t, line.Token[0], 43444, 43716, "go", "") - assertToken(t, line.Token[1], 43716, 43887, "go", "") -} + line := list[0].Line[0] + Expect(line.Start).To(Equal(gg.P(int64(43444)))) + Expect(line.Value).To(Equal("go\ngo")) + Expect(line.End).To(Equal(gg.P(int64(45570)))) + Expect(line.Token).To(HaveLen(2)) + Expect(line.Token[0]).To(Equal(model.Token{Start: gg.P(int64(43444)), End: gg.P(int64(43716)), Value: "go"})) + Expect(line.Token[1]).To(Equal(model.Token{Start: gg.P(int64(43716)), End: gg.P(int64(43887)), Value: "go"})) + }) + }) -func TestParseTTML_UnsyncedFallback(t *testing.T) { - content := []byte(` + Describe("Unsynced fallback", func() { + It("should return unsynced lyrics when no timing is present", func() { + content := []byte(`
    @@ -203,32 +183,20 @@ func TestParseTTML_UnsyncedFallback(t *testing.T) { `) - list, err := parseTTML(content) - if err != nil { - t.Fatalf("parseTTML returned error: %v", err) - } - if len(list) != 1 { - t.Fatalf("expected 1 lyric track, got %d", len(list)) - } - if list[0].Lang != "xxx" { - t.Fatalf("expected default language 'xxx', got %q", list[0].Lang) - } - if list[0].Synced { - t.Fatal("expected lyric track to be unsynced") - } - if len(list[0].Line) != 1 { - t.Fatalf("expected 1 line, got %d", len(list[0].Line)) - } - if list[0].Line[0].Start != nil { - t.Fatalf("expected line start to be nil, got %v", *list[0].Line[0].Start) - } - if list[0].Line[0].Value != "No timing here" { - t.Fatalf("expected line value %q, got %q", "No timing here", list[0].Line[0].Value) - } -} + list, err := parseTTML(content) + Expect(err).ToNot(HaveOccurred()) + Expect(list).To(HaveLen(1)) + Expect(list[0].Lang).To(Equal("xxx")) + Expect(list[0].Synced).To(BeFalse()) + Expect(list[0].Line).To(HaveLen(1)) + Expect(list[0].Line[0].Start).To(BeNil()) + Expect(list[0].Line[0].Value).To(Equal("No timing here")) + }) + }) -func TestParseTTML_MetadataTracksByKey(t *testing.T) { - content := []byte(` + Describe("Metadata tracks", func() { + It("should produce main, translation, and pronunciation tracks from iTunesMetadata", func() { + content := []byte(` @@ -255,63 +223,42 @@ func TestParseTTML_MetadataTracksByKey(t *testing.T) { `) - list, err := parseTTML(content) - if err != nil { - t.Fatalf("parseTTML returned error: %v", err) - } - if len(list) != 3 { - t.Fatalf("expected 3 lyric tracks, got %d", len(list)) - } + list, err := parseTTML(content) + Expect(err).ToNot(HaveOccurred()) + Expect(list).To(HaveLen(3)) - main := list[0] - if main.Kind != "main" { - t.Fatalf("expected main track kind %q, got %q", "main", main.Kind) - } - if main.Lang != "ja" { - t.Fatalf("expected main track language %q, got %q", "ja", main.Lang) - } - if len(main.Line) != 2 { - t.Fatalf("expected 2 lines 
in main track, got %d", len(main.Line)) - } + By("checking the main track") + main := list[0] + Expect(main.Kind).To(Equal("main")) + Expect(main.Lang).To(Equal("ja")) + Expect(main.Line).To(HaveLen(2)) - translation := list[1] - if translation.Kind != "translation" { - t.Fatalf("expected translation kind %q, got %q", "translation", translation.Kind) - } - if translation.Lang != "es" { - t.Fatalf("expected translation language %q, got %q", "es", translation.Lang) - } - if len(translation.Line) != 1 { - t.Fatalf("expected 1 translation line, got %d", len(translation.Line)) - } - assertTimedLine(t, translation.Line[0], 1000, "Hola") - if translation.Line[0].End == nil || *translation.Line[0].End != 1500 { - t.Fatalf("expected translation line end %d, got %v", 1500, translation.Line[0].End) - } + By("checking the translation track") + translation := list[1] + Expect(translation.Kind).To(Equal("translation")) + Expect(translation.Lang).To(Equal("es")) + Expect(translation.Line).To(HaveLen(1)) + Expect(translation.Line[0].Start).To(Equal(gg.P(int64(1000)))) + Expect(translation.Line[0].Value).To(Equal("Hola")) + Expect(translation.Line[0].End).To(Equal(gg.P(int64(1500)))) - pronunciation := list[2] - if pronunciation.Kind != "pronunciation" { - t.Fatalf("expected pronunciation kind %q, got %q", "pronunciation", pronunciation.Kind) - } - if pronunciation.Lang != "ja-latn" { - t.Fatalf("expected pronunciation language %q, got %q", "ja-latn", pronunciation.Lang) - } - if len(pronunciation.Line) != 1 { - t.Fatalf("expected 1 pronunciation line, got %d", len(pronunciation.Line)) - } - assertTimedLine(t, pronunciation.Line[0], 2000, "konni") - if pronunciation.Line[0].End == nil || *pronunciation.Line[0].End != 2600 { - t.Fatalf("expected pronunciation line end %d, got %v", 2600, pronunciation.Line[0].End) - } - if len(pronunciation.Line[0].Token) != 2 { - t.Fatalf("expected 2 pronunciation tokens, got %d", len(pronunciation.Line[0].Token)) - } - assertToken(t, 
pronunciation.Line[0].Token[0], 2000, 2300, "ko", "") - assertToken(t, pronunciation.Line[0].Token[1], 2300, 2600, "nni", "") -} + By("checking the pronunciation track") + pronunciation := list[2] + Expect(pronunciation.Kind).To(Equal("pronunciation")) + Expect(pronunciation.Lang).To(Equal("ja-latn")) + Expect(pronunciation.Line).To(HaveLen(1)) + Expect(pronunciation.Line[0].Start).To(Equal(gg.P(int64(2000)))) + Expect(pronunciation.Line[0].Value).To(Equal("konni")) + Expect(pronunciation.Line[0].End).To(Equal(gg.P(int64(2600)))) + Expect(pronunciation.Line[0].Token).To(HaveLen(2)) + Expect(pronunciation.Line[0].Token[0]).To(Equal(model.Token{Start: gg.P(int64(2000)), End: gg.P(int64(2300)), Value: "ko"})) + Expect(pronunciation.Line[0].Token[1]).To(Equal(model.Token{Start: gg.P(int64(2300)), End: gg.P(int64(2600)), Value: "nni"})) + }) + }) -func TestParseTTML_PronunciationBareDecimalEndTimes(t *testing.T) { - content := []byte(` + Describe("Pronunciation with bare decimal end times", func() { + It("should correctly parse bare decimal times in transliteration spans", func() { + content := []byte(` @@ -331,68 +278,26 @@ func TestParseTTML_PronunciationBareDecimalEndTimes(t *testing.T) { `) - list, err := parseTTML(content) - if err != nil { - t.Fatalf("parseTTML returned error: %v", err) - } + list, err := parseTTML(content) + Expect(err).ToNot(HaveOccurred()) - var pronunciation *model.Lyrics - for i := range list { - if list[i].Kind == "pronunciation" { - pronunciation = &list[i] - break - } - } - if pronunciation == nil { - t.Fatal("expected a pronunciation track") - } - if len(pronunciation.Line) != 1 { - t.Fatalf("expected 1 pronunciation line, got %d", len(pronunciation.Line)) - } + var pronunciation *model.Lyrics + for i := range list { + if list[i].Kind == "pronunciation" { + pronunciation = &list[i] + break + } + } + Expect(pronunciation).ToNot(BeNil()) + Expect(pronunciation.Line).To(HaveLen(1)) - line := pronunciation.Line[0] - assertTimedLine(t, line, 
2747, "I woke up") - if len(line.Token) != 3 { - t.Fatalf("expected 3 tokens, got %d", len(line.Token)) - } - assertToken(t, line.Token[0], 2747, 3018, "I", "") - assertToken(t, line.Token[1], 3018, 3179, "woke", "") - assertToken(t, line.Token[2], 3179, 3582, "up", "") -} - -func assertTimedLine(t *testing.T, line model.Line, expectedStart int64, expectedValue string) { - t.Helper() - - if line.Start == nil { - t.Fatal("expected line start to be set, got nil") - } - if *line.Start != expectedStart { - t.Fatalf("expected line start %d, got %d", expectedStart, *line.Start) - } - if line.Value != expectedValue { - t.Fatalf("expected line value %q, got %q", expectedValue, line.Value) - } -} - -func assertToken(t *testing.T, token model.Token, expectedStart int64, expectedEnd int64, expectedValue string, expectedRole string) { - t.Helper() - - if token.Start == nil { - t.Fatal("expected token start to be set, got nil") - } - if *token.Start != expectedStart { - t.Fatalf("expected token start %d, got %d", expectedStart, *token.Start) - } - if token.End == nil { - t.Fatal("expected token end to be set, got nil") - } - if *token.End != expectedEnd { - t.Fatalf("expected token end %d, got %d", expectedEnd, *token.End) - } - if token.Value != expectedValue { - t.Fatalf("expected token value %q, got %q", expectedValue, token.Value) - } - if token.Role != expectedRole { - t.Fatalf("expected token role %q, got %q", expectedRole, token.Role) - } -} + line := pronunciation.Line[0] + Expect(line.Start).To(Equal(gg.P(int64(2747)))) + Expect(line.Value).To(Equal("I woke up")) + Expect(line.Token).To(HaveLen(3)) + Expect(line.Token[0]).To(Equal(model.Token{Start: gg.P(int64(2747)), End: gg.P(int64(3018)), Value: "I"})) + Expect(line.Token[1]).To(Equal(model.Token{Start: gg.P(int64(3018)), End: gg.P(int64(3179)), Value: "woke"})) + Expect(line.Token[2]).To(Equal(model.Token{Start: gg.P(int64(3179)), End: gg.P(int64(3582)), Value: "up"})) + }) + }) +}) From 
1d78373b778008f8e994dbe85cbd53d124696d27 Mon Sep 17 00:00:00 2001 From: ranokay Date: Sun, 22 Feb 2026 21:05:14 +0200 Subject: [PATCH 03/14] refactor: align with OpenSubsonic spec feedback - Rename token/tokenLine to cue/cueLine across Go backend and JS frontend - Move role from individual cue to cueLine level (server pre-splits by role) - Add enhanced query parameter to getLyricsBySongId for backward compat - Add enhanced=true to UI API client so translations/pronunciations load - Update all Go and JS tests to match new naming and structure --- core/lyrics/ttml.go | 28 +++---- core/lyrics/ttml_test.go | 28 +++---- model/lyrics.go | 10 +-- server/subsonic/helpers.go | 70 ++++++++++------ server/subsonic/media_retrieval.go | 4 +- server/subsonic/media_retrieval_test.go | 70 ++++++++-------- server/subsonic/responses/responses.go | 32 ++++---- ui/src/audioplayer/lyrics.js | 101 +++++++++++++++--------- ui/src/audioplayer/lyrics.test.js | 38 +++++---- ui/src/subsonic/index.js | 4 +- 10 files changed, 226 insertions(+), 159 deletions(-) diff --git a/core/lyrics/ttml.go b/core/lyrics/ttml.go index 3aae53aa0..a0bdcac5a 100644 --- a/core/lyrics/ttml.go +++ b/core/lyrics/ttml.go @@ -162,7 +162,7 @@ func (p *ttmlParser) parseElement(start xml.StartElement, parent ttmlTimingConte parsedLine.End = &endMs } if len(tokens) > 0 { - parsedLine.Token = tokens + parsedLine.Cue = tokens } parsedLine = hydrateLineTimingFromTokens(parsedLine) @@ -261,20 +261,20 @@ func (p *ttmlParser) parseMetadataText(start xml.StartElement, parent ttmlTiming line.End = &endMs } if len(tokens) > 0 { - line.Token = tokens + line.Cue = tokens } line = hydrateLineTimingFromTokens(line) - if line.Value == "" && len(line.Token) == 0 { + if line.Value == "" && len(line.Cue) == 0 { return ttmlMetadataEntry{}, false, nil } return ttmlMetadataEntry{key: forKey, line: line}, true, nil } -func (p *ttmlParser) parseParagraph(parent ttmlTimingContext) (string, []model.Token, error) { +func (p *ttmlParser) 
parseParagraph(parent ttmlTimingContext) (string, []model.Cue, error) { var text strings.Builder - var tokens []model.Token + var tokens []model.Cue for { token, err := p.decoder.Token() @@ -300,7 +300,7 @@ func (p *ttmlParser) parseParagraph(parent ttmlTimingContext) (string, []model.T } } -func (p *ttmlParser) parseInlineElement(start xml.StartElement, parent ttmlTimingContext) (string, []model.Token, error) { +func (p *ttmlParser) parseInlineElement(start xml.StartElement, parent ttmlTimingContext) (string, []model.Cue, error) { local := strings.ToLower(start.Name.Local) if local == "br" { return "\n", nil, nil @@ -313,7 +313,7 @@ func (p *ttmlParser) parseInlineElement(start xml.StartElement, parent ttmlTimin hasOwnTiming := hasBegin || hasEnd || hasDur var text strings.Builder - var tokens []model.Token + var tokens []model.Cue for { token, err := p.decoder.Token() @@ -337,7 +337,7 @@ func (p *ttmlParser) parseInlineElement(start xml.StartElement, parent ttmlTimin value := text.String() tokenText := sanitizeTTMLText(value) if local == "span" && hasOwnTiming && !ctx.invalid && tokenText != "" && len(tokens) == 0 { - parsedToken := model.Token{ + parsedToken := model.Cue{ Value: tokenText, Role: ctx.role, } @@ -413,7 +413,7 @@ func (p *ttmlParser) buildMetadataLyrics(kind string, langOrder []string, entrie } line = hydrateLineTimingFromTokens(line) - if line.Value == "" && len(line.Token) == 0 { + if line.Value == "" && len(line.Cue) == 0 { continue } @@ -830,8 +830,8 @@ func linesAreSynced(lines []model.Line) bool { if lines[i].Start != nil { return true } - for j := range lines[i].Token { - if lines[i].Token[j].Start != nil { + for j := range lines[i].Cue { + if lines[i].Cue[j].Start != nil { return true } } @@ -840,14 +840,14 @@ func linesAreSynced(lines []model.Line) bool { } func hydrateLineTimingFromTokens(line model.Line) model.Line { - if len(line.Token) == 0 { + if len(line.Cue) == 0 { return line } var earliestStart *int64 var latestEnd *int64 - for i 
:= range line.Token { - token := line.Token[i] + for i := range line.Cue { + token := line.Cue[i] if token.Start != nil { if earliestStart == nil || *token.Start < *earliestStart { v := *token.Start diff --git a/core/lyrics/ttml_test.go b/core/lyrics/ttml_test.go index c8596243b..8ec16f679 100644 --- a/core/lyrics/ttml_test.go +++ b/core/lyrics/ttml_test.go @@ -135,11 +135,11 @@ var _ = Describe("parseTTML", func() { Expect(line.Start).To(Equal(gg.P(int64(1000)))) Expect(line.Value).To(Equal("Hello\necho")) Expect(line.End).To(Equal(gg.P(int64(3000)))) - Expect(line.Token).To(HaveLen(3)) + Expect(line.Cue).To(HaveLen(3)) - Expect(line.Token[0]).To(Equal(model.Token{Start: gg.P(int64(1000)), End: gg.P(int64(1400)), Value: "He"})) - Expect(line.Token[1]).To(Equal(model.Token{Start: gg.P(int64(1400)), End: gg.P(int64(1800)), Value: "llo"})) - Expect(line.Token[2]).To(Equal(model.Token{Start: gg.P(int64(2000)), End: gg.P(int64(2500)), Value: "echo", Role: "x-bg"})) + Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(1000)), End: gg.P(int64(1400)), Value: "He"})) + Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(1400)), End: gg.P(int64(1800)), Value: "llo"})) + Expect(line.Cue[2]).To(Equal(model.Cue{Start: gg.P(int64(2000)), End: gg.P(int64(2500)), Value: "echo", Role: "x-bg"})) }) }) @@ -166,9 +166,9 @@ var _ = Describe("parseTTML", func() { Expect(line.Start).To(Equal(gg.P(int64(43444)))) Expect(line.Value).To(Equal("go\ngo")) Expect(line.End).To(Equal(gg.P(int64(45570)))) - Expect(line.Token).To(HaveLen(2)) - Expect(line.Token[0]).To(Equal(model.Token{Start: gg.P(int64(43444)), End: gg.P(int64(43716)), Value: "go"})) - Expect(line.Token[1]).To(Equal(model.Token{Start: gg.P(int64(43716)), End: gg.P(int64(43887)), Value: "go"})) + Expect(line.Cue).To(HaveLen(2)) + Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(43444)), End: gg.P(int64(43716)), Value: "go"})) + Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(43716)), End: 
gg.P(int64(43887)), Value: "go"})) }) }) @@ -250,9 +250,9 @@ var _ = Describe("parseTTML", func() { Expect(pronunciation.Line[0].Start).To(Equal(gg.P(int64(2000)))) Expect(pronunciation.Line[0].Value).To(Equal("konni")) Expect(pronunciation.Line[0].End).To(Equal(gg.P(int64(2600)))) - Expect(pronunciation.Line[0].Token).To(HaveLen(2)) - Expect(pronunciation.Line[0].Token[0]).To(Equal(model.Token{Start: gg.P(int64(2000)), End: gg.P(int64(2300)), Value: "ko"})) - Expect(pronunciation.Line[0].Token[1]).To(Equal(model.Token{Start: gg.P(int64(2300)), End: gg.P(int64(2600)), Value: "nni"})) + Expect(pronunciation.Line[0].Cue).To(HaveLen(2)) + Expect(pronunciation.Line[0].Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(2000)), End: gg.P(int64(2300)), Value: "ko"})) + Expect(pronunciation.Line[0].Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(2300)), End: gg.P(int64(2600)), Value: "nni"})) }) }) @@ -294,10 +294,10 @@ var _ = Describe("parseTTML", func() { line := pronunciation.Line[0] Expect(line.Start).To(Equal(gg.P(int64(2747)))) Expect(line.Value).To(Equal("I woke up")) - Expect(line.Token).To(HaveLen(3)) - Expect(line.Token[0]).To(Equal(model.Token{Start: gg.P(int64(2747)), End: gg.P(int64(3018)), Value: "I"})) - Expect(line.Token[1]).To(Equal(model.Token{Start: gg.P(int64(3018)), End: gg.P(int64(3179)), Value: "woke"})) - Expect(line.Token[2]).To(Equal(model.Token{Start: gg.P(int64(3179)), End: gg.P(int64(3582)), Value: "up"})) + Expect(line.Cue).To(HaveLen(3)) + Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(2747)), End: gg.P(int64(3018)), Value: "I"})) + Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(3018)), End: gg.P(int64(3179)), Value: "woke"})) + Expect(line.Cue[2]).To(Equal(model.Cue{Start: gg.P(int64(3179)), End: gg.P(int64(3582)), Value: "up"})) }) }) }) diff --git a/model/lyrics.go b/model/lyrics.go index 220eec7b5..3cb1cb715 100644 --- a/model/lyrics.go +++ b/model/lyrics.go @@ -11,7 +11,7 @@ import ( "github.com/navidrome/navidrome/utils/str" 
) -type Token struct { +type Cue struct { Start *int64 `structs:"start,omitempty" json:"start,omitempty"` End *int64 `structs:"end,omitempty" json:"end,omitempty"` Value string `structs:"value" json:"value"` @@ -19,10 +19,10 @@ type Token struct { } type Line struct { - Start *int64 `structs:"start,omitempty" json:"start,omitempty"` - End *int64 `structs:"end,omitempty" json:"end,omitempty"` - Value string `structs:"value" json:"value"` - Token []Token `structs:"token,omitempty" json:"token,omitempty"` + Start *int64 `structs:"start,omitempty" json:"start,omitempty"` + End *int64 `structs:"end,omitempty" json:"end,omitempty"` + Value string `structs:"value" json:"value"` + Cue []Cue `structs:"cue,omitempty" json:"cue,omitempty"` } type Lyrics struct { diff --git a/server/subsonic/helpers.go b/server/subsonic/helpers.go index 3b9412fb1..6922f0683 100644 --- a/server/subsonic/helpers.go +++ b/server/subsonic/helpers.go @@ -493,35 +493,49 @@ func mapExplicitStatus(explicitStatus string) string { return "" } -func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics) responses.StructuredLyric { +func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced bool) responses.StructuredLyric { lines := make([]responses.Line, len(lyrics.Line)) - tokenLines := make([]responses.TokenLine, 0, len(lyrics.Line)) + var cueLines []responses.CueLine for i, line := range lyrics.Line { lines[i] = responses.Line{ Start: line.Start, Value: line.Value, } - if len(line.Token) == 0 { + if !enhanced || len(line.Cue) == 0 { continue } - tokens := make([]responses.LyricToken, len(line.Token)) - for j, token := range line.Token { - tokens[j] = responses.LyricToken{ - Start: token.Start, - End: token.End, - Value: token.Value, - Role: token.Role, + // Group cues by role, preserving order of first appearance + roleOrder := make([]string, 0, 2) + cuesByRole := make(map[string][]responses.LyricCue) + for _, cue := range line.Cue { + role := cue.Role + if _, exists := 
cuesByRole[role]; !exists { + roleOrder = append(roleOrder, role) } + cuesByRole[role] = append(cuesByRole[role], responses.LyricCue{ + Start: cue.Start, + End: cue.End, + Value: cue.Value, + }) + } + + // Create a separate CueLine for each role group + for _, role := range roleOrder { + cues := cuesByRole[role] + cueLine := responses.CueLine{ + Index: int32(i), + Start: line.Start, + End: line.End, + Value: line.Value, + Cue: cues, + } + if role != "" { + cueLine.Role = role + } + cueLines = append(cueLines, cueLine) } - tokenLines = append(tokenLines, responses.TokenLine{ - Index: int32(i), - Start: line.Start, - End: line.End, - Value: line.Value, - Token: tokens, - }) } kind := strings.TrimSpace(lyrics.Kind) @@ -535,7 +549,7 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics) responses.St Kind: kind, Lang: lyrics.Lang, Line: lines, - TokenLine: tokenLines, + CueLine: cueLines, Offset: lyrics.Offset, Synced: lyrics.Synced, } @@ -550,11 +564,23 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics) responses.St return structured } -func buildLyricsList(mf *model.MediaFile, lyricsList model.LyricList) *responses.LyricsList { - lyricList := make(responses.StructuredLyrics, len(lyricsList)) +func buildLyricsList(mf *model.MediaFile, lyricsList model.LyricList, enhanced bool) *responses.LyricsList { + var filtered model.LyricList + if enhanced { + filtered = lyricsList + } else { + // Without enhanced, only return "main" kind entries + for _, l := range lyricsList { + kind := strings.TrimSpace(l.Kind) + if kind == "" || kind == "main" { + filtered = append(filtered, l) + } + } + } - for i, lyrics := range lyricsList { - lyricList[i] = buildStructuredLyric(mf, lyrics) + lyricList := make(responses.StructuredLyrics, len(filtered)) + for i, lyrics := range filtered { + lyricList[i] = buildStructuredLyric(mf, lyrics, enhanced) } res := &responses.LyricsList{ diff --git a/server/subsonic/media_retrieval.go 
b/server/subsonic/media_retrieval.go index 963db067c..de88849a2 100644 --- a/server/subsonic/media_retrieval.go +++ b/server/subsonic/media_retrieval.go @@ -149,8 +149,10 @@ func (api *Router) GetLyricsBySongId(r *http.Request) (*responses.Subsonic, erro return nil, err } + enhanced, _ := req.Params(r).Bool("enhanced") + response := newResponse() - response.LyricsList = buildLyricsList(mediaFile, structuredLyrics) + response.LyricsList = buildLyricsList(mediaFile, structuredLyrics, enhanced) return response, nil } diff --git a/server/subsonic/media_retrieval_test.go b/server/subsonic/media_retrieval_test.go index 6c52d38bc..7cf96fee5 100644 --- a/server/subsonic/media_retrieval_test.go +++ b/server/subsonic/media_retrieval_test.go @@ -258,36 +258,36 @@ var _ = Describe("MediaRetrievalController", func() { } } - Expect(realLyric.TokenLine).To(HaveLen(len(expectedLyric.TokenLine))) - for j, realTokenLine := range realLyric.TokenLine { - expectedTokenLine := expectedLyric.TokenLine[j] - Expect(realTokenLine.Index).To(Equal(expectedTokenLine.Index)) - Expect(realTokenLine.Value).To(Equal(expectedTokenLine.Value)) - if expectedTokenLine.Start == nil { - Expect(realTokenLine.Start).To(BeNil()) + Expect(realLyric.CueLine).To(HaveLen(len(expectedLyric.CueLine))) + for j, realCueLine := range realLyric.CueLine { + expectedCueLine := expectedLyric.CueLine[j] + Expect(realCueLine.Index).To(Equal(expectedCueLine.Index)) + Expect(realCueLine.Value).To(Equal(expectedCueLine.Value)) + Expect(realCueLine.Role).To(Equal(expectedCueLine.Role)) + if expectedCueLine.Start == nil { + Expect(realCueLine.Start).To(BeNil()) } else { - Expect(*realTokenLine.Start).To(Equal(*expectedTokenLine.Start)) + Expect(*realCueLine.Start).To(Equal(*expectedCueLine.Start)) } - if expectedTokenLine.End == nil { - Expect(realTokenLine.End).To(BeNil()) + if expectedCueLine.End == nil { + Expect(realCueLine.End).To(BeNil()) } else { - Expect(*realTokenLine.End).To(Equal(*expectedTokenLine.End)) + 
Expect(*realCueLine.End).To(Equal(*expectedCueLine.End)) } - Expect(realTokenLine.Token).To(HaveLen(len(expectedTokenLine.Token))) - for k, realToken := range realTokenLine.Token { - expectedToken := expectedTokenLine.Token[k] - Expect(realToken.Value).To(Equal(expectedToken.Value)) - Expect(realToken.Role).To(Equal(expectedToken.Role)) - if expectedToken.Start == nil { - Expect(realToken.Start).To(BeNil()) + Expect(realCueLine.Cue).To(HaveLen(len(expectedCueLine.Cue))) + for k, realCue := range realCueLine.Cue { + expectedCue := expectedCueLine.Cue[k] + Expect(realCue.Value).To(Equal(expectedCue.Value)) + if expectedCue.Start == nil { + Expect(realCue.Start).To(BeNil()) } else { - Expect(*realToken.Start).To(Equal(*expectedToken.Start)) + Expect(*realCue.Start).To(Equal(*expectedCue.Start)) } - if expectedToken.End == nil { - Expect(realToken.End).To(BeNil()) + if expectedCue.End == nil { + Expect(realCue.End).To(BeNil()) } else { - Expect(*realToken.End).To(Equal(*expectedToken.End)) + Expect(*realCue.End).To(Equal(*expectedCue.End)) } } } @@ -448,7 +448,7 @@ var _ = Describe("MediaRetrievalController", func() { It("should return metadata-linked translation and pronunciation tracks from TTML", func() { conf.Server.LyricsPriority = ".ttml,embedded" - r := newGetRequest("id=1") + r := newGetRequest("id=1&enhanced=true") mockRepo.SetData(model.MediaFiles{ { @@ -513,13 +513,13 @@ var _ = Describe("MediaRetrievalController", func() { Value: "konni", }, }, - TokenLine: []responses.TokenLine{ + CueLine: []responses.CueLine{ { Index: 0, Start: &mainStartB, End: &tokenEndB, Value: "konni", - Token: []responses.LyricToken{ + Cue: []responses.LyricCue{ { Start: &tokenStartA, End: &tokenEndA, @@ -538,8 +538,8 @@ var _ = Describe("MediaRetrievalController", func() { }) }) - It("should return tokenized lines for songLyrics v2 clients", func() { - r := newGetRequest("id=1") + It("should return cue lines for songLyrics v2 clients with enhanced=true", func() { + r := 
newGetRequest("id=1&enhanced=true") lineStart := int64(1000) lineEnd := int64(3000) @@ -556,7 +556,7 @@ var _ = Describe("MediaRetrievalController", func() { Start: &lineStart, End: &lineEnd, Value: "Hello echo", - Token: []model.Token{ + Cue: []model.Cue{ { Start: &tokenStartA, End: &tokenEndA, @@ -599,23 +599,31 @@ var _ = Describe("MediaRetrievalController", func() { Value: "Hello echo", }, }, - TokenLine: []responses.TokenLine{ + CueLine: []responses.CueLine{ { Index: 0, Start: &lineStart, End: &lineEnd, Value: "Hello echo", - Token: []responses.LyricToken{ + Cue: []responses.LyricCue{ { Start: &tokenStartA, End: &tokenEndA, Value: "Hello", }, + }, + }, + { + Index: 0, + Start: &lineStart, + End: &lineEnd, + Value: "Hello echo", + Role: "x-bg", + Cue: []responses.LyricCue{ { Start: &tokenStartB, End: &tokenEndB, Value: "echo", - Role: "x-bg", }, }, }, diff --git a/server/subsonic/responses/responses.go b/server/subsonic/responses/responses.go index ff5ae0d3b..d19f99ca6 100644 --- a/server/subsonic/responses/responses.go +++ b/server/subsonic/responses/responses.go @@ -537,30 +537,30 @@ type Line struct { Value string `xml:",chardata" json:"value"` } -type LyricToken struct { +type LyricCue struct { Start *int64 `xml:"start,attr,omitempty" json:"start,omitempty"` End *int64 `xml:"end,attr,omitempty" json:"end,omitempty"` Value string `xml:"value,attr" json:"value"` - Role string `xml:"role,attr,omitempty" json:"role,omitempty"` } -type TokenLine struct { - Index int32 `xml:"index,attr" json:"index"` - Start *int64 `xml:"start,attr,omitempty" json:"start,omitempty"` - End *int64 `xml:"end,attr,omitempty" json:"end,omitempty"` - Value string `xml:"value,attr,omitempty" json:"value,omitempty"` - Token []LyricToken `xml:"token,omitempty" json:"token,omitempty"` +type CueLine struct { + Index int32 `xml:"index,attr" json:"index"` + Start *int64 `xml:"start,attr,omitempty" json:"start,omitempty"` + End *int64 `xml:"end,attr,omitempty" json:"end,omitempty"` + Value 
string `xml:"value,attr,omitempty" json:"value,omitempty"` + Role string `xml:"role,attr,omitempty" json:"role,omitempty"` + Cue []LyricCue `xml:"cue,omitempty" json:"cue,omitempty"` } type StructuredLyric struct { - DisplayArtist string `xml:"displayArtist,attr,omitempty" json:"displayArtist,omitempty"` - DisplayTitle string `xml:"displayTitle,attr,omitempty" json:"displayTitle,omitempty"` - Kind string `xml:"kind,attr,omitempty" json:"kind,omitempty"` - Lang string `xml:"lang,attr" json:"lang"` - Line []Line `xml:"line" json:"line"` - TokenLine []TokenLine `xml:"tokenLine,omitempty" json:"tokenLine,omitempty"` - Offset *int64 `xml:"offset,attr,omitempty" json:"offset,omitempty"` - Synced bool `xml:"synced,attr" json:"synced"` + DisplayArtist string `xml:"displayArtist,attr,omitempty" json:"displayArtist,omitempty"` + DisplayTitle string `xml:"displayTitle,attr,omitempty" json:"displayTitle,omitempty"` + Kind string `xml:"kind,attr,omitempty" json:"kind,omitempty"` + Lang string `xml:"lang,attr" json:"lang"` + Line []Line `xml:"line" json:"line"` + CueLine []CueLine `xml:"cueLine,omitempty" json:"cueLine,omitempty"` + Offset *int64 `xml:"offset,attr,omitempty" json:"offset,omitempty"` + Synced bool `xml:"synced,attr" json:"synced"` } type StructuredLyrics []StructuredLyric diff --git a/ui/src/audioplayer/lyrics.js b/ui/src/audioplayer/lyrics.js index 3dcf9b0f9..111ded02e 100644 --- a/ui/src/audioplayer/lyrics.js +++ b/ui/src/audioplayer/lyrics.js @@ -74,25 +74,25 @@ const normalizeToken = (token) => { start: toTime(token.start), end: toTime(token.end), value, - role: typeof token.role === 'string' ? token.role : '', } } -const normalizeTokenLine = (tokenLine, fallbackIndex) => { - const index = Number.isFinite(Number(tokenLine?.index)) - ? Number(tokenLine.index) +const normalizeCueLine = (cueLine, fallbackIndex) => { + const index = Number.isFinite(Number(cueLine?.index)) + ? 
Number(cueLine.index) : fallbackIndex const tokens = sortTokensByStart( - Array.isArray(tokenLine?.token) - ? tokenLine.token.map(normalizeToken).filter(Boolean) + Array.isArray(cueLine?.cue) + ? cueLine.cue.map(normalizeToken).filter(Boolean) : [], ) return { index, - start: toTime(tokenLine?.start), - end: toTime(tokenLine?.end), - value: typeof tokenLine?.value === 'string' ? tokenLine.value : '', + start: toTime(cueLine?.start), + end: toTime(cueLine?.end), + value: typeof cueLine?.value === 'string' ? cueLine.value : '', + role: typeof cueLine?.role === 'string' ? cueLine.role : '', tokens, } } @@ -197,14 +197,14 @@ const buildSyntheticWordTokens = (line, token) => { })) } -export const hasTokenTiming = (structuredLyric) => +export const hasCueTiming = (structuredLyric) => Boolean( structuredLyric && - Array.isArray(structuredLyric.tokenLine) && - structuredLyric.tokenLine.some( - (tokenLine) => - Array.isArray(tokenLine?.token) && - tokenLine.token.some((token) => Number.isFinite(Number(token?.start))), + Array.isArray(structuredLyric.cueLine) && + structuredLyric.cueLine.some( + (cueLine) => + Array.isArray(cueLine?.cue) && + cueLine.cue.some((cue) => Number.isFinite(Number(cue?.start))), ), ) @@ -215,7 +215,7 @@ export const hasStructuredLyricContent = (structuredLyric) => structuredLyric.line.some( (line) => typeof line?.value === 'string' && line.value.trim() !== '', )) || - hasTokenTiming(structuredLyric)), + hasCueTiming(structuredLyric)), ) export const getPreferredLyricLanguage = () => { @@ -319,34 +319,57 @@ export const buildKaraokeLines = (structuredLyric) => { const baseLines = Array.isArray(structuredLyric.line) ? structuredLyric.line : [] - const rawTokenLines = Array.isArray(structuredLyric.tokenLine) - ? structuredLyric.tokenLine + const rawCueLines = Array.isArray(structuredLyric.cueLine) + ? structuredLyric.cueLine : [] const lines = - rawTokenLines.length > 0 - ? 
rawTokenLines.map((tokenLine, fallbackIndex) => { - const normalized = normalizeTokenLine(tokenLine, fallbackIndex) - const baseLine = baseLines[normalized.index] || {} - const tokens = normalized.tokens - const fallbackStart = - tokens.find((token) => token.start != null)?.start ?? null - const fallbackEnd = - [...tokens].reverse().find((token) => token.end != null)?.end ?? - null - const value = - normalized.value || - (typeof baseLine.value === 'string' ? baseLine.value : '') || - tokens.map((token) => token.value).join('') + rawCueLines.length > 0 + ? (() => { + const normalizedCueLines = rawCueLines.map( + (cueLine, fallbackIndex) => { + const normalized = normalizeCueLine(cueLine, fallbackIndex) + return { + ...normalized, + tokens: normalized.tokens.map((token) => ({ + ...token, + role: normalized.role, + })), + } + }, + ) - return { - index: normalized.index, - start: normalized.start ?? toTime(baseLine.start) ?? fallbackStart, - end: normalized.end ?? toTime(baseLine.end) ?? fallbackEnd, - value, - tokens, + const byIndex = new Map() + for (const cl of normalizedCueLines) { + if (!byIndex.has(cl.index)) { + byIndex.set(cl.index, []) + } + byIndex.get(cl.index).push(cl) } - }) + + return Array.from(byIndex.entries()).map(([index, group]) => { + const first = group[0] + const baseLine = baseLines[index] || {} + const tokens = sortTokensByStart(group.flatMap((cl) => cl.tokens)) + const fallbackStart = + tokens.find((token) => token.start != null)?.start ?? null + const fallbackEnd = + [...tokens].reverse().find((token) => token.end != null)?.end ?? + null + const value = + first.value || + (typeof baseLine.value === 'string' ? baseLine.value : '') || + tokens.map((token) => token.value).join('') + + return { + index, + start: first.start ?? toTime(baseLine.start) ?? fallbackStart, + end: first.end ?? toTime(baseLine.end) ?? 
fallbackEnd, + value, + tokens, + } + }) + })() : baseLines.map((line, index) => ({ index, start: toTime(line.start), diff --git a/ui/src/audioplayer/lyrics.test.js b/ui/src/audioplayer/lyrics.test.js index c60605a6f..7e0b0d105 100644 --- a/ui/src/audioplayer/lyrics.test.js +++ b/ui/src/audioplayer/lyrics.test.js @@ -1,15 +1,15 @@ import { buildKaraokeLines, findLayerLineIndexForMain, - getPreferredLyricLanguage, getActiveKaraokeState, + getPreferredLyricLanguage, hasStructuredLyricContent, pickStructuredLyric, resolveKaraokeTokenWindow, resolveLayerLineForMain, selectLyricLayers, - structuredLyricToLrc, structuredLyricsToLrc, + structuredLyricToLrc, } from './lyrics' describe('lyrics helpers', () => { @@ -200,21 +200,27 @@ describe('lyrics helpers', () => { expect(getPreferredLyricLanguage()).toBe('pt-BR') }) - it('builds karaoke lines from tokenLine payload', () => { + it('builds karaoke lines from cueLine payload', () => { const lines = buildKaraokeLines({ lang: 'eng', synced: true, line: [{ start: 1000, end: 3000, value: 'Hello world' }], - tokenLine: [ + cueLine: [ { index: 0, start: 1000, end: 3000, value: 'Hello world', - token: [ - { start: 1000, end: 1500, value: 'Hello' }, - { start: 2000, end: 2500, value: 'world', role: 'x-bg' }, - ], + role: '', + cue: [{ start: 1000, end: 1500, value: 'Hello' }], + }, + { + index: 0, + start: 1000, + end: 3000, + value: 'Hello world', + role: 'x-bg', + cue: [{ start: 2000, end: 2500, value: 'world' }], }, ], }) @@ -238,15 +244,16 @@ describe('lyrics helpers', () => { lang: 'eng', synced: true, line: [{ start: 1000, end: 3000, value: 'Hello world' }], - tokenLine: [ + cueLine: [ { index: 0, start: 1000, end: 3000, value: 'Hello world', - token: [ - { start: 2000, end: 2500, value: 'world', role: '' }, - { start: 1000, end: 1500, value: 'Hello', role: '' }, + role: '', + cue: [ + { start: 2000, end: 2500, value: 'world' }, + { start: 1000, end: 1500, value: 'Hello' }, ], }, ], @@ -263,13 +270,14 @@ describe('lyrics 
helpers', () => { lang: 'ko-Latn', synced: true, line: [{ start: 1000, end: 2000, value: 'Da-la-lun, dun' }], - tokenLine: [ + cueLine: [ { index: 0, start: 1000, end: 2000, value: 'Da-la-lun, dun', - token: [{ start: 1000, end: 2000, value: 'Da-la-lun, dun' }], + role: '', + cue: [{ start: 1000, end: 2000, value: 'Da-la-lun, dun' }], }, ], }) @@ -409,7 +417,7 @@ describe('lyrics helpers', () => { it('reports structured lyric content when token timing exists', () => { expect( hasStructuredLyricContent({ - tokenLine: [{ token: [{ start: 100, value: 'a' }] }], + cueLine: [{ cue: [{ start: 100, value: 'a' }] }], }), ).toBe(true) }) diff --git a/ui/src/subsonic/index.js b/ui/src/subsonic/index.js index b311d5e14..47ebabe99 100644 --- a/ui/src/subsonic/index.js +++ b/ui/src/subsonic/index.js @@ -1,5 +1,5 @@ -import { baseUrl } from '../utils' import { httpClient } from '../dataProvider' +import { baseUrl } from '../utils' const url = (command, id, options) => { const username = localStorage.getItem('username') @@ -121,7 +121,7 @@ const getTopSongs = (artist, count = 50) => { } const getLyricsBySongId = (id) => { - return httpClient(url('getLyricsBySongId', id)) + return httpClient(url('getLyricsBySongId', id, { enhanced: true })) } const streamUrl = (id, options) => { From 944401cae3f21c982ecf5b69b6004db68a147764 Mon Sep 17 00:00:00 2001 From: ranokay Date: Sun, 22 Feb 2026 22:13:22 +0200 Subject: [PATCH 04/14] refactor: address Tolriq feedback on roles and cue timing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Drop x- prefix from role values (x-bg → bg, x-voice1 → voice1, x-group → group) - Clarify voiceN has no upper bound (voice1, voice100, voice1000 all valid) - Make cue.start required (non-pointer int64) in API response - Keep cue.end optional with defined fallback semantics - Strip x- prefix from TTML role values when mapping to API output --- server/subsonic/helpers.go | 13 +++++++++++-- 
server/subsonic/media_retrieval_test.go | 16 ++++++---------- server/subsonic/responses/responses.go | 2 +- ui/src/audioplayer/lyrics.test.js | 4 ++-- 4 files changed, 20 insertions(+), 15 deletions(-) diff --git a/server/subsonic/helpers.go b/server/subsonic/helpers.go index 6922f0683..d8ec8451b 100644 --- a/server/subsonic/helpers.go +++ b/server/subsonic/helpers.go @@ -493,6 +493,11 @@ func mapExplicitStatus(explicitStatus string) string { return "" } +// sanitizeRole strips the TTML x- prefix from role values for the API. +func sanitizeRole(role string) string { + return strings.TrimPrefix(role, "x-") +} + func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced bool) responses.StructuredLyric { lines := make([]responses.Line, len(lyrics.Line)) var cueLines []responses.CueLine @@ -510,12 +515,16 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced boo roleOrder := make([]string, 0, 2) cuesByRole := make(map[string][]responses.LyricCue) for _, cue := range line.Cue { - role := cue.Role + role := sanitizeRole(cue.Role) if _, exists := cuesByRole[role]; !exists { roleOrder = append(roleOrder, role) } + var start int64 + if cue.Start != nil { + start = *cue.Start + } cuesByRole[role] = append(cuesByRole[role], responses.LyricCue{ - Start: cue.Start, + Start: start, End: cue.End, Value: cue.Value, }) diff --git a/server/subsonic/media_retrieval_test.go b/server/subsonic/media_retrieval_test.go index 7cf96fee5..fa3f20e2d 100644 --- a/server/subsonic/media_retrieval_test.go +++ b/server/subsonic/media_retrieval_test.go @@ -279,11 +279,7 @@ var _ = Describe("MediaRetrievalController", func() { for k, realCue := range realCueLine.Cue { expectedCue := expectedCueLine.Cue[k] Expect(realCue.Value).To(Equal(expectedCue.Value)) - if expectedCue.Start == nil { - Expect(realCue.Start).To(BeNil()) - } else { - Expect(*realCue.Start).To(Equal(*expectedCue.Start)) - } + Expect(realCue.Start).To(Equal(expectedCue.Start)) if 
expectedCue.End == nil { Expect(realCue.End).To(BeNil()) } else { @@ -521,12 +517,12 @@ var _ = Describe("MediaRetrievalController", func() { Value: "konni", Cue: []responses.LyricCue{ { - Start: &tokenStartA, + Start: tokenStartA, End: &tokenEndA, Value: "ko", }, { - Start: &tokenStartB, + Start: tokenStartB, End: &tokenEndB, Value: "nni", }, @@ -607,7 +603,7 @@ var _ = Describe("MediaRetrievalController", func() { Value: "Hello echo", Cue: []responses.LyricCue{ { - Start: &tokenStartA, + Start: tokenStartA, End: &tokenEndA, Value: "Hello", }, @@ -618,10 +614,10 @@ var _ = Describe("MediaRetrievalController", func() { Start: &lineStart, End: &lineEnd, Value: "Hello echo", - Role: "x-bg", + Role: "bg", Cue: []responses.LyricCue{ { - Start: &tokenStartB, + Start: tokenStartB, End: &tokenEndB, Value: "echo", }, diff --git a/server/subsonic/responses/responses.go b/server/subsonic/responses/responses.go index d19f99ca6..d74c118b3 100644 --- a/server/subsonic/responses/responses.go +++ b/server/subsonic/responses/responses.go @@ -538,7 +538,7 @@ type Line struct { } type LyricCue struct { - Start *int64 `xml:"start,attr,omitempty" json:"start,omitempty"` + Start int64 `xml:"start,attr" json:"start"` End *int64 `xml:"end,attr,omitempty" json:"end,omitempty"` Value string `xml:"value,attr" json:"value"` } diff --git a/ui/src/audioplayer/lyrics.test.js b/ui/src/audioplayer/lyrics.test.js index 7e0b0d105..6cb3a1b87 100644 --- a/ui/src/audioplayer/lyrics.test.js +++ b/ui/src/audioplayer/lyrics.test.js @@ -219,7 +219,7 @@ describe('lyrics helpers', () => { start: 1000, end: 3000, value: 'Hello world', - role: 'x-bg', + role: 'bg', cue: [{ start: 2000, end: 2500, value: 'world' }], }, ], @@ -233,7 +233,7 @@ describe('lyrics helpers', () => { value: 'Hello world', tokens: [ { start: 1000, end: 1500, value: 'Hello', role: '' }, - { start: 2000, end: 2500, value: 'world', role: 'x-bg' }, + { start: 2000, end: 2500, value: 'world', role: 'bg' }, ], }, ]) From 
92793386648e1c4205d3aad27056006c25268c36 Mon Sep 17 00:00:00 2001 From: ranokay Date: Mon, 23 Feb 2026 11:22:00 +0200 Subject: [PATCH 05/14] fix: guarantee main-first cueLine ordering for same index Add stable sort to ensure the main vocals cueLine (empty role) always appears before other roles when multiple cueLines share the same line index. Previously relied on source document order which is not guaranteed across all TTML files. --- server/subsonic/helpers.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/server/subsonic/helpers.go b/server/subsonic/helpers.go index d8ec8451b..056ca89f1 100644 --- a/server/subsonic/helpers.go +++ b/server/subsonic/helpers.go @@ -530,6 +530,11 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced boo }) } + // Ensure main vocals (empty role) always comes first + sort.SliceStable(roleOrder, func(i, j int) bool { + return roleOrder[i] == "" && roleOrder[j] != "" + }) + // Create a separate CueLine for each role group for _, role := range roleOrder { cues := cuesByRole[role] From 4e8f363e818db764360939a14d2c7326a1ec0028 Mon Sep 17 00:00:00 2001 From: ranokay Date: Thu, 5 Mar 2026 22:20:32 +0200 Subject: [PATCH 06/14] fix: align songLyrics v2 with spec, add Enhanced LRC parser and bg role UI styling - Fix LyricCue.Value XML tag: chardata instead of attribute - Fix Kind field leaking to non-enhanced (v1) responses - Guard against nil cue.Start values - Add Enhanced LRC parser for word-level inline timing markers - Add role-based UI styling: bg tokens render italic at 72% opacity - Add integration test for Enhanced LRC file reading - Add unit tests for Enhanced LRC parser --- core/lyrics/sources_test.go | 37 +++++++ model/lyrics.go | 105 +++++++++++++++++++- model/lyrics_test.go | 59 +++++++++++ server/subsonic/helpers.go | 23 +++-- server/subsonic/media_retrieval_test.go | 7 +- server/subsonic/responses/responses.go | 2 +- tests/fixtures/test-enhanced.lrc | 6 ++ 
ui/src/audioplayer/KaraokeLyricsOverlay.jsx | 44 ++++---- 8 files changed, 244 insertions(+), 39 deletions(-) create mode 100644 tests/fixtures/test-enhanced.lrc diff --git a/core/lyrics/sources_test.go b/core/lyrics/sources_test.go index 8823a3175..3dd2825e6 100644 --- a/core/lyrics/sources_test.go +++ b/core/lyrics/sources_test.go @@ -88,6 +88,43 @@ var _ = Describe("sources", func() { })) }) + It("should return Enhanced LRC lyrics with word-level cues from a file", func() { + mf := model.MediaFile{Path: "tests/fixtures/test-enhanced.mp3"} + lyrics, err := fromExternalFile(ctx, &mf, ".lrc") + + Expect(err).To(BeNil()) + Expect(lyrics).To(HaveLen(1)) + Expect(lyrics[0].DisplayArtist).To(Equal("Test Artist")) + Expect(lyrics[0].DisplayTitle).To(Equal("Enhanced Test")) + Expect(lyrics[0].Lang).To(Equal("eng")) + Expect(lyrics[0].Synced).To(BeTrue()) + Expect(lyrics[0].Line).To(HaveLen(3)) + + // Line 1: has inline markers → Cue array populated + Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(1000)))) + Expect(lyrics[0].Line[0].Value).To(Equal("Some lyrics here")) + Expect(lyrics[0].Line[0].Cue).To(HaveLen(3)) + Expect(*lyrics[0].Line[0].Cue[0].Start).To(Equal(int64(1000))) + Expect(lyrics[0].Line[0].Cue[0].Value).To(Equal("Some ")) + Expect(*lyrics[0].Line[0].Cue[0].End).To(Equal(int64(1500))) + Expect(*lyrics[0].Line[0].Cue[1].Start).To(Equal(int64(1500))) + Expect(lyrics[0].Line[0].Cue[1].Value).To(Equal("lyrics ")) + Expect(*lyrics[0].Line[0].Cue[1].End).To(Equal(int64(2000))) + Expect(*lyrics[0].Line[0].Cue[2].Start).To(Equal(int64(2000))) + Expect(lyrics[0].Line[0].Cue[2].Value).To(Equal("here")) + Expect(lyrics[0].Line[0].Cue[2].End).To(BeNil()) + + // Line 2: has inline markers + Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(3000)))) + Expect(lyrics[0].Line[1].Value).To(Equal("More words")) + Expect(lyrics[0].Line[1].Cue).To(HaveLen(2)) + + // Line 3: plain line, no cues + Expect(lyrics[0].Line[2].Start).To(Equal(gg.P(int64(5000)))) + 
Expect(lyrics[0].Line[2].Value).To(Equal("Plain line without inline markers")) + Expect(lyrics[0].Line[2].Cue).To(BeNil()) + }) + It("should return unsynchronized lyrics from a file", func() { mf := model.MediaFile{Path: "tests/fixtures/test.mp3"} lyrics, err := fromExternalFile(ctx, &mf, ".txt") diff --git a/model/lyrics.go b/model/lyrics.go index 3cb1cb715..9fcd4992e 100644 --- a/model/lyrics.go +++ b/model/lyrics.go @@ -43,6 +43,10 @@ var ( syncRegex = regexp.MustCompile(`(^|\n)\s*` + timeRegexString) timeRegex = regexp.MustCompile(timeRegexString) lrcIdRegex = regexp.MustCompile(`\[(ar|ti|offset|lang):([^]]+)]`) + + // Enhanced LRC: inline word-level timing markers like <00:12.34> + enhancedLRCTimeString = `<([0-9]{1,2}:)?([0-9]{1,2}):([0-9]{1,2})(.[0-9]{1,3})?>` + enhancedLRCRegex = regexp.MustCompile(enhancedLRCTimeString) ) func (l Lyrics) IsEmpty() bool { @@ -116,9 +120,15 @@ func ToLyrics(language, text string) (*Lyrics, error) { if validLine { for idx := range timestamps { + cues := parseEnhancedCues(priorLine) + value := priorLine + if cues != nil { + value = stripEnhancedMarkers(value) + } structuredLines = append(structuredLines, Line{ Start: &timestamps[idx], - Value: strings.TrimSpace(priorLine), + Value: strings.TrimSpace(value), + Cue: cues, }) } timestamps = nil @@ -164,9 +174,15 @@ func ToLyrics(language, text string) (*Lyrics, error) { if validLine { for idx := range timestamps { + cues := parseEnhancedCues(priorLine) + value := priorLine + if cues != nil { + value = stripEnhancedMarkers(value) + } structuredLines = append(structuredLines, Line{ Start: &timestamps[idx], - Value: strings.TrimSpace(priorLine), + Value: strings.TrimSpace(value), + Cue: cues, }) } } @@ -190,6 +206,91 @@ func ToLyrics(language, text string) (*Lyrics, error) { return &lyrics, nil } +// parseEnhancedCues extracts word-level timing cues from Enhanced LRC inline markers. +// Format: <mm:ss.xx>word <mm:ss.xx>word ... +// Returns nil if no inline markers are found.
+func parseEnhancedCues(text string) []Cue { + matches := enhancedLRCRegex.FindAllStringSubmatchIndex(text, -1) + if len(matches) == 0 { + return nil + } + + type segment struct { + start int64 + text string + } + + segments := make([]segment, 0, len(matches)) + for i, match := range matches { + timeMs, err := parseTime( + // Rewrite <...> as [...] so parseTime can handle it with the same logic + "["+text[match[0]+1:match[1]-1]+"]", + // Adjust match indices to point into our rewritten string (need start/end pairs for each group) + []int{ + 0, match[1] - match[0], + adjustGroup(match, 2), adjustGroup(match, 3), + adjustGroup(match, 4), adjustGroup(match, 5), + adjustGroup(match, 6), adjustGroup(match, 7), + adjustGroup(match, 8), adjustGroup(match, 9), + }, + ) + if err != nil { + continue + } + + // Text runs from after this marker to the start of the next marker (or end of string) + textStart := match[1] + var textEnd int + if i+1 < len(matches) { + textEnd = matches[i+1][0] + } else { + textEnd = len(text) + } + + word := text[textStart:textEnd] + if word == "" { + continue + } + segments = append(segments, segment{start: timeMs, text: word}) + } + + if len(segments) == 0 { + return nil + } + + cues := make([]Cue, len(segments)) + for i, seg := range segments { + start := seg.start + cues[i] = Cue{ + Start: &start, + Value: seg.text, + } + // Derive End from the next cue's Start + if i+1 < len(segments) { + end := segments[i+1].start + cues[i].End = &end + } + } + return cues +} + +// adjustGroup remaps a capture group index from the original match to our rewritten "[...]" string. +// The rewrite shifts by -1 (removed '<', added '[') so positions within the brackets stay the same. 
+func adjustGroup(match []int, groupIdx int) int { + orig := match[groupIdx] + if orig == -1 { + return -1 + } + // Offset is: original position minus the position of '<' in the original, plus 1 for '[' + return orig - match[0] +} + +// stripEnhancedMarkers removes all inline markers from text, +// returning the plain lyric text. +func stripEnhancedMarkers(text string) string { + return enhancedLRCRegex.ReplaceAllString(text, "") +} + func parseTime(line string, match []int) (int64, error) { var hours, millis int64 var err error diff --git a/model/lyrics_test.go b/model/lyrics_test.go index 382976872..2228306d0 100644 --- a/model/lyrics_test.go +++ b/model/lyrics_test.go @@ -116,4 +116,63 @@ var _ = Describe("ToLyrics", func() { {Start: &e, Value: "Test"}, })) }) + + It("should parse Enhanced LRC with word-level timing", func() { + lyrics, err := ToLyrics("xxx", "[00:01.00]<00:01.00>Some <00:01.50>lyrics <00:02.00>here\n[00:03.00]<00:03.00>More <00:03.50>words") + Expect(err).ToNot(HaveOccurred()) + Expect(lyrics.Synced).To(BeTrue()) + Expect(lyrics.Line).To(HaveLen(2)) + + t1000, t1500, t2000, t3000, t3500 := int64(1000), int64(1500), int64(2000), int64(3000), int64(3500) + + line0 := lyrics.Line[0] + Expect(line0.Start).To(Equal(&t1000)) + Expect(line0.Value).To(Equal("Some lyrics here")) + Expect(line0.Cue).To(Equal([]Cue{ + {Start: &t1000, End: &t1500, Value: "Some "}, + {Start: &t1500, End: &t2000, Value: "lyrics "}, + {Start: &t2000, Value: "here"}, + })) + + line1 := lyrics.Line[1] + Expect(line1.Start).To(Equal(&t3000)) + Expect(line1.Value).To(Equal("More words")) + Expect(line1.Cue).To(Equal([]Cue{ + {Start: &t3000, End: &t3500, Value: "More "}, + {Start: &t3500, Value: "words"}, + })) + }) + + It("should ignore Enhanced LRC markers and return plain lines when no markers present", func() { + a, b := int64(1000), int64(3000) + lyrics, err := ToLyrics("xxx", "[00:01.00]Plain line\n[00:03.00]Another plain line") + Expect(err).ToNot(HaveOccurred()) + 
Expect(lyrics.Line).To(Equal([]Line{ + {Start: &a, Value: "Plain line"}, + {Start: &b, Value: "Another plain line"}, + })) + }) + + It("should handle mixed Enhanced and plain LRC lines", func() { + lyrics, err := ToLyrics("xxx", "[00:01.00]<00:01.00>Some <00:01.50>lyrics\n[00:03.00]Plain line\n[00:05.00]<00:05.00>More <00:05.50>words") + Expect(err).ToNot(HaveOccurred()) + Expect(lyrics.Line).To(HaveLen(3)) + + t1000, t1500, t5000, t5500 := int64(1000), int64(1500), int64(5000), int64(5500) + + Expect(lyrics.Line[0].Cue).To(Equal([]Cue{ + {Start: &t1000, End: &t1500, Value: "Some "}, + {Start: &t1500, Value: "lyrics"}, + })) + Expect(lyrics.Line[0].Value).To(Equal("Some lyrics")) + + Expect(lyrics.Line[1].Cue).To(BeNil()) + Expect(lyrics.Line[1].Value).To(Equal("Plain line")) + + Expect(lyrics.Line[2].Cue).To(Equal([]Cue{ + {Start: &t5000, End: &t5500, Value: "More "}, + {Start: &t5500, Value: "words"}, + })) + Expect(lyrics.Line[2].Value).To(Equal("More words")) + }) }) diff --git a/server/subsonic/helpers.go b/server/subsonic/helpers.go index 056ca89f1..b881fa169 100644 --- a/server/subsonic/helpers.go +++ b/server/subsonic/helpers.go @@ -515,16 +515,15 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced boo roleOrder := make([]string, 0, 2) cuesByRole := make(map[string][]responses.LyricCue) for _, cue := range line.Cue { + if cue.Start == nil { + continue + } role := sanitizeRole(cue.Role) if _, exists := cuesByRole[role]; !exists { roleOrder = append(roleOrder, role) } - var start int64 - if cue.Start != nil { - start = *cue.Start - } cuesByRole[role] = append(cuesByRole[role], responses.LyricCue{ - Start: start, + Start: *cue.Start, End: cue.End, Value: cue.Value, }) @@ -552,15 +551,9 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced boo } } - kind := strings.TrimSpace(lyrics.Kind) - if kind == "" { - kind = "main" - } - structured := responses.StructuredLyric{ DisplayArtist: lyrics.DisplayArtist, 
DisplayTitle: lyrics.DisplayTitle, - Kind: kind, Lang: lyrics.Lang, Line: lines, CueLine: cueLines, @@ -568,6 +561,14 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced boo Synced: lyrics.Synced, } + if enhanced { + kind := strings.TrimSpace(lyrics.Kind) + if kind == "" { + kind = "main" + } + structured.Kind = kind + } + if structured.DisplayArtist == "" { structured.DisplayArtist = mf.Artist } diff --git a/server/subsonic/media_retrieval_test.go b/server/subsonic/media_retrieval_test.go index fa3f20e2d..0fdbb3854 100644 --- a/server/subsonic/media_retrieval_test.go +++ b/server/subsonic/media_retrieval_test.go @@ -232,11 +232,7 @@ var _ = Describe("MediaRetrievalController", func() { Expect(realLyric.DisplayArtist).To(Equal(expectedLyric.DisplayArtist)) Expect(realLyric.DisplayTitle).To(Equal(expectedLyric.DisplayTitle)) - expectedKind := expectedLyric.Kind - if expectedKind == "" { - expectedKind = "main" - } - Expect(realLyric.Kind).To(Equal(expectedKind)) + Expect(realLyric.Kind).To(Equal(expectedLyric.Kind)) Expect(realLyric.Lang).To(Equal(expectedLyric.Lang)) Expect(realLyric.Synced).To(Equal(expectedLyric.Synced)) @@ -587,6 +583,7 @@ var _ = Describe("MediaRetrievalController", func() { { DisplayArtist: "Rick Astley", DisplayTitle: "Never Gonna Give You Up", + Kind: "main", Lang: "eng", Synced: true, Line: []responses.Line{ diff --git a/server/subsonic/responses/responses.go b/server/subsonic/responses/responses.go index d74c118b3..f5446a961 100644 --- a/server/subsonic/responses/responses.go +++ b/server/subsonic/responses/responses.go @@ -540,7 +540,7 @@ type Line struct { type LyricCue struct { Start int64 `xml:"start,attr" json:"start"` End *int64 `xml:"end,attr,omitempty" json:"end,omitempty"` - Value string `xml:"value,attr" json:"value"` + Value string `xml:",chardata" json:"value"` } type CueLine struct { diff --git a/tests/fixtures/test-enhanced.lrc b/tests/fixtures/test-enhanced.lrc new file mode 100644 index 
000000000..8f7b60f8c --- /dev/null +++ b/tests/fixtures/test-enhanced.lrc @@ -0,0 +1,6 @@ +[ar:Test Artist] +[ti:Enhanced Test] +[lang:eng] +[00:01.00]<00:01.00>Some <00:01.50>lyrics <00:02.00>here +[00:03.00]<00:03.00>More <00:03.50>words +[00:05.00]Plain line without inline markers diff --git a/ui/src/audioplayer/KaraokeLyricsOverlay.jsx b/ui/src/audioplayer/KaraokeLyricsOverlay.jsx index 3814cbee6..a44e50bf6 100644 --- a/ui/src/audioplayer/KaraokeLyricsOverlay.jsx +++ b/ui/src/audioplayer/KaraokeLyricsOverlay.jsx @@ -1,3 +1,12 @@ +import Button from '@material-ui/core/Button' +import IconButton from '@material-ui/core/IconButton' +import Popover from '@material-ui/core/Popover' +import Slider from '@material-ui/core/Slider' +import { makeStyles } from '@material-ui/core/styles' +import Typography from '@material-ui/core/Typography' +import CloseIcon from '@material-ui/icons/Close' +import TuneIcon from '@material-ui/icons/Tune' +import clsx from 'clsx' import React, { memo, useCallback, @@ -6,21 +15,12 @@ import React, { useRef, useState, } from 'react' -import clsx from 'clsx' -import Button from '@material-ui/core/Button' -import IconButton from '@material-ui/core/IconButton' -import Popover from '@material-ui/core/Popover' -import Slider from '@material-ui/core/Slider' -import Typography from '@material-ui/core/Typography' -import CloseIcon from '@material-ui/icons/Close' -import TuneIcon from '@material-ui/icons/Tune' -import { makeStyles } from '@material-ui/core/styles' import { buildKaraokeLines, getActiveKaraokeState, hasStructuredLyricContent, - resolveLayerLineForMain, resolveKaraokeTokenWindow, + resolveLayerLineForMain, } from './lyrics' const KARAOKE_RENDER_LEAD_MS = 24 @@ -421,9 +421,7 @@ const LyricsSettingsPopover = ({ settings, onChange }) => { const easeInOut = (v) => { const clamped = clamp(v, 0, 1) - return clamped < 0.5 - ? 2 * clamped * clamped - : 1 - Math.pow(-2 * clamped + 2, 2) / 2 + return clamped < 0.5 ? 
2 * clamped * clamped : 1 - (-2 * clamped + 2) ** 2 / 2 } const getMaxHeightPx = () => { @@ -716,17 +714,23 @@ const KaraokeLineRow = memo( } alpha = clamp(alpha, TOKEN_FUTURE_ALPHA, TOKEN_ACTIVE_ALPHA) const fillProgress = isDone ? 1 : isActive ? progress : 0 + const isBgRole = segment.token?.role === 'bg' return ( {segment.text} @@ -1066,7 +1070,7 @@ const KaraokeLyricsOverlay = ({ const isActive = delta === 0 let opacity = isActive ? 1 : delta < 0 ? 0.6 : 0.72 const [r, g, b] = parseColorRGB(getColorValue(lyricsSettings.main.colorKey)) - let color = isActive + const color = isActive ? `rgba(${r}, ${g}, ${b}, 0.98)` : delta < 0 ? `rgba(${r}, ${g}, ${b}, 0.4)` From 1aac92bc1401529b4586972b4b80f2a627db63f9 Mon Sep 17 00:00:00 2001 From: ranokay Date: Fri, 20 Mar 2026 23:41:29 +0200 Subject: [PATCH 07/14] feat(lyrics): support agent-based lyric layers --- conf/configuration.go | 2 +- core/lyrics/lyrics_test.go | 53 +++- core/lyrics/sources.go | 11 +- core/lyrics/sources_test.go | 56 ++++- core/lyrics/srt.go | 161 +++++++++++++ core/lyrics/ttml.go | 307 ++++++++++++++++++++---- core/lyrics/ttml_test.go | 42 +++- model/lyrics.go | 160 ++++++++++-- model/lyrics_test.go | 10 +- server/subsonic/helpers.go | 131 ++++++++-- server/subsonic/media_retrieval_test.go | 42 ++-- server/subsonic/responses/responses.go | 19 +- tests/fixtures/test.elrc | 5 + tests/fixtures/test.srt | 7 + ui/src/audioplayer/PlayerToolbar.jsx | 2 +- ui/src/audioplayer/lyrics.js | 73 +++++- ui/src/audioplayer/lyrics.test.js | 132 ++++++++-- 17 files changed, 1059 insertions(+), 154 deletions(-) create mode 100644 core/lyrics/srt.go create mode 100644 tests/fixtures/test.elrc create mode 100644 tests/fixtures/test.srt diff --git a/conf/configuration.go b/conf/configuration.go index af9f6c283..6370d5cb8 100644 --- a/conf/configuration.go +++ b/conf/configuration.go @@ -730,7 +730,7 @@ func setViperDefaults() { viper.SetDefault("artistartpriority", "artist.*, album/artist.*, external") 
viper.SetDefault("artistimagefolder", "") viper.SetDefault("discartpriority", "disc*.*, cd*.*, cover.*, folder.*, front.*, discsubtitle, embedded") - viper.SetDefault("lyricspriority", ".lrc,.ttml,.txt,embedded") + viper.SetDefault("lyricspriority", ".ttml,.elrc,.lrc,.srt,.txt,embedded") viper.SetDefault("enablegravatar", false) viper.SetDefault("enablefavourites", true) viper.SetDefault("enablestarrating", true) diff --git a/core/lyrics/lyrics_test.go b/core/lyrics/lyrics_test.go index d5f79a4d0..58e8ba82b 100644 --- a/core/lyrics/lyrics_test.go +++ b/core/lyrics/lyrics_test.go @@ -44,6 +44,36 @@ var _ = Describe("sources", func() { }, } + elrcLyrics := model.LyricList{ + model.Lyrics{ + DisplayArtist: "ELRC Artist", + DisplayTitle: "ELRC Song", + Lang: "eng", + Line: []model.Line{ + { + Start: gg.P(int64(1000)), + End: gg.P(int64(1500)), + Value: "Lead words", + Cue: []model.Cue{ + { + Start: gg.P(int64(1000)), + Value: "Lead ", + }, + { + Start: gg.P(int64(1500)), + Value: "words", + }, + }, + }, + { + Start: gg.P(int64(3000)), + Value: "Fallback line", + }, + }, + Synced: true, + }, + } + ttmlLyrics := model.LyricList{ model.Lyrics{ Kind: "main", @@ -88,6 +118,25 @@ var _ = Describe("sources", func() { }, } + srtLyrics := model.LyricList{ + model.Lyrics{ + Lang: "xxx", + Line: []model.Line{ + { + Start: gg.P(int64(18800)), + End: gg.P(int64(22800)), + Value: "We're from subtitles", + }, + { + Start: gg.P(int64(22801)), + End: gg.P(int64(26000)), + Value: "Another subtitle line", + }, + }, + Synced: true, + }, + } + BeforeEach(func() { DeferCleanup(configtest.SetupConfig()) @@ -109,8 +158,10 @@ var _ = Describe("sources", func() { }, Entry("embedded > lrc > txt", "embedded,.lrc,.txt", embeddedLyrics), Entry("lrc > embedded > txt", ".lrc,embedded,.txt", syncedLyrics), + Entry("elrc > lrc > embedded", ".elrc,.lrc,embedded", elrcLyrics), + Entry("srt > txt > embedded", ".srt,.txt,embedded", srtLyrics), Entry("txt > lrc > embedded", ".txt,.lrc,embedded", 
unsyncedLyrics), - Entry("ttml > lrc > embedded", ".ttml,.lrc,embedded", ttmlLyrics)) + Entry("ttml > elrc > lrc > srt > embedded", ".ttml,.elrc,.lrc,.srt,embedded", ttmlLyrics)) Context("Errors", func() { var RegularUserContext = XContext diff --git a/core/lyrics/sources.go b/core/lyrics/sources.go index 38a71cb8a..7586c944f 100644 --- a/core/lyrics/sources.go +++ b/core/lyrics/sources.go @@ -38,13 +38,20 @@ func fromExternalFile(ctx context.Context, mf *model.MediaFile, suffix string) ( } var list model.LyricList - if strings.EqualFold(suffix, ".ttml") { + switch { + case strings.EqualFold(suffix, ".ttml"): list, err = parseTTML(contents) if err != nil { log.Error(ctx, "error parsing ttml external file", "path", externalLyric, err) return nil, err } - } else { + case strings.EqualFold(suffix, ".srt"): + list, err = parseSRT(contents) + if err != nil { + log.Error(ctx, "error parsing srt external file", "path", externalLyric, err) + return nil, err + } + default: lyrics, err := model.ToLyrics("xxx", string(contents)) if err != nil { log.Error(ctx, "error parsing lyric external file", "path", externalLyric, err) diff --git a/core/lyrics/sources_test.go b/core/lyrics/sources_test.go index 3dd2825e6..a110390d8 100644 --- a/core/lyrics/sources_test.go +++ b/core/lyrics/sources_test.go @@ -106,10 +106,10 @@ var _ = Describe("sources", func() { Expect(lyrics[0].Line[0].Cue).To(HaveLen(3)) Expect(*lyrics[0].Line[0].Cue[0].Start).To(Equal(int64(1000))) Expect(lyrics[0].Line[0].Cue[0].Value).To(Equal("Some ")) - Expect(*lyrics[0].Line[0].Cue[0].End).To(Equal(int64(1500))) + Expect(lyrics[0].Line[0].Cue[0].End).To(BeNil()) Expect(*lyrics[0].Line[0].Cue[1].Start).To(Equal(int64(1500))) Expect(lyrics[0].Line[0].Cue[1].Value).To(Equal("lyrics ")) - Expect(*lyrics[0].Line[0].Cue[1].End).To(Equal(int64(2000))) + Expect(lyrics[0].Line[0].Cue[1].End).To(BeNil()) Expect(*lyrics[0].Line[0].Cue[2].Start).To(Equal(int64(2000))) Expect(lyrics[0].Line[0].Cue[2].Value).To(Equal("here")) 
Expect(lyrics[0].Line[0].Cue[2].End).To(BeNil()) @@ -125,6 +125,33 @@ var _ = Describe("sources", func() { Expect(lyrics[0].Line[2].Cue).To(BeNil()) }) + It("should return Enhanced LRC lyrics from an ELRC file", func() { + mf := model.MediaFile{Path: "tests/fixtures/test.mp3"} + lyrics, err := fromExternalFile(ctx, &mf, ".elrc") + + Expect(err).To(BeNil()) + Expect(lyrics).To(HaveLen(1)) + Expect(lyrics[0].DisplayArtist).To(Equal("ELRC Artist")) + Expect(lyrics[0].DisplayTitle).To(Equal("ELRC Song")) + Expect(lyrics[0].Lang).To(Equal("eng")) + Expect(lyrics[0].Synced).To(BeTrue()) + Expect(lyrics[0].Line).To(HaveLen(2)) + + Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(1000)))) + Expect(lyrics[0].Line[0].Value).To(Equal("Lead words")) + Expect(lyrics[0].Line[0].Cue).To(HaveLen(2)) + Expect(*lyrics[0].Line[0].Cue[0].Start).To(Equal(int64(1000))) + Expect(lyrics[0].Line[0].Cue[0].Value).To(Equal("Lead ")) + Expect(lyrics[0].Line[0].Cue[0].End).To(BeNil()) + Expect(*lyrics[0].Line[0].Cue[1].Start).To(Equal(int64(1500))) + Expect(lyrics[0].Line[0].Cue[1].Value).To(Equal("words")) + Expect(lyrics[0].Line[0].Cue[1].End).To(BeNil()) + + Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(3000)))) + Expect(lyrics[0].Line[1].Value).To(Equal("Fallback line")) + Expect(lyrics[0].Line[1].Cue).To(BeNil()) + }) + It("should return unsynchronized lyrics from a file", func() { mf := model.MediaFile{Path: "tests/fixtures/test.mp3"} lyrics, err := fromExternalFile(ctx, &mf, ".txt") @@ -146,6 +173,31 @@ var _ = Describe("sources", func() { })) }) + It("should return synchronized lyrics from an SRT file", func() { + mf := model.MediaFile{Path: "tests/fixtures/test.mp3"} + lyrics, err := fromExternalFile(ctx, &mf, ".srt") + + Expect(err).To(BeNil()) + Expect(lyrics).To(Equal(model.LyricList{ + model.Lyrics{ + Lang: "xxx", + Line: []model.Line{ + { + Start: gg.P(int64(18800)), + End: gg.P(int64(22800)), + Value: "We're from subtitles", + }, + { + Start: gg.P(int64(22801)), + End: 
gg.P(int64(26000)), + Value: "Another subtitle line", + }, + }, + Synced: true, + }, + })) + }) + It("should return synchronized multilingual lyrics from a TTML file", func() { mf := model.MediaFile{Path: "tests/fixtures/test.mp3"} lyrics, err := fromExternalFile(ctx, &mf, ".ttml") diff --git a/core/lyrics/srt.go b/core/lyrics/srt.go new file mode 100644 index 000000000..8fd77abb4 --- /dev/null +++ b/core/lyrics/srt.go @@ -0,0 +1,161 @@ +package lyrics + +import ( + "bytes" + "regexp" + "strconv" + "strings" + + "github.com/navidrome/navidrome/model" + "github.com/navidrome/navidrome/utils/str" +) + +var srtTimeRegex = regexp.MustCompile(`^\s*(\d{1,2}):(\d{2}):(\d{2})[,.](\d{1,3})\s*$`) + +func parseSRT(contents []byte) (model.LyricList, error) { + raw := strings.ReplaceAll(string(contents), "\r\n", "\n") + raw = strings.ReplaceAll(raw, "\r", "\n") + + blocks := splitSRTBlocks(raw) + lines := make([]model.Line, 0, len(blocks)) + + for _, block := range blocks { + line, ok, err := parseSRTBlock(block) + if err != nil { + return nil, err + } + if ok { + lines = append(lines, line) + } + } + + if len(lines) == 0 { + return nil, nil + } + + lyrics := model.NormalizeLyrics(model.Lyrics{ + Lang: "xxx", + Line: lines, + Synced: true, + }) + return model.LyricList{lyrics}, nil +} + +func splitSRTBlocks(raw string) []string { + raw = strings.TrimSpace(raw) + if raw == "" { + return nil + } + + parts := strings.Split(raw, "\n\n") + blocks := make([]string, 0, len(parts)) + for _, part := range parts { + part = strings.TrimSpace(part) + if part != "" { + blocks = append(blocks, part) + } + } + return blocks +} + +func parseSRTBlock(block string) (model.Line, bool, error) { + scanner := bytes.Split([]byte(block), []byte("\n")) + if len(scanner) == 0 { + return model.Line{}, false, nil + } + + lines := make([]string, 0, len(scanner)) + for _, line := range scanner { + lines = append(lines, strings.TrimSpace(string(line))) + } + + if len(lines) == 0 { + return model.Line{}, 
false, nil + } + + startIdx := 0 + if digitsOnly(lines[0]) { + startIdx = 1 + } + if startIdx >= len(lines) { + return model.Line{}, false, nil + } + + timing := strings.Split(lines[startIdx], "-->") + if len(timing) != 2 { + return model.Line{}, false, nil + } + + startMs, err := parseSRTTime(timing[0]) + if err != nil { + return model.Line{}, false, err + } + endMs, err := parseSRTTime(timing[1]) + if err != nil { + return model.Line{}, false, err + } + + textLines := make([]string, 0, len(lines)-startIdx-1) + for _, line := range lines[startIdx+1:] { + if line == "" { + continue + } + textLines = append(textLines, line) + } + + value := str.SanitizeText(strings.Join(textLines, "\n")) + if value == "" { + return model.Line{}, false, nil + } + + return model.Line{ + Start: &startMs, + End: &endMs, + Value: value, + }, true, nil +} + +func parseSRTTime(value string) (int64, error) { + match := srtTimeRegex.FindStringSubmatch(strings.TrimSpace(value)) + if match == nil { + return 0, strconv.ErrSyntax + } + + hours, err := strconv.ParseInt(match[1], 10, 64) + if err != nil { + return 0, err + } + minutes, err := strconv.ParseInt(match[2], 10, 64) + if err != nil { + return 0, err + } + seconds, err := strconv.ParseInt(match[3], 10, 64) + if err != nil { + return 0, err + } + millis, err := strconv.ParseInt(match[4], 10, 64) + if err != nil { + return 0, err + } + + switch len(match[4]) { + case 1: + millis *= 100 + case 2: + millis *= 10 + } + + return (((hours*60)+minutes)*60+seconds)*1000 + millis, nil +} + +func digitsOnly(value string) bool { + if value == "" { + return false + } + for _, ch := range value { + if ch < '0' || ch > '9' { + return false + } + } + return true +} diff --git a/core/lyrics/ttml.go b/core/lyrics/ttml.go index a0bdcac5a..e79dfe846 100644 --- a/core/lyrics/ttml.go +++ b/core/lyrics/ttml.go @@ -46,6 +46,7 @@ type ttmlTimingParams struct { type ttmlTimingContext struct { lang string role string + agentID string begin int64 hasBegin bool end 
int64 @@ -70,6 +71,12 @@ type ttmlResolvedMetadataLine struct { line model.Line } +type ttmlDefinedAgent struct { + ID string + Type string + Name string +} + type ttmlParser struct { decoder *xml.Decoder params ttmlTimingParams @@ -86,6 +93,8 @@ type ttmlParser struct { pronunciationLangOrder []string pronunciationEntriesByLg map[string][]ttmlMetadataEntry + definedAgents map[string]ttmlDefinedAgent + metadataSeq int } @@ -103,6 +112,7 @@ func parseTTML(contents []byte) (model.LyricList, error) { mainLineRefsByKey: make(map[string]ttmlLineRef), translationEntriesByLg: make(map[string][]ttmlMetadataEntry), pronunciationEntriesByLg: make(map[string][]ttmlMetadataEntry), + definedAgents: make(map[string]ttmlDefinedAgent), } root := ttmlTimingContext{lang: "xxx"} @@ -140,6 +150,8 @@ func (p *ttmlParser) parseElement(start xml.StartElement, parent ttmlTimingConte return p.parseMetadataTrack(start, parent, ttmlLyricKindTranslation) case "transliteration": return p.parseMetadataTrack(start, parent, ttmlLyricKindPronunciation) + case "agent": + return p.parseAgentDefinition(start) } ctx := p.childContext(start.Attr, parent) @@ -234,6 +246,49 @@ func (p *ttmlParser) parseMetadataTrack(start xml.StartElement, parent ttmlTimin } } +func (p *ttmlParser) parseAgentDefinition(start xml.StartElement) error { + id, ok := attrValue(start.Attr, "id") + id = strings.TrimSpace(id) + if !ok || id == "" { + return p.skipElement(start) + } + + agent := ttmlDefinedAgent{ + ID: id, + Type: strings.ToLower(strings.TrimSpace(attrOrEmpty(start.Attr, "type"))), + } + + for { + token, err := p.decoder.Token() + if err != nil { + return err + } + + switch t := token.(type) { + case xml.StartElement: + if strings.EqualFold(t.Name.Local, "name") { + name, err := p.collectElementText(t) + if err != nil { + return err + } + name = sanitizeTTMLText(name) + if name != "" && agent.Name == "" { + agent.Name = name + } + continue + } + if err := p.skipElement(t); err != nil { + return err + } + case 
xml.EndElement: + if strings.EqualFold(t.Name.Local, start.Name.Local) { + p.definedAgents[agent.ID] = agent + return nil + } + } + } +} + func (p *ttmlParser) parseMetadataText(start xml.StartElement, parent ttmlTimingContext) (ttmlMetadataEntry, bool, error) { forKey, hasFor := attrValue(start.Attr, "for") forKey = strings.TrimSpace(forKey) @@ -338,8 +393,8 @@ func (p *ttmlParser) parseInlineElement(start xml.StartElement, parent ttmlTimin tokenText := sanitizeTTMLText(value) if local == "span" && hasOwnTiming && !ctx.invalid && tokenText != "" && len(tokens) == 0 { parsedToken := model.Cue{ - Value: tokenText, - Role: ctx.role, + Value: tokenText, + AgentID: p.resolveCueAgentID(ctx), } if ctx.hasBegin { startMs := ctx.begin @@ -366,12 +421,12 @@ func (p *ttmlParser) toLyricList() model.LyricList { if len(lines) == 0 { continue } - res = append(res, model.Lyrics{ + res = append(res, p.finalizeLyrics(model.Lyrics{ Kind: ttmlLyricKindMain, Lang: lang, Line: lines, Synced: linesAreSynced(lines), - }) + })) } res = append(res, p.buildMetadataLyrics(ttmlLyricKindTranslation, p.translationLangOrder, p.translationEntriesByLg)...) 
@@ -440,17 +495,168 @@ func (p *ttmlParser) buildMetadataLyrics(kind string, langOrder []string, entrie lines[i] = resolved[i].line } - res = append(res, model.Lyrics{ + res = append(res, p.finalizeLyrics(model.Lyrics{ Kind: kind, Lang: lang, Line: lines, Synced: linesAreSynced(lines), - }) + })) } return res } +func (p *ttmlParser) finalizeLyrics(lyrics model.Lyrics) model.Lyrics { + lyrics.Line = model.NormalizeCueLines(lyrics.Line) + lyrics.Line, lyrics.Agents = p.resolveAgents(lyrics.Line) + return model.NormalizeLyrics(lyrics) +} + +func (p *ttmlParser) resolveAgents(lines []model.Line) ([]model.Line, []model.Agent) { + if len(lines) == 0 { + return lines, nil + } + + normalized := model.NormalizeCueLines(lines) + usedOrder := make([]string, 0, 4) + usedSet := make(map[string]struct{}, 4) + sawEmptyCue := false + + for i := range normalized { + for j := range normalized[i].Cue { + agentID := strings.TrimSpace(normalized[i].Cue[j].AgentID) + if agentID == "" { + sawEmptyCue = true + continue + } + if _, exists := usedSet[agentID]; !exists { + usedSet[agentID] = struct{}{} + usedOrder = append(usedOrder, agentID) + } + } + } + + if len(usedOrder) == 0 { + return normalized, nil + } + + mainID := "" + for _, agentID := range usedOrder { + role := p.baseRoleForAgent(agentID) + if role != "bg" && role != "group" { + mainID = agentID + break + } + } + if mainID == "" && sawEmptyCue { + mainID = "main" + } + if mainID == "" { + for _, agentID := range usedOrder { + if p.baseRoleForAgent(agentID) != "bg" { + mainID = agentID + break + } + } + } + if mainID == "" { + mainID = usedOrder[0] + } + + if _, exists := usedSet[mainID]; !exists { + usedSet[mainID] = struct{}{} + usedOrder = append([]string{mainID}, usedOrder...) 
+ } + + for i := range normalized { + for j := range normalized[i].Cue { + if strings.TrimSpace(normalized[i].Cue[j].AgentID) == "" { + normalized[i].Cue[j].AgentID = mainID + } + } + } + + agents := make([]model.Agent, 0, len(usedOrder)) + for _, agentID := range usedOrder { + role := p.baseRoleForAgent(agentID) + if agentID == mainID { + role = "main" + } + agent := model.Agent{ + ID: agentID, + Role: role, + Name: p.agentNameForID(agentID), + } + agents = append(agents, agent) + } + + return normalized, agents +} + +func (p *ttmlParser) resolveCueAgentID(ctx ttmlTimingContext) string { + agentID := strings.TrimSpace(ctx.agentID) + if contextHasRole(ctx.role, "x-bg") { + if agentID == "" { + agentID = "main" + } + return backgroundAgentID(agentID) + } + return agentID +} + +func (p *ttmlParser) baseRoleForAgent(agentID string) string { + if isBackgroundAgentID(agentID) { + return "bg" + } + + if agent, ok := p.definedAgents[agentID]; ok { + switch agent.Type { + case "group": + return "group" + default: + return "voice" + } + } + + return "voice" +} + +func (p *ttmlParser) agentNameForID(agentID string) string { + if isBackgroundAgentID(agentID) { + baseID := strings.TrimSuffix(agentID, "__bg") + if baseID == "main" { + return "" + } + if agent, ok := p.definedAgents[baseID]; ok { + return agent.Name + } + return "" + } + + if agent, ok := p.definedAgents[agentID]; ok { + return agent.Name + } + + return "" +} + +func backgroundAgentID(agentID string) string { + return agentID + "__bg" +} + +func isBackgroundAgentID(agentID string) bool { + return strings.HasSuffix(agentID, "__bg") +} + +func contextHasRole(roles string, role string) bool { + for _, candidate := range strings.Fields(strings.ToLower(roles)) { + if candidate == strings.ToLower(role) { + return true + } + } + return false +} + func (p *ttmlParser) addMainLine(lang string, lineKey string, line model.Line) { lang = normalizeTTMLLang(lang) if _, ok := p.mainLinesByLang[lang]; !ok { @@ -495,6 +701,9 @@ 
func (p *ttmlParser) childContext(attrs []xml.Attr, parent ttmlTimingContext) tt if lang, ok := attrValue(attrs, "lang"); ok { ctx.lang = normalizeTTMLLang(lang) } + if agentID, ok := attrValue(attrs, "agent"); ok { + ctx.agentID = strings.TrimSpace(agentID) + } if role, ok := attrValue(attrs, "role"); ok { role = strings.TrimSpace(role) if role != "" { @@ -805,6 +1014,55 @@ func attrValue(attrs []xml.Attr, key string) (string, bool) { return "", false } +func attrOrEmpty(attrs []xml.Attr, key string) string { + value, _ := attrValue(attrs, key) + return value +} + +func (p *ttmlParser) collectElementText(start xml.StartElement) (string, error) { + var text strings.Builder + + for { + token, err := p.decoder.Token() + if err != nil { + return "", err + } + + switch t := token.(type) { + case xml.StartElement: + value, err := p.collectElementText(t) + if err != nil { + return "", err + } + text.WriteString(value) + case xml.EndElement: + if strings.EqualFold(t.Name.Local, start.Name.Local) { + return text.String(), nil + } + case xml.CharData: + text.WriteString(string(t)) + } + } +} + +func (p *ttmlParser) skipElement(_ xml.StartElement) error { + depth := 1 + for depth > 0 { + token, err := p.decoder.Token() + if err != nil { + return err + } + + switch token.(type) { + case xml.StartElement: + depth++ + case xml.EndElement: + depth-- + } + } + return nil +} + func normalizeTTMLLang(lang string) string { lang = strings.ToLower(strings.TrimSpace(lang)) if lang == "" { @@ -840,42 +1098,7 @@ func linesAreSynced(lines []model.Line) bool { } func hydrateLineTimingFromTokens(line model.Line) model.Line { - if len(line.Cue) == 0 { - return line - } - - var earliestStart *int64 - var latestEnd *int64 - for i := range line.Cue { - token := line.Cue[i] - if token.Start != nil { - if earliestStart == nil || *token.Start < *earliestStart { - v := *token.Start - earliestStart = &v - } - } - - candidateEnd := token.End - if candidateEnd == nil { - candidateEnd = token.Start - } 
- if candidateEnd != nil { - if latestEnd == nil || *candidateEnd > *latestEnd { - v := *candidateEnd - latestEnd = &v - } - } - } - - if line.Start == nil && earliestStart != nil { - v := *earliestStart - line.Start = &v - } - if line.End == nil && latestEnd != nil { - v := *latestEnd - line.End = &v - } - return line + return model.NormalizeLineTiming(line) } func max(v float64, fallback float64) float64 { diff --git a/core/lyrics/ttml_test.go b/core/lyrics/ttml_test.go index 8ec16f679..5fc484a3b 100644 --- a/core/lyrics/ttml_test.go +++ b/core/lyrics/ttml_test.go @@ -129,6 +129,10 @@ var _ = Describe("parseTTML", func() { list, err := parseTTML(content) Expect(err).ToNot(HaveOccurred()) Expect(list).To(HaveLen(1)) + Expect(list[0].Agents).To(Equal([]model.Agent{ + {ID: "main", Role: "main"}, + {ID: "main__bg", Role: "bg"}, + })) Expect(list[0].Line).To(HaveLen(1)) line := list[0].Line[0] @@ -137,9 +141,41 @@ var _ = Describe("parseTTML", func() { Expect(line.End).To(Equal(gg.P(int64(3000)))) Expect(line.Cue).To(HaveLen(3)) - Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(1000)), End: gg.P(int64(1400)), Value: "He"})) - Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(1400)), End: gg.P(int64(1800)), Value: "llo"})) - Expect(line.Cue[2]).To(Equal(model.Cue{Start: gg.P(int64(2000)), End: gg.P(int64(2500)), Value: "echo", Role: "x-bg"})) + Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(1000)), End: gg.P(int64(1400)), Value: "He", AgentID: "main"})) + Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(1400)), End: gg.P(int64(1800)), Value: "llo", AgentID: "main"})) + Expect(line.Cue[2]).To(Equal(model.Cue{Start: gg.P(int64(2000)), End: gg.P(int64(2500)), Value: "echo", AgentID: "main__bg"})) + }) + + It("should parse named TTML agents into main, voice, and group roles", func() { + content := []byte(` + + + + Chris Martin + Jin + All + + + +
    +

    You

    +

    and

    +

    All

    +
    + +
    `) + + list, err := parseTTML(content) + Expect(err).ToNot(HaveOccurred()) + Expect(list).To(HaveLen(1)) + Expect(list[0].Agents).To(Equal([]model.Agent{ + {ID: "v1", Role: "main", Name: "Chris Martin"}, + {ID: "v2", Role: "voice", Name: "Jin"}, + {ID: "v1000", Role: "group", Name: "All"}, + })) + Expect(list[0].Line[0].Cue[0].AgentID).To(Equal("v1")) + Expect(list[0].Line[1].Cue[0].AgentID).To(Equal("v2")) + Expect(list[0].Line[2].Cue[0].AgentID).To(Equal("v1000")) }) }) diff --git a/model/lyrics.go b/model/lyrics.go index 9fcd4992e..725c3aa94 100644 --- a/model/lyrics.go +++ b/model/lyrics.go @@ -12,10 +12,16 @@ import ( ) type Cue struct { - Start *int64 `structs:"start,omitempty" json:"start,omitempty"` - End *int64 `structs:"end,omitempty" json:"end,omitempty"` - Value string `structs:"value" json:"value"` - Role string `structs:"role,omitempty" json:"role,omitempty"` + Start *int64 `structs:"start,omitempty" json:"start,omitempty"` + End *int64 `structs:"end,omitempty" json:"end,omitempty"` + Value string `structs:"value" json:"value"` + AgentID string `structs:"agentId,omitempty" json:"agentId,omitempty"` +} + +type Agent struct { + ID string `structs:"id" json:"id"` + Role string `structs:"role" json:"role"` + Name string `structs:"name,omitempty" json:"name,omitempty"` } type Line struct { @@ -26,13 +32,14 @@ type Line struct { } type Lyrics struct { - DisplayArtist string `structs:"displayArtist,omitempty" json:"displayArtist,omitempty"` - DisplayTitle string `structs:"displayTitle,omitempty" json:"displayTitle,omitempty"` - Kind string `structs:"kind,omitempty" json:"kind,omitempty"` - Lang string `structs:"lang" json:"lang"` - Line []Line `structs:"line" json:"line"` - Offset *int64 `structs:"offset,omitempty" json:"offset,omitempty"` - Synced bool `structs:"synced" json:"synced"` + DisplayArtist string `structs:"displayArtist,omitempty" json:"displayArtist,omitempty"` + DisplayTitle string `structs:"displayTitle,omitempty" 
json:"displayTitle,omitempty"` + Kind string `structs:"kind,omitempty" json:"kind,omitempty"` + Lang string `structs:"lang" json:"lang"` + Agents []Agent `structs:"agents,omitempty" json:"agents,omitempty"` + Line []Line `structs:"line" json:"line"` + Offset *int64 `structs:"offset,omitempty" json:"offset,omitempty"` + Synced bool `structs:"synced" json:"synced"` } // support the standard [mm:ss.mm], as well as [hh:*] and [*.mmm] @@ -199,7 +206,7 @@ func ToLyrics(language, text string) (*Lyrics, error) { DisplayArtist: artist, DisplayTitle: title, Lang: language, - Line: structuredLines, + Line: NormalizeCueLines(structuredLines), Offset: offset, Synced: synced, } @@ -265,11 +272,6 @@ func parseEnhancedCues(text string) []Cue { Start: &start, Value: seg.text, } - // Derive End from the next cue's Start - if i+1 < len(segments) { - end := segments[i+1].start - cues[i].End = &end - } } return cues } @@ -338,3 +340,127 @@ func parseTime(line string, match []int) (int64, error) { } type LyricList []Lyrics + +func NormalizeLyrics(lyrics Lyrics) Lyrics { + lyrics.Line = NormalizeCueLines(lyrics.Line) + if len(lyrics.Agents) == 0 { + lyrics.Agents = nil + } + return lyrics +} + +func NormalizeCueLines(lines []Line) []Line { + if len(lines) == 0 { + return lines + } + + normalized := make([]Line, len(lines)) + copy(normalized, lines) + + for i := range normalized { + var fallbackEnd *int64 + if normalized[i].End != nil { + v := *normalized[i].End + fallbackEnd = &v + } else if i+1 < len(normalized) && normalized[i+1].Start != nil { + v := *normalized[i+1].Start + fallbackEnd = &v + } + + normalized[i] = normalizeCueLine(normalized[i], fallbackEnd) + } + + return normalized +} + +func NormalizeLineTiming(line Line) Line { + if len(line.Cue) == 0 { + return line + } + + var earliestStart *int64 + var latestEnd *int64 + for i := range line.Cue { + token := line.Cue[i] + if token.Start != nil { + if earliestStart == nil || *token.Start < *earliestStart { + v := *token.Start + 
earliestStart = &v + } + } + + candidateEnd := token.End + if candidateEnd == nil { + candidateEnd = token.Start + } + if candidateEnd != nil { + if latestEnd == nil || *candidateEnd > *latestEnd { + v := *candidateEnd + latestEnd = &v + } + } + } + + if line.Start == nil && earliestStart != nil { + v := *earliestStart + line.Start = &v + } + if line.End == nil && latestEnd != nil { + v := *latestEnd + line.End = &v + } + return line +} + +func normalizeCueLine(line Line, fallbackEnd *int64) Line { + if len(line.Cue) == 0 { + return line + } + + hasAnyEnd := false + for i := range line.Cue { + if line.Cue[i].End != nil { + hasAnyEnd = true + break + } + } + if !hasAnyEnd { + line.Cue = clearCueEnds(line.Cue) + return NormalizeLineTiming(line) + } + + for i := range line.Cue { + if line.Cue[i].End != nil { + continue + } + + if i+1 < len(line.Cue) && line.Cue[i+1].Start != nil { + v := *line.Cue[i+1].Start + line.Cue[i].End = &v + continue + } + + if fallbackEnd != nil { + v := *fallbackEnd + line.Cue[i].End = &v + } + } + + for i := range line.Cue { + if line.Cue[i].End == nil { + line.Cue = clearCueEnds(line.Cue) + return NormalizeLineTiming(line) + } + } + + return NormalizeLineTiming(line) +} + +func clearCueEnds(cues []Cue) []Cue { + normalized := make([]Cue, len(cues)) + copy(normalized, cues) + for i := range normalized { + normalized[i].End = nil + } + return normalized +} diff --git a/model/lyrics_test.go b/model/lyrics_test.go index 2228306d0..9aad7d968 100644 --- a/model/lyrics_test.go +++ b/model/lyrics_test.go @@ -129,8 +129,8 @@ var _ = Describe("ToLyrics", func() { Expect(line0.Start).To(Equal(&t1000)) Expect(line0.Value).To(Equal("Some lyrics here")) Expect(line0.Cue).To(Equal([]Cue{ - {Start: &t1000, End: &t1500, Value: "Some "}, - {Start: &t1500, End: &t2000, Value: "lyrics "}, + {Start: &t1000, Value: "Some "}, + {Start: &t1500, Value: "lyrics "}, {Start: &t2000, Value: "here"}, })) @@ -138,7 +138,7 @@ var _ = Describe("ToLyrics", func() { 
Expect(line1.Start).To(Equal(&t3000)) Expect(line1.Value).To(Equal("More words")) Expect(line1.Cue).To(Equal([]Cue{ - {Start: &t3000, End: &t3500, Value: "More "}, + {Start: &t3000, Value: "More "}, {Start: &t3500, Value: "words"}, })) }) @@ -161,7 +161,7 @@ var _ = Describe("ToLyrics", func() { t1000, t1500, t5000, t5500 := int64(1000), int64(1500), int64(5000), int64(5500) Expect(lyrics.Line[0].Cue).To(Equal([]Cue{ - {Start: &t1000, End: &t1500, Value: "Some "}, + {Start: &t1000, Value: "Some "}, {Start: &t1500, Value: "lyrics"}, })) Expect(lyrics.Line[0].Value).To(Equal("Some lyrics")) @@ -170,7 +170,7 @@ var _ = Describe("ToLyrics", func() { Expect(lyrics.Line[1].Value).To(Equal("Plain line")) Expect(lyrics.Line[2].Cue).To(Equal([]Cue{ - {Start: &t5000, End: &t5500, Value: "More "}, + {Start: &t5000, Value: "More "}, {Start: &t5500, Value: "words"}, })) Expect(lyrics.Line[2].Value).To(Equal("More words")) diff --git a/server/subsonic/helpers.go b/server/subsonic/helpers.go index b881fa169..ad769ee94 100644 --- a/server/subsonic/helpers.go +++ b/server/subsonic/helpers.go @@ -493,14 +493,22 @@ func mapExplicitStatus(explicitStatus string) string { return "" } -// sanitizeRole strips the TTML x- prefix from role values for the API. 
-func sanitizeRole(role string) string { - return strings.TrimPrefix(role, "x-") -} - func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced bool) responses.StructuredLyric { lines := make([]responses.Line, len(lyrics.Line)) var cueLines []responses.CueLine + agentOrderByID := make(map[string]int, len(lyrics.Agents)) + agentRoleByID := make(map[string]string, len(lyrics.Agents)) + responseAgents := make([]responses.Agent, 0, len(lyrics.Agents)) + + for i, agent := range lyrics.Agents { + agentOrderByID[agent.ID] = i + agentRoleByID[agent.ID] = agent.Role + responseAgents = append(responseAgents, responses.Agent{ + ID: agent.ID, + Role: agent.Role, + Name: agent.Name, + }) + } for i, line := range lyrics.Line { lines[i] = responses.Line{ @@ -511,41 +519,50 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced boo continue } - // Group cues by role, preserving order of first appearance - roleOrder := make([]string, 0, 2) - cuesByRole := make(map[string][]responses.LyricCue) + agentOrder := make([]string, 0, 2) + cuesByAgent := make(map[string][]model.Cue) for _, cue := range line.Cue { if cue.Start == nil { continue } - role := sanitizeRole(cue.Role) - if _, exists := cuesByRole[role]; !exists { - roleOrder = append(roleOrder, role) + agentID := strings.TrimSpace(cue.AgentID) + if _, exists := cuesByAgent[agentID]; !exists { + agentOrder = append(agentOrder, agentID) } - cuesByRole[role] = append(cuesByRole[role], responses.LyricCue{ - Start: *cue.Start, - End: cue.End, - Value: cue.Value, - }) + cuesByAgent[agentID] = append(cuesByAgent[agentID], cue) } - // Ensure main vocals (empty role) always comes first - sort.SliceStable(roleOrder, func(i, j int) bool { - return roleOrder[i] == "" && roleOrder[j] != "" + sort.SliceStable(agentOrder, func(i, j int) bool { + leftRole := agentRoleByID[agentOrder[i]] + rightRole := agentRoleByID[agentOrder[j]] + if leftRole == "main" && rightRole != "main" { + return true + } + if 
rightRole == "main" && leftRole != "main" { + return false + } + + leftOrder, leftOK := agentOrderByID[agentOrder[i]] + rightOrder, rightOK := agentOrderByID[agentOrder[j]] + if leftOK && rightOK && leftOrder != rightOrder { + return leftOrder < rightOrder + } + if leftOK != rightOK { + return leftOK + } + return i < j }) - // Create a separate CueLine for each role group - for _, role := range roleOrder { - cues := cuesByRole[role] + for _, agentID := range agentOrder { cueLine := responses.CueLine{ Index: int32(i), Start: line.Start, End: line.End, Value: line.Value, - Cue: cues, + Cue: buildLyricCues(cuesByAgent[agentID], line.End), } - if role != "" { - cueLine.Role = role + if agentID != "" { + cueLine.AgentID = agentID } cueLines = append(cueLines, cueLine) } @@ -567,6 +584,9 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced boo kind = "main" } structured.Kind = kind + if len(cueLines) > 0 && len(responseAgents) > 0 { + structured.Agents = responseAgents + } } if structured.DisplayArtist == "" { @@ -579,6 +599,67 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced boo return structured } +func buildLyricCues(cues []model.Cue, lineEnd *int64) []responses.LyricCue { + if len(cues) == 0 { + return nil + } + + hasAnyEnd := false + for i := range cues { + if cues[i].End != nil { + hasAnyEnd = true + break + } + } + + normalized := make([]responses.LyricCue, 0, len(cues)) + for i := range cues { + if cues[i].Start == nil { + continue + } + + cue := responses.LyricCue{ + Start: *cues[i].Start, + Value: cues[i].Value, + } + if hasAnyEnd { + end := cues[i].End + if end == nil { + if i+1 < len(cues) && cues[i+1].Start != nil { + v := *cues[i+1].Start + end = &v + } else if lineEnd != nil { + v := *lineEnd + end = &v + } + } + if end != nil && i+1 < len(cues) && cues[i+1].Start != nil && *end > *cues[i+1].Start { + v := *cues[i+1].Start + end = &v + } + if end != nil && *end < cue.Start { + v := cue.Start + end = 
&v + } + cue.End = end + } + normalized = append(normalized, cue) + } + + if hasAnyEnd { + for i := range normalized { + if normalized[i].End == nil { + for j := range normalized { + normalized[j].End = nil + } + break + } + } + } + + return normalized +} + func buildLyricsList(mf *model.MediaFile, lyricsList model.LyricList, enhanced bool) *responses.LyricsList { var filtered model.LyricList if enhanced { diff --git a/server/subsonic/media_retrieval_test.go b/server/subsonic/media_retrieval_test.go index 0fdbb3854..5489492ce 100644 --- a/server/subsonic/media_retrieval_test.go +++ b/server/subsonic/media_retrieval_test.go @@ -235,6 +235,7 @@ var _ = Describe("MediaRetrievalController", func() { Expect(realLyric.Kind).To(Equal(expectedLyric.Kind)) Expect(realLyric.Lang).To(Equal(expectedLyric.Lang)) Expect(realLyric.Synced).To(Equal(expectedLyric.Synced)) + Expect(realLyric.Agents).To(Equal(expectedLyric.Agents)) if expectedLyric.Offset == nil { Expect(realLyric.Offset).To(BeNil()) @@ -259,7 +260,7 @@ var _ = Describe("MediaRetrievalController", func() { expectedCueLine := expectedLyric.CueLine[j] Expect(realCueLine.Index).To(Equal(expectedCueLine.Index)) Expect(realCueLine.Value).To(Equal(expectedCueLine.Value)) - Expect(realCueLine.Role).To(Equal(expectedCueLine.Role)) + Expect(realCueLine.AgentID).To(Equal(expectedCueLine.AgentID)) if expectedCueLine.Start == nil { Expect(realCueLine.Start).To(BeNil()) } else { @@ -542,6 +543,7 @@ var _ = Describe("MediaRetrievalController", func() { lyricsJson, err := json.Marshal(model.LyricList{ { Lang: "eng", + Agents: []model.Agent{{ID: "lead", Role: "main"}, {ID: "lead__bg", Role: "bg"}}, Synced: true, Line: []model.Line{ { @@ -550,15 +552,16 @@ var _ = Describe("MediaRetrievalController", func() { Value: "Hello echo", Cue: []model.Cue{ { - Start: &tokenStartA, - End: &tokenEndA, - Value: "Hello", + Start: &tokenStartA, + End: &tokenEndA, + Value: "Hello", + AgentID: "lead", }, { - Start: &tokenStartB, - End: &tokenEndB, - 
Value: "echo", - Role: "x-bg", + Start: &tokenStartB, + End: &tokenEndB, + Value: "echo", + AgentID: "lead__bg", }, }, }, @@ -586,6 +589,10 @@ var _ = Describe("MediaRetrievalController", func() { Kind: "main", Lang: "eng", Synced: true, + Agents: []responses.Agent{ + {ID: "lead", Role: "main"}, + {ID: "lead__bg", Role: "bg"}, + }, Line: []responses.Line{ { Start: &lineStart, @@ -594,10 +601,11 @@ var _ = Describe("MediaRetrievalController", func() { }, CueLine: []responses.CueLine{ { - Index: 0, - Start: &lineStart, - End: &lineEnd, - Value: "Hello echo", + Index: 0, + Start: &lineStart, + End: &lineEnd, + Value: "Hello echo", + AgentID: "lead", Cue: []responses.LyricCue{ { Start: tokenStartA, @@ -607,11 +615,11 @@ var _ = Describe("MediaRetrievalController", func() { }, }, { - Index: 0, - Start: &lineStart, - End: &lineEnd, - Value: "Hello echo", - Role: "bg", + Index: 0, + Start: &lineStart, + End: &lineEnd, + Value: "Hello echo", + AgentID: "lead__bg", Cue: []responses.LyricCue{ { Start: tokenStartB, diff --git a/server/subsonic/responses/responses.go b/server/subsonic/responses/responses.go index f5446a961..344dd9999 100644 --- a/server/subsonic/responses/responses.go +++ b/server/subsonic/responses/responses.go @@ -543,13 +543,19 @@ type LyricCue struct { Value string `xml:",chardata" json:"value"` } +type Agent struct { + ID string `xml:"id,attr" json:"id"` + Role string `xml:"role,attr" json:"role"` + Name string `xml:"name,attr,omitempty" json:"name,omitempty"` +} + type CueLine struct { - Index int32 `xml:"index,attr" json:"index"` - Start *int64 `xml:"start,attr,omitempty" json:"start,omitempty"` - End *int64 `xml:"end,attr,omitempty" json:"end,omitempty"` - Value string `xml:"value,attr,omitempty" json:"value,omitempty"` - Role string `xml:"role,attr,omitempty" json:"role,omitempty"` - Cue []LyricCue `xml:"cue,omitempty" json:"cue,omitempty"` + Index int32 `xml:"index,attr" json:"index"` + Start *int64 `xml:"start,attr,omitempty" json:"start,omitempty"` 
+ End *int64 `xml:"end,attr,omitempty" json:"end,omitempty"` + Value string `xml:"value,attr,omitempty" json:"value,omitempty"` + AgentID string `xml:"agentId,attr,omitempty" json:"agentId,omitempty"` + Cue []LyricCue `xml:"cue,omitempty" json:"cue,omitempty"` } type StructuredLyric struct { @@ -558,6 +564,7 @@ type StructuredLyric struct { Kind string `xml:"kind,attr,omitempty" json:"kind,omitempty"` Lang string `xml:"lang,attr" json:"lang"` Line []Line `xml:"line" json:"line"` + Agents []Agent `xml:"agent,omitempty" json:"agents,omitempty"` CueLine []CueLine `xml:"cueLine,omitempty" json:"cueLine,omitempty"` Offset *int64 `xml:"offset,attr,omitempty" json:"offset,omitempty"` Synced bool `xml:"synced,attr" json:"synced"` diff --git a/tests/fixtures/test.elrc b/tests/fixtures/test.elrc new file mode 100644 index 000000000..01c3d2cdd --- /dev/null +++ b/tests/fixtures/test.elrc @@ -0,0 +1,5 @@ +[ar:ELRC Artist] +[ti:ELRC Song] +[lang:eng] +[00:01.00]<00:01.00>Lead <00:01.50>words +[00:03.00]Fallback line diff --git a/tests/fixtures/test.srt b/tests/fixtures/test.srt new file mode 100644 index 000000000..3c9c09a39 --- /dev/null +++ b/tests/fixtures/test.srt @@ -0,0 +1,7 @@ +1 +00:00:18,800 --> 00:00:22,800 +We're from subtitles + +2 +00:00:22,801 --> 00:00:26,000 +Another subtitle line diff --git a/ui/src/audioplayer/PlayerToolbar.jsx b/ui/src/audioplayer/PlayerToolbar.jsx index 869df475d..8487b0655 100644 --- a/ui/src/audioplayer/PlayerToolbar.jsx +++ b/ui/src/audioplayer/PlayerToolbar.jsx @@ -108,7 +108,7 @@ const PlayerToolbar = ({ ) const toggleLyricsButton = ( - + Array.isArray(lyric.line) && lyric.line.some((line) => Number.isFinite(Number(line.start))) +const preferTimedLyrics = (lyrics) => { + const timed = lyrics.filter(hasTimedLines) + return timed.length > 0 ? 
timed : lyrics +} + const normalizeToken = (token) => { if (!token) { return null @@ -77,10 +82,38 @@ const normalizeToken = (token) => { } } -const normalizeCueLine = (cueLine, fallbackIndex) => { +const buildAgentLookup = (structuredLyric) => { + const lookup = new Map() + const agents = Array.isArray(structuredLyric?.agents) ? structuredLyric.agents : [] + for (const agent of agents) { + const id = typeof agent?.id === 'string' ? agent.id : '' + if (!id || lookup.has(id)) { + continue + } + lookup.set(id, { + id, + role: typeof agent?.role === 'string' ? agent.role : '', + name: typeof agent?.name === 'string' ? agent.name : '', + }) + } + return lookup +} + +const deriveUiRole = (agent) => { + if (!agent?.role || agent.role === 'main') { + return '' + } + return agent.role +} + +const normalizeCueLine = (cueLine, fallbackIndex, agentLookup) => { const index = Number.isFinite(Number(cueLine?.index)) ? Number(cueLine.index) : fallbackIndex + const agentId = typeof cueLine?.agentId === 'string' ? cueLine.agentId : '' + const agent = agentId ? agentLookup.get(agentId) || null : null + const fallbackRole = + typeof cueLine?.role === 'string' ? cueLine.role : '' const tokens = sortTokensByStart( Array.isArray(cueLine?.cue) ? cueLine.cue.map(normalizeToken).filter(Boolean) @@ -92,7 +125,10 @@ const normalizeCueLine = (cueLine, fallbackIndex) => { start: toTime(cueLine?.start), end: toTime(cueLine?.end), value: typeof cueLine?.value === 'string' ? cueLine.value : '', - role: typeof cueLine?.role === 'string' ? cueLine.role : '', + role: agent ? deriveUiRole(agent) : fallbackRole, + agentId, + agentRole: agent?.role || fallbackRole, + agentName: agent?.name || '', tokens, } } @@ -194,6 +230,9 @@ const buildSyntheticWordTokens = (line, token) => { end: baseStart + (duration * (idx + 1)) / chunks.length, value: chunk, role: typeof token?.role === 'string' ? token.role : '', + agentId: typeof token?.agentId === 'string' ? 
token.agentId : '', + agentName: typeof token?.agentName === 'string' ? token.agentName : '', + agentRole: typeof token?.agentRole === 'string' ? token.agentRole : '', })) } @@ -240,8 +279,8 @@ export const selectLyricLayers = (structuredLyrics, preferredLanguage) => { } } - const synced = structuredLyrics.filter(hasTimedLines) - if (synced.length === 0) { + const available = structuredLyrics.filter(hasStructuredLyricContent) + if (available.length === 0) { return { main: null, translation: null, @@ -255,22 +294,25 @@ export const selectLyricLayers = (structuredLyrics, preferredLanguage) => { [LYRIC_KIND_PRONUNCIATION]: [], } - for (const lyric of synced) { + for (const lyric of available) { grouped[normalizeLyricKind(lyric?.kind)].push(lyric) } const mainCandidates = grouped[LYRIC_KIND_MAIN].length ? grouped[LYRIC_KIND_MAIN] - : synced + : available return { - main: pickLyricByLanguage(mainCandidates, preferredLanguage), + main: pickLyricByLanguage( + preferTimedLyrics(mainCandidates), + preferredLanguage, + ), translation: pickLyricByLanguage( - grouped[LYRIC_KIND_TRANSLATION], + preferTimedLyrics(grouped[LYRIC_KIND_TRANSLATION]), preferredLanguage, ), pronunciation: pickLyricByLanguage( - grouped[LYRIC_KIND_PRONUNCIATION], + preferTimedLyrics(grouped[LYRIC_KIND_PRONUNCIATION]), preferredLanguage, ), } @@ -316,6 +358,7 @@ export const buildKaraokeLines = (structuredLyric) => { return [] } + const agentLookup = buildAgentLookup(structuredLyric) const baseLines = Array.isArray(structuredLyric.line) ? structuredLyric.line : [] @@ -328,12 +371,19 @@ export const buildKaraokeLines = (structuredLyric) => { ? 
(() => { const normalizedCueLines = rawCueLines.map( (cueLine, fallbackIndex) => { - const normalized = normalizeCueLine(cueLine, fallbackIndex) + const normalized = normalizeCueLine( + cueLine, + fallbackIndex, + agentLookup, + ) return { ...normalized, tokens: normalized.tokens.map((token) => ({ ...token, role: normalized.role, + agentId: normalized.agentId, + agentName: normalized.agentName, + agentRole: normalized.agentRole, })), } }, @@ -366,6 +416,9 @@ export const buildKaraokeLines = (structuredLyric) => { start: first.start ?? toTime(baseLine.start) ?? fallbackStart, end: first.end ?? toTime(baseLine.end) ?? fallbackEnd, value, + agentId: first.agentId, + agentName: first.agentName, + agentRole: first.agentRole, tokens, } }) diff --git a/ui/src/audioplayer/lyrics.test.js b/ui/src/audioplayer/lyrics.test.js index 6cb3a1b87..3a5f83b2d 100644 --- a/ui/src/audioplayer/lyrics.test.js +++ b/ui/src/audioplayer/lyrics.test.js @@ -124,6 +124,49 @@ describe('lyrics helpers', () => { expect(layers.pronunciation).toBeNull() }) + it('falls back to unsynced lyric content when no timed track exists', () => { + const layers = selectLyricLayers( + [ + { + lang: 'eng', + synced: false, + line: [{ value: 'Plain embedded lyric' }], + }, + ], + 'eng', + ) + + expect(layers.main).toEqual({ + lang: 'eng', + synced: false, + line: [{ value: 'Plain embedded lyric' }], + }) + }) + + it('still prefers timed lyrics when both timed and untimed tracks exist', () => { + const layers = selectLyricLayers( + [ + { + lang: 'eng', + synced: false, + line: [{ value: 'Plain lyric' }], + }, + { + lang: 'eng', + synced: true, + line: [{ start: 1000, value: 'Timed lyric' }], + }, + ], + 'eng', + ) + + expect(layers.main).toEqual({ + lang: 'eng', + synced: true, + line: [{ start: 1000, value: 'Timed lyric' }], + }) + }) + it('matches layer line by timing for the active main line', () => { const mainLines = [ { index: 0, start: 1000, end: 1800, value: 'Line A', tokens: [] }, @@ -200,43 +243,88 @@ 
describe('lyrics helpers', () => { expect(getPreferredLyricLanguage()).toBe('pt-BR') }) - it('builds karaoke lines from cueLine payload', () => { + it('builds karaoke lines from agent-based cueLine payload', () => { + const lines = buildKaraokeLines({ + lang: 'eng', + synced: true, + line: [{ start: 1000, end: 3000, value: 'Hello world' }], + agents: [ + { id: 'lead', role: 'main', name: 'Lead Vocal' }, + { id: 'backing', role: 'bg' }, + ], + cueLine: [ + { + index: 0, + start: 1000, + end: 3000, + value: 'Hello world', + agentId: 'lead', + cue: [{ start: 1000, end: 1500, value: 'Hello' }], + }, + { + index: 0, + start: 1000, + end: 3000, + value: 'Hello world', + agentId: 'backing', + cue: [{ start: 2000, end: 2500, value: 'world' }], + }, + ], + }) + + expect(lines).toEqual([ + { + agentId: 'lead', + agentName: 'Lead Vocal', + agentRole: 'main', + index: 0, + start: 1000, + end: 3000, + value: 'Hello world', + tokens: [ + { + start: 1000, + end: 1500, + value: 'Hello', + role: '', + agentId: 'lead', + agentName: 'Lead Vocal', + agentRole: 'main', + }, + { + start: 2000, + end: 2500, + value: 'world', + role: 'bg', + agentId: 'backing', + agentName: '', + agentRole: 'bg', + }, + ], + }, + ]) + }) + + it('falls back to legacy cueLine role values when agents are absent', () => { const lines = buildKaraokeLines({ lang: 'eng', synced: true, line: [{ start: 1000, end: 3000, value: 'Hello world' }], cueLine: [ - { - index: 0, - start: 1000, - end: 3000, - value: 'Hello world', - role: '', - cue: [{ start: 1000, end: 1500, value: 'Hello' }], - }, { index: 0, start: 1000, end: 3000, value: 'Hello world', role: 'bg', - cue: [{ start: 2000, end: 2500, value: 'world' }], + cue: [{ start: 1000, end: 1500, value: 'Hello' }], }, ], }) - expect(lines).toEqual([ - { - index: 0, - start: 1000, - end: 3000, - value: 'Hello world', - tokens: [ - { start: 1000, end: 1500, value: 'Hello', role: '' }, - { start: 2000, end: 2500, value: 'world', role: 'bg' }, - ], - }, - ]) + 
expect(lines[0].tokens[0].role).toBe('bg') + expect(lines[0].tokens[0].agentId).toBe('') + expect(lines[0].tokens[0].agentName).toBe('') }) it('sorts token timing by start to keep playback stable', () => { From 554074b12052a58e122ed68e8cd359fcc9f13631 Mon Sep 17 00:00:00 2001 From: ranokay Date: Sat, 21 Mar 2026 00:32:01 +0200 Subject: [PATCH 08/14] fix(lyrics): avoid derived TTML agent id collisions --- README.md | 2 +- core/lyrics/ttml.go | 7 +++-- core/lyrics/ttml_test.go | 42 +++++++++++++++++++++++-- server/subsonic/media_retrieval_test.go | 8 ++--- 4 files changed, 49 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 6b9aff799..645f1580d 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ A share of the revenue helps fund the development of Navidrome at no additional - **Multi-platform**, runs on macOS, Linux and Windows. **Docker** images are also provided - Ready to use binaries for all major platforms, including **Raspberry Pi** - Automatically **monitors your library** for changes, importing new files and reloading new metadata - - Supports synchronized lyrics from sidecar **.lrc** and **.ttml** files (via `lyricspriority`) + - Supports lyrics from sidecar **.ttml**, **.elrc**, **.lrc**, **.srt**, **.txt** files and embedded tags (via `lyricspriority`) - **Themeable**, modern and responsive **Web interface** based on [Material UI](https://material-ui.com) - **Compatible** with all Subsonic/Madsonic/Airsonic [clients](https://www.navidrome.org/docs/overview/#apps) - **Transcoding** on the fly. Can be set per user/player. 
**Opus encoding is supported** diff --git a/core/lyrics/ttml.go b/core/lyrics/ttml.go index e79dfe846..adbc0c054 100644 --- a/core/lyrics/ttml.go +++ b/core/lyrics/ttml.go @@ -24,6 +24,7 @@ const ( ttmlLyricKindMain = "main" ttmlLyricKindTranslation = "translation" ttmlLyricKindPronunciation = "pronunciation" + ttmlBackgroundAgentPrefix = "__nd_bg__|" ) var offsetTimeRegex = regexp.MustCompile(`^([0-9]+(?:\.[0-9]+)?)(h|m|s|ms|f|t)$`) @@ -623,7 +624,7 @@ func (p *ttmlParser) baseRoleForAgent(agentID string) string { func (p *ttmlParser) agentNameForID(agentID string) string { if isBackgroundAgentID(agentID) { - baseID := strings.TrimSuffix(agentID, "__bg") + baseID := strings.TrimPrefix(agentID, ttmlBackgroundAgentPrefix) if baseID == "main" { return "" } @@ -641,11 +642,11 @@ func (p *ttmlParser) agentNameForID(agentID string) string { } func backgroundAgentID(agentID string) string { - return agentID + "__bg" + return ttmlBackgroundAgentPrefix + agentID } func isBackgroundAgentID(agentID string) bool { - return strings.HasSuffix(agentID, "__bg") + return strings.HasPrefix(agentID, ttmlBackgroundAgentPrefix) } func contextHasRole(roles string, role string) bool { diff --git a/core/lyrics/ttml_test.go b/core/lyrics/ttml_test.go index 5fc484a3b..4e81197d4 100644 --- a/core/lyrics/ttml_test.go +++ b/core/lyrics/ttml_test.go @@ -131,7 +131,7 @@ var _ = Describe("parseTTML", func() { Expect(list).To(HaveLen(1)) Expect(list[0].Agents).To(Equal([]model.Agent{ {ID: "main", Role: "main"}, - {ID: "main__bg", Role: "bg"}, + {ID: "__nd_bg__|main", Role: "bg"}, })) Expect(list[0].Line).To(HaveLen(1)) @@ -143,7 +143,7 @@ var _ = Describe("parseTTML", func() { Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(1000)), End: gg.P(int64(1400)), Value: "He", AgentID: "main"})) Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(1400)), End: gg.P(int64(1800)), Value: "llo", AgentID: "main"})) - Expect(line.Cue[2]).To(Equal(model.Cue{Start: gg.P(int64(2000)), End: 
gg.P(int64(2500)), Value: "echo", AgentID: "main__bg"})) + Expect(line.Cue[2]).To(Equal(model.Cue{Start: gg.P(int64(2000)), End: gg.P(int64(2500)), Value: "echo", AgentID: "__nd_bg__|main"})) }) It("should parse named TTML agents into main, voice, and group roles", func() { @@ -177,6 +177,44 @@ var _ = Describe("parseTTML", func() { Expect(list[0].Line[1].Cue[0].AgentID).To(Equal("v2")) Expect(list[0].Line[2].Cue[0].AgentID).To(Equal("v1000")) }) + + It("should avoid collisions between derived background agents and explicit TTML agent ids", func() { + content := []byte(` + + + + Lead + Existing Background Id + + + +
    +

    + Lead + Echo +

    +

    + Named +

    +
    + +
    `) + + list, err := parseTTML(content) + Expect(err).ToNot(HaveOccurred()) + Expect(list).To(HaveLen(1)) + Expect(list[0].Agents).To(Equal([]model.Agent{ + {ID: "lead", Role: "main", Name: "Lead"}, + {ID: "__nd_bg__|lead", Role: "bg", Name: "Lead"}, + {ID: "lead__bg", Role: "voice", Name: "Existing Background Id"}, + })) + Expect(list[0].Line).To(HaveLen(2)) + Expect(list[0].Line[0].Cue).To(HaveLen(2)) + Expect(list[0].Line[0].Cue[0].AgentID).To(Equal("lead")) + Expect(list[0].Line[0].Cue[1].AgentID).To(Equal("__nd_bg__|lead")) + Expect(list[0].Line[1].Cue).To(HaveLen(1)) + Expect(list[0].Line[1].Cue[0].AgentID).To(Equal("lead__bg")) + }) }) Describe("Ambiguous decimal timing", func() { diff --git a/server/subsonic/media_retrieval_test.go b/server/subsonic/media_retrieval_test.go index 5489492ce..e4f6a21d4 100644 --- a/server/subsonic/media_retrieval_test.go +++ b/server/subsonic/media_retrieval_test.go @@ -543,7 +543,7 @@ var _ = Describe("MediaRetrievalController", func() { lyricsJson, err := json.Marshal(model.LyricList{ { Lang: "eng", - Agents: []model.Agent{{ID: "lead", Role: "main"}, {ID: "lead__bg", Role: "bg"}}, + Agents: []model.Agent{{ID: "lead", Role: "main"}, {ID: "__nd_bg__|lead", Role: "bg"}}, Synced: true, Line: []model.Line{ { @@ -561,7 +561,7 @@ var _ = Describe("MediaRetrievalController", func() { Start: &tokenStartB, End: &tokenEndB, Value: "echo", - AgentID: "lead__bg", + AgentID: "__nd_bg__|lead", }, }, }, @@ -591,7 +591,7 @@ var _ = Describe("MediaRetrievalController", func() { Synced: true, Agents: []responses.Agent{ {ID: "lead", Role: "main"}, - {ID: "lead__bg", Role: "bg"}, + {ID: "__nd_bg__|lead", Role: "bg"}, }, Line: []responses.Line{ { @@ -619,7 +619,7 @@ var _ = Describe("MediaRetrievalController", func() { Start: &lineStart, End: &lineEnd, Value: "Hello echo", - AgentID: "lead__bg", + AgentID: "__nd_bg__|lead", Cue: []responses.LyricCue{ { Start: tokenStartB, From 2ffb63477cabc2b82295b2278487ecd6ed048fe9 Mon Sep 17 00:00:00 2001 
From: ranokay Date: Fri, 27 Mar 2026 07:55:08 +0200 Subject: [PATCH 09/14] chore(lyrics): polish rebased TTML branch --- core/lyrics/ttml.go | 8 ++++---- server/subsonic/media_retrieval.go | 2 ++ ui/src/audioplayer/lyrics.js | 7 ++++--- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/core/lyrics/ttml.go b/core/lyrics/ttml.go index adbc0c054..a02fa52d8 100644 --- a/core/lyrics/ttml.go +++ b/core/lyrics/ttml.go @@ -814,9 +814,9 @@ func (p *ttmlParser) updateTimingParams(attrs []xml.Attr) { } } - p.params.frameRate = max(frameRate, defaultTTMLFrameRate) - p.params.subFrameRate = max(subFrameRate, defaultTTMLSubFrameRate) - p.params.tickRate = max(tickRate, defaultTTMLTickRate) + p.params.frameRate = positiveOrDefault(frameRate, defaultTTMLFrameRate) + p.params.subFrameRate = positiveOrDefault(subFrameRate, defaultTTMLSubFrameRate) + p.params.tickRate = positiveOrDefault(tickRate, defaultTTMLTickRate) } func parseTTMLDurationExpression(expr string, params ttmlTimingParams) (int64, bool) { @@ -1102,7 +1102,7 @@ func hydrateLineTimingFromTokens(line model.Line) model.Line { return model.NormalizeLineTiming(line) } -func max(v float64, fallback float64) float64 { +func positiveOrDefault(v float64, fallback float64) float64 { if v <= 0 { return fallback } diff --git a/server/subsonic/media_retrieval.go b/server/subsonic/media_retrieval.go index de88849a2..16d0d2666 100644 --- a/server/subsonic/media_retrieval.go +++ b/server/subsonic/media_retrieval.go @@ -99,6 +99,8 @@ func (api *Router) GetLyrics(r *http.Request) (*responses.Subsonic, error) { lyricsResponse := responses.Lyrics{} response.Lyrics = &lyricsResponse opts := filter.SongsByArtistTitleWithLyricsFirst(artist, title) + // Keep the search exhaustive so an older duplicate can still supply the + // matching sidecar lyrics when the newest candidate only has embedded data. 
opts.Max = 0 mediaFiles, err := api.ds.MediaFile(r.Context()).GetAll(opts) diff --git a/ui/src/audioplayer/lyrics.js b/ui/src/audioplayer/lyrics.js index cd5248096..87b218d05 100644 --- a/ui/src/audioplayer/lyrics.js +++ b/ui/src/audioplayer/lyrics.js @@ -84,7 +84,9 @@ const normalizeToken = (token) => { const buildAgentLookup = (structuredLyric) => { const lookup = new Map() - const agents = Array.isArray(structuredLyric?.agents) ? structuredLyric.agents : [] + const agents = Array.isArray(structuredLyric?.agents) + ? structuredLyric.agents + : [] for (const agent of agents) { const id = typeof agent?.id === 'string' ? agent.id : '' if (!id || lookup.has(id)) { @@ -112,8 +114,7 @@ const normalizeCueLine = (cueLine, fallbackIndex, agentLookup) => { : fallbackIndex const agentId = typeof cueLine?.agentId === 'string' ? cueLine.agentId : '' const agent = agentId ? agentLookup.get(agentId) || null : null - const fallbackRole = - typeof cueLine?.role === 'string' ? cueLine.role : '' + const fallbackRole = typeof cueLine?.role === 'string' ? cueLine.role : '' const tokens = sortTokensByStart( Array.isArray(cueLine?.cue) ? 
cueLine.cue.map(normalizeToken).filter(Boolean) From 73d94962e01543b6a2340efdecea6e6cfe81320b Mon Sep 17 00:00:00 2001 From: ranokay Date: Fri, 27 Mar 2026 12:41:27 +0200 Subject: [PATCH 10/14] feat(lyrics): refine karaoke overlay timing and state --- core/lyrics/lyrics_test.go | 4 +- core/lyrics/sources_test.go | 15 +- model/lyrics.go | 12 - model/lyrics_test.go | 16 +- ui/src/audioplayer/KaraokeLyricsOverlay.jsx | 535 ++++++++++++++---- .../audioplayer/KaraokeLyricsOverlay.test.jsx | 344 +++++++++++ ui/src/audioplayer/Player.jsx | 51 +- .../audioplayer/Player.lyricsState.test.jsx | 77 +++ ui/src/audioplayer/lyrics.js | 90 +-- ui/src/audioplayer/lyrics.test.js | 134 ++++- ui/src/audioplayer/lyricsOverlayState.js | 27 + 11 files changed, 1070 insertions(+), 235 deletions(-) create mode 100644 ui/src/audioplayer/KaraokeLyricsOverlay.test.jsx create mode 100644 ui/src/audioplayer/Player.lyricsState.test.jsx create mode 100644 ui/src/audioplayer/lyricsOverlayState.js diff --git a/core/lyrics/lyrics_test.go b/core/lyrics/lyrics_test.go index 58e8ba82b..917c530ac 100644 --- a/core/lyrics/lyrics_test.go +++ b/core/lyrics/lyrics_test.go @@ -52,15 +52,17 @@ var _ = Describe("sources", func() { Line: []model.Line{ { Start: gg.P(int64(1000)), - End: gg.P(int64(1500)), + End: gg.P(int64(3000)), Value: "Lead words", Cue: []model.Cue{ { Start: gg.P(int64(1000)), + End: gg.P(int64(1500)), Value: "Lead ", }, { Start: gg.P(int64(1500)), + End: gg.P(int64(3000)), Value: "words", }, }, diff --git a/core/lyrics/sources_test.go b/core/lyrics/sources_test.go index a110390d8..a86c84cd0 100644 --- a/core/lyrics/sources_test.go +++ b/core/lyrics/sources_test.go @@ -102,22 +102,26 @@ var _ = Describe("sources", func() { // Line 1: has inline markers → Cue array populated Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(1000)))) + Expect(lyrics[0].Line[0].End).To(Equal(gg.P(int64(3000)))) Expect(lyrics[0].Line[0].Value).To(Equal("Some lyrics here")) 
Expect(lyrics[0].Line[0].Cue).To(HaveLen(3)) Expect(*lyrics[0].Line[0].Cue[0].Start).To(Equal(int64(1000))) Expect(lyrics[0].Line[0].Cue[0].Value).To(Equal("Some ")) - Expect(lyrics[0].Line[0].Cue[0].End).To(BeNil()) + Expect(lyrics[0].Line[0].Cue[0].End).To(Equal(gg.P(int64(1500)))) Expect(*lyrics[0].Line[0].Cue[1].Start).To(Equal(int64(1500))) Expect(lyrics[0].Line[0].Cue[1].Value).To(Equal("lyrics ")) - Expect(lyrics[0].Line[0].Cue[1].End).To(BeNil()) + Expect(lyrics[0].Line[0].Cue[1].End).To(Equal(gg.P(int64(2000)))) Expect(*lyrics[0].Line[0].Cue[2].Start).To(Equal(int64(2000))) Expect(lyrics[0].Line[0].Cue[2].Value).To(Equal("here")) - Expect(lyrics[0].Line[0].Cue[2].End).To(BeNil()) + Expect(lyrics[0].Line[0].Cue[2].End).To(Equal(gg.P(int64(3000)))) // Line 2: has inline markers Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(3000)))) + Expect(lyrics[0].Line[1].End).To(Equal(gg.P(int64(5000)))) Expect(lyrics[0].Line[1].Value).To(Equal("More words")) Expect(lyrics[0].Line[1].Cue).To(HaveLen(2)) + Expect(lyrics[0].Line[1].Cue[0].End).To(Equal(gg.P(int64(3500)))) + Expect(lyrics[0].Line[1].Cue[1].End).To(Equal(gg.P(int64(5000)))) // Line 3: plain line, no cues Expect(lyrics[0].Line[2].Start).To(Equal(gg.P(int64(5000)))) @@ -138,14 +142,15 @@ var _ = Describe("sources", func() { Expect(lyrics[0].Line).To(HaveLen(2)) Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(1000)))) + Expect(lyrics[0].Line[0].End).To(Equal(gg.P(int64(3000)))) Expect(lyrics[0].Line[0].Value).To(Equal("Lead words")) Expect(lyrics[0].Line[0].Cue).To(HaveLen(2)) Expect(*lyrics[0].Line[0].Cue[0].Start).To(Equal(int64(1000))) Expect(lyrics[0].Line[0].Cue[0].Value).To(Equal("Lead ")) - Expect(lyrics[0].Line[0].Cue[0].End).To(BeNil()) + Expect(lyrics[0].Line[0].Cue[0].End).To(Equal(gg.P(int64(1500)))) Expect(*lyrics[0].Line[0].Cue[1].Start).To(Equal(int64(1500))) Expect(lyrics[0].Line[0].Cue[1].Value).To(Equal("words")) - Expect(lyrics[0].Line[0].Cue[1].End).To(BeNil()) + 
Expect(lyrics[0].Line[0].Cue[1].End).To(Equal(gg.P(int64(3000)))) Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(3000)))) Expect(lyrics[0].Line[1].Value).To(Equal("Fallback line")) diff --git a/model/lyrics.go b/model/lyrics.go index 725c3aa94..ec0df9f34 100644 --- a/model/lyrics.go +++ b/model/lyrics.go @@ -417,18 +417,6 @@ func normalizeCueLine(line Line, fallbackEnd *int64) Line { return line } - hasAnyEnd := false - for i := range line.Cue { - if line.Cue[i].End != nil { - hasAnyEnd = true - break - } - } - if !hasAnyEnd { - line.Cue = clearCueEnds(line.Cue) - return NormalizeLineTiming(line) - } - for i := range line.Cue { if line.Cue[i].End != nil { continue diff --git a/model/lyrics_test.go b/model/lyrics_test.go index 9aad7d968..6f189f024 100644 --- a/model/lyrics_test.go +++ b/model/lyrics_test.go @@ -127,20 +127,24 @@ var _ = Describe("ToLyrics", func() { line0 := lyrics.Line[0] Expect(line0.Start).To(Equal(&t1000)) + Expect(line0.End).To(Equal(&t3000)) Expect(line0.Value).To(Equal("Some lyrics here")) Expect(line0.Cue).To(Equal([]Cue{ - {Start: &t1000, Value: "Some "}, - {Start: &t1500, Value: "lyrics "}, - {Start: &t2000, Value: "here"}, + {Start: &t1000, End: &t1500, Value: "Some "}, + {Start: &t1500, End: &t2000, Value: "lyrics "}, + {Start: &t2000, End: &t3000, Value: "here"}, })) line1 := lyrics.Line[1] Expect(line1.Start).To(Equal(&t3000)) + Expect(line1.End).To(Equal(&t3500)) Expect(line1.Value).To(Equal("More words")) Expect(line1.Cue).To(Equal([]Cue{ {Start: &t3000, Value: "More "}, {Start: &t3500, Value: "words"}, })) + + Expect(line1.Cue[1].End).To(BeNil()) }) It("should ignore Enhanced LRC markers and return plain lines when no markers present", func() { @@ -159,12 +163,14 @@ var _ = Describe("ToLyrics", func() { Expect(lyrics.Line).To(HaveLen(3)) t1000, t1500, t5000, t5500 := int64(1000), int64(1500), int64(5000), int64(5500) + t3000 := int64(3000) Expect(lyrics.Line[0].Cue).To(Equal([]Cue{ - {Start: &t1000, Value: "Some "}, - {Start: 
&t1500, Value: "lyrics"}, + {Start: &t1000, End: &t1500, Value: "Some "}, + {Start: &t1500, End: &t3000, Value: "lyrics"}, })) Expect(lyrics.Line[0].Value).To(Equal("Some lyrics")) + Expect(lyrics.Line[0].End).To(Equal(&t3000)) Expect(lyrics.Line[1].Cue).To(BeNil()) Expect(lyrics.Line[1].Value).To(Equal("Plain line")) diff --git a/ui/src/audioplayer/KaraokeLyricsOverlay.jsx b/ui/src/audioplayer/KaraokeLyricsOverlay.jsx index a44e50bf6..cd1484e41 100644 --- a/ui/src/audioplayer/KaraokeLyricsOverlay.jsx +++ b/ui/src/audioplayer/KaraokeLyricsOverlay.jsx @@ -3,8 +3,10 @@ import IconButton from '@material-ui/core/IconButton' import Popover from '@material-ui/core/Popover' import Slider from '@material-ui/core/Slider' import { makeStyles } from '@material-ui/core/styles' +import Tooltip from '@material-ui/core/Tooltip' import Typography from '@material-ui/core/Typography' import CloseIcon from '@material-ui/icons/Close' +import RestoreIcon from '@material-ui/icons/Restore' import TuneIcon from '@material-ui/icons/Tune' import clsx from 'clsx' import React, { @@ -16,8 +18,11 @@ import React, { useState, } from 'react' import { + buildHighlightedAuxLine, + buildHighlightedMainLine, buildKaraokeLines, getActiveKaraokeState, + hasUsableKaraokeTiming, hasStructuredLyricContent, resolveKaraokeTokenWindow, resolveLayerLineForMain, @@ -36,6 +41,12 @@ const KARAOKE_MAX_HEIGHT_RATIO = 0.72 const KARAOKE_MAX_HEIGHT_PX = 760 const KARAOKE_CENTER_SPACER_RATIO = 0.5 const KARAOKE_CENTER_SPACER_MIN_PX = 132 +const KARAOKE_DEFAULT_LINE_HEIGHT = 1.3 +const KARAOKE_MIN_LINE_HEIGHT = 1 +const KARAOKE_MAX_LINE_HEIGHT = 2.2 +const KARAOKE_LINE_HEIGHT_STEP = 0.02 +const KARAOKE_GROUP_SPACING_BASE_PX = 14 +const KARAOKE_AUX_LINE_HEIGHT = 1.2 const TOKEN_DONE_ALPHA = 1 const TOKEN_FUTURE_ALPHA = 0.34 @@ -55,33 +66,65 @@ const COLOR_PRESETS = [ ] const DEFAULT_LYRICS_SETTINGS = { - tr: { fontSize: 14, colorKey: 'blue' }, - main: { fontSize: 24, colorKey: 'white' }, - pr: { fontSize: 14, 
colorKey: 'green' }, + lineHeight: KARAOKE_DEFAULT_LINE_HEIGHT, + overlayHeight: KARAOKE_DEFAULT_HEIGHT_PX, + tr: { fontSize: 18, colorKey: 'blue' }, + main: { fontSize: 30, colorKey: 'white' }, + pr: { fontSize: 18, colorKey: 'green' }, } const SETTINGS_STORAGE_KEY = 'karaoke-lyrics-settings' +const createDefaultLyricsSettings = () => ({ + lineHeight: KARAOKE_DEFAULT_LINE_HEIGHT, + overlayHeight: KARAOKE_DEFAULT_HEIGHT_PX, + tr: { ...DEFAULT_LYRICS_SETTINGS.tr }, + main: { ...DEFAULT_LYRICS_SETTINGS.main }, + pr: { ...DEFAULT_LYRICS_SETTINGS.pr }, +}) + +const clampLineHeight = (value) => { + const numeric = Number(value) + if (!Number.isFinite(numeric)) { + return KARAOKE_DEFAULT_LINE_HEIGHT + } + return clamp(numeric, KARAOKE_MIN_LINE_HEIGHT, KARAOKE_MAX_LINE_HEIGHT) +} + +const clampOverlayHeightPreference = (value) => { + const numeric = Number(value) + if (!Number.isFinite(numeric)) { + return KARAOKE_DEFAULT_HEIGHT_PX + } + return clamp(numeric, KARAOKE_MIN_HEIGHT_PX, KARAOKE_MAX_HEIGHT_PX) +} + +const normalizeLyricsSettings = (settings) => ({ + lineHeight: clampLineHeight(settings?.lineHeight), + overlayHeight: clampOverlayHeightPreference(settings?.overlayHeight), + tr: { ...DEFAULT_LYRICS_SETTINGS.tr, ...settings?.tr }, + main: { ...DEFAULT_LYRICS_SETTINGS.main, ...settings?.main }, + pr: { ...DEFAULT_LYRICS_SETTINGS.pr, ...settings?.pr }, +}) + const loadLyricsSettings = () => { try { const raw = localStorage.getItem(SETTINGS_STORAGE_KEY) if (raw) { - const parsed = JSON.parse(raw) - return { - tr: { ...DEFAULT_LYRICS_SETTINGS.tr, ...parsed.tr }, - main: { ...DEFAULT_LYRICS_SETTINGS.main, ...parsed.main }, - pr: { ...DEFAULT_LYRICS_SETTINGS.pr, ...parsed.pr }, - } + return normalizeLyricsSettings(JSON.parse(raw)) } } catch { /* ignore */ } - return { ...DEFAULT_LYRICS_SETTINGS } + return normalizeLyricsSettings() } const saveLyricsSettings = (settings) => { try { - localStorage.setItem(SETTINGS_STORAGE_KEY, JSON.stringify(settings)) + 
localStorage.setItem( + SETTINGS_STORAGE_KEY, + JSON.stringify(normalizeLyricsSettings(settings)), + ) } catch { /* ignore */ } @@ -97,7 +140,7 @@ const useStyles = makeStyles((theme) => ({ bottom: 100, transform: 'translateX(-50%)', zIndex: 1400, - width: 'min(900px, calc(100vw - 32px))', + width: 'min(1000px, calc(100vw - 32px))', minHeight: KARAOKE_MIN_HEIGHT_PX, background: 'rgba(6, 8, 12, 0.9)', borderRadius: 12, @@ -149,13 +192,39 @@ const useStyles = makeStyles((theme) => ({ gap: theme.spacing(1), minWidth: 0, }, - language: { - fontSize: 11, - letterSpacing: '0.08em', - opacity: 0.72, - textTransform: 'uppercase', + languageBadges: { + display: 'flex', + alignItems: 'center', + gap: theme.spacing(0.5), + flexWrap: 'wrap', + minWidth: 0, + }, + languageBadge: { + display: 'inline-flex', + alignItems: 'center', + gap: theme.spacing(0.35), + padding: theme.spacing(0.2, 0.7), + borderRadius: 999, + border: '1px solid rgba(148, 163, 184, 0.28)', + background: 'rgba(15, 23, 42, 0.42)', + color: 'rgba(226, 232, 240, 0.8)', + fontSize: 10, + letterSpacing: '0.04em', whiteSpace: 'nowrap', }, + languageBadgeActive: { + borderColor: 'rgba(148, 163, 184, 0.46)', + background: 'rgba(30, 41, 59, 0.56)', + color: 'rgba(248, 250, 252, 0.94)', + }, + languageBadgeLabel: { + fontWeight: 700, + textTransform: 'uppercase', + opacity: 0.78, + }, + languageBadgeValue: { + opacity: 0.9, + }, layerControls: { display: 'flex', alignItems: 'center', @@ -186,21 +255,31 @@ const useStyles = makeStyles((theme) => ({ closeButton: { color: 'rgba(255, 255, 255, 0.72)', }, + lineGroup: { + display: 'flex', + flexDirection: 'column', + alignItems: 'center', + gap: theme.spacing(0.35), + }, inlineTr: { - margin: '0 0 2px 0', + margin: 0, textAlign: 'center', fontWeight: 400, - lineHeight: 1.2, + lineHeight: KARAOKE_AUX_LINE_HEIGHT, letterSpacing: '0.01em', transition: `opacity ${KARAOKE_ANIMATION_MS}ms ease-in-out, font-size ${KARAOKE_ANIMATION_MS}ms ease-in-out`, }, inlinePr: { - margin: 
'2px 0 0 0', + margin: 0, textAlign: 'center', fontWeight: 400, - lineHeight: 1.2, + lineHeight: KARAOKE_AUX_LINE_HEIGHT, letterSpacing: '0.01em', transition: `opacity ${KARAOKE_ANIMATION_MS}ms ease-in-out, font-size ${KARAOKE_ANIMATION_MS}ms ease-in-out`, + padding: theme.spacing(0.15, 0.9), + borderRadius: 999, + background: 'rgba(255, 255, 255, 0.08)', + border: '1px solid rgba(255, 255, 255, 0.12)', }, body: { padding: theme.spacing(0.5, 2, 1.4, 2), @@ -252,15 +331,29 @@ const useStyles = makeStyles((theme) => ({ border: '1px solid rgba(255, 255, 255, 0.12)', borderRadius: 10, padding: theme.spacing(1.5, 2), - width: 260, + width: 278, backdropFilter: 'blur(12px)', }, + settingsHeader: { + display: 'flex', + alignItems: 'center', + justifyContent: 'space-between', + gap: theme.spacing(1), + marginBottom: theme.spacing(1.25), + }, settingsSection: { marginBottom: theme.spacing(1.2), '&:last-child': { marginBottom: 0, }, }, + settingsTitle: { + fontSize: 11, + fontWeight: 700, + letterSpacing: '0.08em', + textTransform: 'uppercase', + color: 'rgba(255, 255, 255, 0.78)', + }, settingsLabel: { fontSize: 10, fontWeight: 600, @@ -291,6 +384,21 @@ const useStyles = makeStyles((theme) => ({ minWidth: 22, textAlign: 'right', }, + settingsControlLabel: { + fontSize: 10, + letterSpacing: '0.06em', + textTransform: 'uppercase', + color: 'rgba(255, 255, 255, 0.45)', + minWidth: 72, + whiteSpace: 'nowrap', + }, + resetButton: { + color: 'rgba(255, 255, 255, 0.58)', + padding: 4, + '&:hover': { + color: 'rgba(255, 255, 255, 0.9)', + }, + }, colorDots: { display: 'flex', gap: 5, @@ -314,6 +422,9 @@ const useStyles = makeStyles((theme) => ({ const clamp = (v, min, max) => Math.max(min, Math.min(max, v)) const lerp = (from, to, t) => from + (to - from) * t +const formatLineHeight = (value) => clampLineHeight(value).toFixed(2) +const getLineGapPx = (lineHeight) => + `${Math.round(clampLineHeight(lineHeight) * KARAOKE_GROUP_SPACING_BASE_PX)}px` const normalizeForComparison = 
(text) => (text || '').replace(/[\s\p{P}]/gu, '').toLowerCase() @@ -326,6 +437,34 @@ const shouldShowAuxLine = (mainLine, auxLine) => { ) } +const buildLanguageBadges = ({ + mainLyric, + translationLyric, + pronunciationLyric, + showTranslation, + showPronunciation, +}) => + [ + { + key: 'main', + label: 'Main', + lang: mainLyric?.lang, + active: true, + }, + { + key: 'pr', + label: 'PR', + lang: pronunciationLyric?.lang, + active: showPronunciation, + }, + { + key: 'tr', + label: 'TR', + lang: translationLyric?.lang, + active: showTranslation, + }, + ].filter((badge) => badge.lang) + const SettingsSection = ({ label, layer, settings, onChange, classes }) => { const s = settings[layer] return ( @@ -363,7 +502,37 @@ const SettingsSection = ({ label, layer, settings, onChange, classes }) => { ) } -const LyricsSettingsPopover = ({ settings, onChange }) => { +const LineHeightSetting = ({ settings, onChange, classes }) => ( +
    +
    Spacing
    +
    +
    Line height
    + + onChange({ + ...settings, + lineHeight: clampLineHeight(Array.isArray(val) ? val[0] : val), + }) + } + /> + + {formatLineHeight(settings.lineHeight)} + +
    +
    +) + +const LyricsSettingsPopover = ({ settings, onChange, onReset }) => { const classes = useStyles() const [anchorEl, setAnchorEl] = useState(null) @@ -376,14 +545,19 @@ const LyricsSettingsPopover = ({ settings, onChange }) => { return ( <> - - - + + + + + + + { PaperProps={{ className: classes.settingsPanel }} style={{ zIndex: 1500 }} > +
    + Appearance + + + + + + + +
    + { classes={classes} /> { a.opacity === b.opacity && a.color === b.color && a.fontSize === b.fontSize && - a.fontWeight === b.fontWeight + a.fontWeight === b.fontWeight && + a.lineHeight === b.lineHeight ) } @@ -778,7 +974,6 @@ const KaraokeLyricsOverlay = ({ }) => { const classes = useStyles() const [playbackMs, setPlaybackMs] = useState(0) - const [overlayHeight, setOverlayHeight] = useState(KARAOKE_DEFAULT_HEIGHT_PX) const [maxHeightPx, setMaxHeightPx] = useState(getMaxHeightPx()) const [bodyViewportHeight, setBodyViewportHeight] = useState(0) const [isCompact, setIsCompact] = useState( @@ -787,8 +982,15 @@ const KaraokeLyricsOverlay = ({ const [lyricsSettings, setLyricsSettings] = useState(loadLyricsSettings) const handleSettingsChange = useCallback((next) => { - setLyricsSettings(next) - saveLyricsSettings(next) + const normalized = normalizeLyricsSettings(next) + setLyricsSettings(normalized) + saveLyricsSettings(normalized) + }, []) + + const handleResetAppearance = useCallback(() => { + const defaults = createDefaultLyricsSettings() + setLyricsSettings(defaults) + saveLyricsSettings(defaults) }, []) const bodyRef = useRef(null) @@ -803,15 +1005,17 @@ const KaraokeLyricsOverlay = ({ () => buildKaraokeLines(pronunciationLyric), [pronunciationLyric], ) + const overlayHeight = clamp( + lyricsSettings.overlayHeight, + KARAOKE_MIN_HEIGHT_PX, + maxHeightPx, + ) useEffect(() => { const onResize = () => { const nextMaxHeight = getMaxHeightPx() setIsCompact(window.innerWidth <= 810) setMaxHeightPx(nextMaxHeight) - setOverlayHeight((previous) => - clamp(previous, KARAOKE_MIN_HEIGHT_PX, nextMaxHeight), - ) } onResize() @@ -853,9 +1057,14 @@ const KaraokeLyricsOverlay = ({ const onMove = (moveEvent) => { const delta = startY - moveEvent.clientY - setOverlayHeight( - clamp(startHeight + delta, KARAOKE_MIN_HEIGHT_PX, maxHeightPx), - ) + handleSettingsChange({ + ...lyricsSettings, + overlayHeight: clamp( + startHeight + delta, + KARAOKE_MIN_HEIGHT_PX, + maxHeightPx, + 
), + }) } const onUp = () => { @@ -866,7 +1075,13 @@ const KaraokeLyricsOverlay = ({ window.addEventListener('mousemove', onMove) window.addEventListener('mouseup', onUp) }, - [isCompact, maxHeightPx, overlayHeight], + [ + handleSettingsChange, + isCompact, + lyricsSettings, + maxHeightPx, + overlayHeight, + ], ) useEffect(() => { @@ -967,13 +1182,29 @@ const KaraokeLyricsOverlay = ({ }, [audioInstance, visible]) const renderPlaybackMs = playbackMs + KARAOKE_RENDER_LEAD_MS - - const { lineIndex } = useMemo( - () => getActiveKaraokeState(mainLines, renderPlaybackMs), - [mainLines, renderPlaybackMs], + const hasTimedMainLines = useMemo( + () => hasUsableKaraokeTiming(mainLines), + [mainLines], ) - const activeIndex = lineIndex >= 0 ? lineIndex : 0 + const { lineIndex } = useMemo( + () => + hasTimedMainLines + ? getActiveKaraokeState(mainLines, renderPlaybackMs) + : { lineIndex: -1, tokenIndex: -1 }, + [hasTimedMainLines, mainLines, renderPlaybackMs], + ) + + const activeIndex = hasTimedMainLines && lineIndex >= 0 ? lineIndex : -1 + const lineHeight = lyricsSettings.lineHeight + const lineGap = getLineGapPx(lineHeight) + const languageBadges = buildLanguageBadges({ + mainLyric, + translationLyric, + pronunciationLyric, + showTranslation, + showPronunciation, + }) const trByMainIndex = useMemo(() => { if (!showTranslation || translationLines.length === 0) return {} @@ -1008,12 +1239,14 @@ const KaraokeLyricsOverlay = ({ ? 260 : Math.max(220, overlayHeight - 170) const centerSpacerPx = Math.max( - KARAOKE_CENTER_SPACER_MIN_PX, - Math.floor(estimatedViewportHeight * KARAOKE_CENTER_SPACER_RATIO), + hasTimedMainLines ? KARAOKE_CENTER_SPACER_MIN_PX : 0, + hasTimedMainLines + ? 
Math.floor(estimatedViewportHeight * KARAOKE_CENTER_SPACER_RATIO) + : 0, ) useEffect(() => { - if (!visible) { + if (!visible || !hasTimedMainLines) { return } @@ -1050,6 +1283,7 @@ const KaraokeLyricsOverlay = ({ return () => window.cancelAnimationFrame(rafId) }, [ centerSpacerPx, + hasTimedMainLines, hasPronunciationLine, hasTranslationLine, lineIndex, @@ -1066,10 +1300,19 @@ const KaraokeLyricsOverlay = ({ } const getMainLineStyle = (idx) => { + const [r, g, b] = parseColorRGB(getColorValue(lyricsSettings.main.colorKey)) + if (!hasTimedMainLines) { + return { + opacity: 1, + color: `rgba(${r}, ${g}, ${b}, 0.98)`, + fontSize: lyricsSettings.main.fontSize, + lineHeight, + } + } + const delta = idx - activeIndex const isActive = delta === 0 let opacity = isActive ? 1 : delta < 0 ? 0.6 : 0.72 - const [r, g, b] = parseColorRGB(getColorValue(lyricsSettings.main.colorKey)) const color = isActive ? `rgba(${r}, ${g}, ${b}, 0.98)` : delta < 0 @@ -1093,6 +1336,48 @@ const KaraokeLyricsOverlay = ({ opacity, color, fontSize, + lineHeight, + } + } + + const getAuxLineStyle = (idx, layerKey) => { + const [r, g, b] = parseColorRGB( + getColorValue(lyricsSettings[layerKey].colorKey), + ) + if (!hasTimedMainLines) { + return { + opacity: 0.94, + fontSize: lyricsSettings[layerKey].fontSize, + color: `rgba(${r}, ${g}, ${b}, 0.94)`, + lineHeight: KARAOKE_AUX_LINE_HEIGHT, + } + } + + const delta = idx - activeIndex + const isActive = delta === 0 + + let opacity = isActive ? 0.94 : delta < 0 ? 0.5 : 0.62 + const color = isActive + ? `rgba(${r}, ${g}, ${b}, 0.94)` + : delta < 0 + ? 
`rgba(${r}, ${g}, ${b}, 0.42)` + : `rgba(${r}, ${g}, ${b}, 0.56)` + + if (delta > 1) { + const level = clamp(delta, 1, 6) + opacity = Math.max(0.28, 0.64 - level * 0.08) + } + + if (delta < -1) { + const level = clamp(Math.abs(delta), 1, 6) + opacity = Math.max(0.22, 0.5 - level * 0.08) + } + + return { + opacity, + fontSize: lyricsSettings[layerKey].fontSize, + color, + lineHeight: KARAOKE_AUX_LINE_HEIGHT, } } @@ -1109,36 +1394,61 @@ const KaraokeLyricsOverlay = ({ data-testid="karaoke-lyrics-overlay" style={overlayStyle} > -
    +
    - - {mainLyric?.lang || 'xxx'} - +
    + {languageBadges.map((badge) => ( +
    + + {badge.label} + + {badge.lang} +
    + ))} +
    - - + + + + + + + + + +
    @@ -1146,6 +1456,7 @@ const KaraokeLyricsOverlay = ({
    -
    +
    {mainLines.map((line, idx) => { const trLine = trByMainIndex[idx] const prLine = prByMainIndex[idx] + const mainNextLineStart = mainLines[idx + 1]?.start ?? null + const highlightedMainLine = buildHighlightedMainLine( + line, + mainNextLineStart, + ) + const highlightedTrLine = buildHighlightedAuxLine( + line, + trLine, + mainNextLineStart, + ) + const highlightedPrLine = buildHighlightedAuxLine( + line, + prLine, + mainNextLineStart, + ) const showTr = shouldShowAuxLine(line, trLine) const showPr = shouldShowAuxLine(line, prLine) const lineStyle = getMainLineStyle(idx) - const auxOpacity = - lineStyle.opacity != null ? lineStyle.opacity * 0.85 : 1 - const trStyle = { - opacity: auxOpacity, - fontSize: lyricsSettings.tr.fontSize, - color: getColorValue(lyricsSettings.tr.colorKey), - } - const prStyle = { - opacity: auxOpacity, - fontSize: lyricsSettings.pr.fontSize, - color: getColorValue(lyricsSettings.pr.colorKey), - } + const trStyle = getAuxLineStyle(idx, 'tr') + const prStyle = getAuxLineStyle(idx, 'pr') return (
    { if (audioInstance && line.start != null) { @@ -1190,33 +1511,35 @@ const KaraokeLyricsOverlay = ({ } }} > - {showTr && ( - - )} {showPr && ( + )} + {showTr && ( + )}
    diff --git a/ui/src/audioplayer/KaraokeLyricsOverlay.test.jsx b/ui/src/audioplayer/KaraokeLyricsOverlay.test.jsx new file mode 100644 index 000000000..411116eae --- /dev/null +++ b/ui/src/audioplayer/KaraokeLyricsOverlay.test.jsx @@ -0,0 +1,344 @@ +import React from 'react' +import { + cleanup, + fireEvent, + render, + screen, + waitFor, +} from '@testing-library/react' +import KaraokeLyricsOverlay from './KaraokeLyricsOverlay' + +const DEFAULT_LINE_HEIGHT_TEXT = '1.30' +const NEXT_LINE_HEIGHT_TEXT = '1.32' + +const audioInstance = { + currentTime: 0, + paused: true, + seeking: false, + playbackRate: 1, +} + +const buildLyric = (kind, lang, value) => ({ + kind, + lang, + synced: true, + line: [{ start: 1000, value }], +}) + +const renderOverlay = (props = {}) => + render( + {}} + onTogglePronunciation={() => {}} + audioInstance={audioInstance} + onClose={() => {}} + {...props} + />, + ) + +describe(' behavior', () => { + beforeEach(() => { + localStorage.clear() + window.innerWidth = 1200 + window.innerHeight = 900 + vi.spyOn(window, 'requestAnimationFrame').mockImplementation(() => 1) + vi.spyOn(window, 'cancelAnimationFrame').mockImplementation(() => {}) + }) + + afterEach(() => { + vi.restoreAllMocks() + cleanup() + }) + + it('shows tooltips for translation, pronunciation, and appearance controls', async () => { + renderOverlay() + + fireEvent.mouseOver(screen.getByTestId('lyrics-toggle-translation')) + expect(await screen.findByText('Toggle translations')).toBeInTheDocument() + + fireEvent.mouseOver(screen.getByTestId('lyrics-toggle-pronunciation')) + expect(await screen.findByText('Toggle pronunciations')).toBeInTheDocument() + + fireEvent.mouseOver(screen.getByTestId('lyrics-settings-button')) + expect(await screen.findByText('Appearance')).toBeInTheDocument() + }) + + it('renders the appearance popup with Main label and default line height for older settings', async () => { + localStorage.setItem( + 'karaoke-lyrics-settings', + JSON.stringify({ + tr: { 
fontSize: 16, colorKey: 'blue' }, + main: { fontSize: 26, colorKey: 'white' }, + pr: { fontSize: 15, colorKey: 'green' }, + }), + ) + + renderOverlay() + + fireEvent.click(screen.getByTestId('lyrics-settings-button')) + + expect(await screen.findByText('Appearance')).toBeInTheDocument() + expect(screen.getByText('Main', { selector: 'div' })).toBeInTheDocument() + expect(screen.queryByText('Default')).not.toBeInTheDocument() + expect(screen.getByTestId('lyrics-reset-appearance')).toBeInTheDocument() + expect(screen.getByTestId('lyrics-line-height-value')).toHaveTextContent( + DEFAULT_LINE_HEIGHT_TEXT, + ) + }) + + it('renders the lyric group in main, pronunciation, translation order with layer badges', () => { + renderOverlay({ + showTranslation: true, + showPronunciation: true, + }) + + const mainLine = screen.getByText('こんにちは') + const pronunciationLine = screen.getByText('konnichiwa') + const translationLine = screen.getByText('Hello') + + expect( + mainLine.compareDocumentPosition(pronunciationLine) & + Node.DOCUMENT_POSITION_FOLLOWING, + ).toBeTruthy() + expect( + pronunciationLine.compareDocumentPosition(translationLine) & + Node.DOCUMENT_POSITION_FOLLOWING, + ).toBeTruthy() + + expect(screen.getByTestId('lyrics-language-badge-main')).toHaveTextContent( + 'Mainja', + ) + expect(screen.getByTestId('lyrics-language-badge-pr')).toHaveTextContent( + 'PRja-Latn', + ) + expect(screen.getByTestId('lyrics-language-badge-tr')).toHaveTextContent( + 'TRen', + ) + }) + + it('renders line-timed rows as whole-line spans without synthetic token splits', () => { + renderOverlay({ + mainLyric: { + kind: 'main', + lang: 'en', + synced: true, + line: [ + { start: 1000, end: 2400, value: 'Batter up, batter up, batter up' }, + ], + }, + translationLyric: { + kind: 'translation', + lang: 'ja', + synced: true, + line: [ + { + start: 1000, + end: 2400, + value: 'バッターアップ、バッターアップ、バッターアップ', + }, + ], + }, + pronunciationLyric: { + kind: 'pronunciation', + lang: 'ja-Latn', + synced: 
true, + line: [ + { + start: 1000, + end: 2400, + value: 'Battaa appu, battaa appu, battaa appu', + }, + ], + }, + showTranslation: true, + showPronunciation: true, + }) + + const mainLine = screen.getByText( + 'Batter up, batter up, batter up', + ).parentElement + const pronunciationLine = screen.getByText( + 'Battaa appu, battaa appu, battaa appu', + ).parentElement + const translationLine = screen.getByText( + 'バッターアップ、バッターアップ、バッターアップ', + ).parentElement + + expect(mainLine.querySelectorAll('span')).toHaveLength(1) + expect(pronunciationLine.querySelectorAll('span')).toHaveLength(1) + expect(translationLine.querySelectorAll('span')).toHaveLength(1) + }) + + it('highlights line-timed pronunciation and translation rows with the active main line', () => { + renderOverlay({ + mainLyric: { + kind: 'main', + lang: 'en', + synced: true, + line: [ + { start: 1000, end: 1800, value: 'Line one' }, + { start: 2500, end: 3300, value: 'Line two' }, + ], + }, + translationLyric: { + kind: 'translation', + lang: 'ja', + synced: true, + line: [ + { start: 1000, end: 1800, value: '一行目' }, + { start: 2500, end: 3300, value: '二行目' }, + ], + }, + pronunciationLyric: { + kind: 'pronunciation', + lang: 'ja-Latn', + synced: true, + line: [ + { start: 1000, end: 1800, value: 'ichigyoume' }, + { start: 2500, end: 3300, value: 'nigyoume' }, + ], + }, + showTranslation: true, + showPronunciation: true, + audioInstance: { + ...audioInstance, + currentTime: 1.2, + }, + }) + + const activePronunciation = screen.getByText('ichigyoume').parentElement + const inactivePronunciation = screen.getByText('nigyoume').parentElement + const activeTranslation = screen.getByText('一行目').parentElement + const inactiveTranslation = screen.getByText('二行目').parentElement + + expect(parseFloat(activePronunciation.style.opacity)).toBeGreaterThan( + parseFloat(inactivePronunciation.style.opacity), + ) + expect(parseFloat(activeTranslation.style.opacity)).toBeGreaterThan( + 
parseFloat(inactiveTranslation.style.opacity), + ) + }) + + it('renders untimed text lyrics in manual reading mode without a pinned active line', () => { + renderOverlay({ + mainLyric: { + kind: 'main', + lang: 'en', + synced: false, + line: [{ value: 'First plain line' }, { value: 'Second plain line' }], + }, + translationLyric: null, + pronunciationLyric: null, + showTranslation: false, + showPronunciation: false, + translationEnabled: false, + pronunciationEnabled: false, + }) + + const firstLine = screen.getByText('First plain line').parentElement + const secondLine = screen.getByText('Second plain line').parentElement + + expect(firstLine.style.opacity).toBe('1') + expect(secondLine.style.opacity).toBe('1') + expect(firstLine.style.color).toBe(secondLine.style.color) + }) + + it('persists line height changes, keeps aux line spacing fixed, and stores overlay height', async () => { + renderOverlay({ + mainLyric: buildLyric('main', 'en', 'Hello world'), + translationLyric: buildLyric('translation', 'es', 'Hola'), + pronunciationLyric: buildLyric('pronunciation', 'en-Latn', 'heh-loh'), + showTranslation: true, + showPronunciation: true, + translationEnabled: true, + pronunciationEnabled: true, + }) + + const overlay = screen.getByTestId('karaoke-lyrics-overlay') + const mainLine = screen.getByText('Hello world').parentElement + const pronunciationLine = screen.getByText('heh-loh').parentElement + expect(mainLine).toHaveStyle(`line-height: ${DEFAULT_LINE_HEIGHT_TEXT}`) + expect(pronunciationLine).toHaveStyle('line-height: 1.2') + + fireEvent.click(screen.getByTestId('lyrics-settings-button')) + + const slider = screen.getByRole('slider', { name: 'Line height' }) + slider.focus() + fireEvent.keyDown(slider, { key: 'ArrowRight' }) + + await waitFor(() => + expect(screen.getByTestId('lyrics-line-height-value')).toHaveTextContent( + NEXT_LINE_HEIGHT_TEXT, + ), + ) + + await waitFor(() => + expect(mainLine).toHaveStyle(`line-height: ${NEXT_LINE_HEIGHT_TEXT}`), + ) + 
expect(pronunciationLine).toHaveStyle('line-height: 1.2') + + fireEvent.mouseDown(screen.getByTestId('lyrics-resize-handle'), { + clientY: 400, + }) + fireEvent.mouseMove(window, { clientY: 360 }) + fireEvent.mouseUp(window) + + await waitFor(() => expect(overlay).toHaveStyle('height: 340px')) + + const stored = JSON.parse(localStorage.getItem('karaoke-lyrics-settings')) + expect(stored.lineHeight).toBeCloseTo(1.32, 2) + expect(stored.overlayHeight).toBe(340) + }) + + it('resets appearance back to the default spacing and overlay height', async () => { + localStorage.setItem( + 'karaoke-lyrics-settings', + JSON.stringify({ + lineHeight: 1.8, + overlayHeight: 420, + tr: { fontSize: 16, colorKey: 'yellow' }, + main: { fontSize: 28, colorKey: 'cyan' }, + pr: { fontSize: 15, colorKey: 'pink' }, + }), + ) + + renderOverlay({ + mainLyric: buildLyric('main', 'en', 'Hello world'), + translationLyric: null, + pronunciationLyric: null, + showPronunciation: false, + translationEnabled: false, + pronunciationEnabled: false, + }) + + const overlay = screen.getByTestId('karaoke-lyrics-overlay') + const mainLine = screen.getByText('Hello world').parentElement + expect(overlay).toHaveStyle('height: 420px') + expect(mainLine).toHaveStyle('line-height: 1.8') + + fireEvent.click(screen.getByTestId('lyrics-settings-button')) + fireEvent.click(screen.getByTestId('lyrics-reset-appearance')) + + await waitFor(() => + expect(screen.getByTestId('lyrics-line-height-value')).toHaveTextContent( + DEFAULT_LINE_HEIGHT_TEXT, + ), + ) + await waitFor(() => expect(overlay).toHaveStyle('height: 300px')) + await waitFor(() => + expect(mainLine).toHaveStyle(`line-height: ${DEFAULT_LINE_HEIGHT_TEXT}`), + ) + + const stored = JSON.parse(localStorage.getItem('karaoke-lyrics-settings')) + expect(stored.lineHeight).toBeCloseTo(1.3, 2) + expect(stored.overlayHeight).toBe(300) + }) +}) diff --git a/ui/src/audioplayer/Player.jsx b/ui/src/audioplayer/Player.jsx index b8b33b6d5..c6e73c916 100644 --- 
a/ui/src/audioplayer/Player.jsx +++ b/ui/src/audioplayer/Player.jsx @@ -40,6 +40,10 @@ import { selectLyricLayers, structuredLyricToLrc, } from './lyrics' +import { + resolveLyricsOverlayState, + togglePronunciationPreference, +} from './lyricsOverlayState' import KaraokeLyricsOverlay from './KaraokeLyricsOverlay' const emptyLyricLayers = { @@ -143,11 +147,12 @@ const Player = () => { const lyricCacheRef = useRef(new Map()) const lyricRequestIdRef = useRef(0) const playerRef = useRef(null) - const [karaokeVisible, setKaraokeVisible] = useState(false) + const [karaokeVisiblePreference, setKaraokeVisiblePreference] = + useState(false) const [selectedLyricLayers, setSelectedLyricLayers] = useState(emptyLyricLayers) - const [showTranslation, setShowTranslation] = useState(false) - const [showPronunciation, setShowPronunciation] = useState(false) + const [translationPreference, setTranslationPreference] = useState(false) + const [pronunciationPreference, setPronunciationPreference] = useState(null) const currentTrackId = playerState.current?.trackId const currentTrackIsRadio = playerState.current?.isRadio const selectedStructuredLyric = selectedLyricLayers.main @@ -158,6 +163,15 @@ const Player = () => { const hasPronunciationLyric = hasStructuredLyricContent( selectedLyricLayers.pronunciation, ) + const { karaokeVisible, showTranslation, showPronunciation } = + resolveLyricsOverlayState({ + karaokeVisiblePreference, + translationPreference, + pronunciationPreference, + hasKaraokeLyric, + hasTranslationLyric, + hasPronunciationLyric, + }) const applyLyricToRuntimePlayer = useCallback((trackId, lyric) => { if (!trackId) { @@ -255,9 +269,6 @@ const Player = () => { useEffect(() => { if (!currentTrackId || currentTrackIsRadio) { setSelectedLyricLayers(emptyLyricLayers) - setShowTranslation(false) - setShowPronunciation(false) - setKaraokeVisible(false) return } @@ -273,8 +284,6 @@ const Player = () => { } } setSelectedLyricLayers(layers) - setShowTranslation(false) - 
setShowPronunciation(hasStructuredLyricContent(layers.pronunciation)) }, [currentTrackId, currentTrackIsRadio]) useEffect(() => { @@ -297,10 +306,6 @@ const Player = () => { : normalizeLyricLayers({ main: cached?.structuredLyric }) setSelectedLyricLayers(cachedLayers) - setShowTranslation(false) - setShowPronunciation( - hasStructuredLyricContent(cachedLayers.pronunciation), - ) if (cachedLyric) { dispatch(updateQueueLyric(currentTrackId, cachedLyric)) applyLyricToRuntimePlayer(currentTrackId, cachedLyric) @@ -327,8 +332,6 @@ const Player = () => { layers, }) setSelectedLyricLayers(layers) - setShowTranslation(false) - setShowPronunciation(hasStructuredLyricContent(layers.pronunciation)) if (lyric !== '') { dispatch(updateQueueLyric(currentTrackId, lyric)) @@ -340,19 +343,11 @@ const Player = () => { return } setSelectedLyricLayers(emptyLyricLayers) - setShowTranslation(false) - setShowPronunciation(false) // Do not cache network/request failures as empty lyrics, so we can retry. lyricCacheRef.current.delete(currentTrackId) }) }, [dispatch, currentTrackId, currentTrackIsRadio, applyLyricToRuntimePlayer]) - useEffect(() => { - if (!hasKaraokeLyric && karaokeVisible) { - setKaraokeVisible(false) - } - }, [hasKaraokeLyric, karaokeVisible]) - const defaultOptions = useMemo( () => ({ theme: playerTheme, @@ -404,7 +399,9 @@ const Player = () => { setKaraokeVisible((visible) => !visible)} + onToggleLyrics={() => + setKaraokeVisiblePreference((visible) => !visible) + } lyricsActive={karaokeVisible} lyricsDisabled={!hasKaraokeLyric} /> @@ -616,17 +613,17 @@ const Player = () => { translationEnabled={hasTranslationLyric} pronunciationEnabled={hasPronunciationLyric} onToggleTranslation={() => - setShowTranslation((previous) => + setTranslationPreference((previous) => hasTranslationLyric ? !previous : false, ) } onTogglePronunciation={() => - setShowPronunciation((previous) => - hasPronunciationLyric ? 
!previous : false, + setPronunciationPreference((previous) => + togglePronunciationPreference(previous, hasPronunciationLyric), ) } audioInstance={audioInstance} - onClose={() => setKaraokeVisible(false)} + onClose={() => setKaraokeVisiblePreference(false)} /> diff --git a/ui/src/audioplayer/Player.lyricsState.test.jsx b/ui/src/audioplayer/Player.lyricsState.test.jsx new file mode 100644 index 000000000..c47abea76 --- /dev/null +++ b/ui/src/audioplayer/Player.lyricsState.test.jsx @@ -0,0 +1,77 @@ +import { + resolveLyricsOverlayState, + togglePronunciationPreference, +} from './lyricsOverlayState' + +describe('Player lyrics state helpers', () => { + it('keeps the lyrics window preference across track changes in the session', () => { + const visibleOnCurrentTrack = resolveLyricsOverlayState({ + karaokeVisiblePreference: true, + translationPreference: false, + pronunciationPreference: null, + hasKaraokeLyric: true, + hasTranslationLyric: true, + hasPronunciationLyric: true, + }) + expect(visibleOnCurrentTrack.karaokeVisible).toBe(true) + + const hiddenForTrackWithoutLyrics = resolveLyricsOverlayState({ + karaokeVisiblePreference: true, + translationPreference: false, + pronunciationPreference: null, + hasKaraokeLyric: false, + hasTranslationLyric: false, + hasPronunciationLyric: false, + }) + expect(hiddenForTrackWithoutLyrics.karaokeVisible).toBe(false) + + const restoredOnNextLyricsTrack = resolveLyricsOverlayState({ + karaokeVisiblePreference: true, + translationPreference: false, + pronunciationPreference: null, + hasKaraokeLyric: true, + hasTranslationLyric: false, + hasPronunciationLyric: false, + }) + expect(restoredOnNextLyricsTrack.karaokeVisible).toBe(true) + }) + + it('restores translation and pronunciation preferences after tracks without those layers', () => { + const initialState = resolveLyricsOverlayState({ + karaokeVisiblePreference: false, + translationPreference: false, + pronunciationPreference: null, + hasKaraokeLyric: true, + 
hasTranslationLyric: true, + hasPronunciationLyric: true, + }) + expect(initialState.showTranslation).toBe(false) + expect(initialState.showPronunciation).toBe(true) + + const translationPreference = true + const pronunciationPreference = togglePronunciationPreference(null, true) + expect(pronunciationPreference).toBe(false) + + const hiddenOnTrackWithoutAuxLayers = resolveLyricsOverlayState({ + karaokeVisiblePreference: false, + translationPreference, + pronunciationPreference, + hasKaraokeLyric: true, + hasTranslationLyric: false, + hasPronunciationLyric: false, + }) + expect(hiddenOnTrackWithoutAuxLayers.showTranslation).toBe(false) + expect(hiddenOnTrackWithoutAuxLayers.showPronunciation).toBe(false) + + const restoredOnTrackWithAuxLayers = resolveLyricsOverlayState({ + karaokeVisiblePreference: false, + translationPreference, + pronunciationPreference, + hasKaraokeLyric: true, + hasTranslationLyric: true, + hasPronunciationLyric: true, + }) + expect(restoredOnTrackWithAuxLayers.showTranslation).toBe(true) + expect(restoredOnTrackWithAuxLayers.showPronunciation).toBe(false) + }) +}) diff --git a/ui/src/audioplayer/lyrics.js b/ui/src/audioplayer/lyrics.js index 87b218d05..e9cd16d5a 100644 --- a/ui/src/audioplayer/lyrics.js +++ b/ui/src/audioplayer/lyrics.js @@ -12,6 +12,9 @@ const padTime = (value) => { } const toTime = (value) => { + if (value == null || value === '') { + return null + } const numeric = Number(value) return Number.isFinite(numeric) ? numeric : null } @@ -179,64 +182,6 @@ const lineTimeWindow = (lines, index) => { return { start, end } } -const buildSyntheticWordTokens = (line, token) => { - const text = typeof line?.value === 'string' ? 
line.value : '' - if (!text.trim()) { - return null - } - - const chunks = text.match(/\S+\s*/g) || [] - if (chunks.length < 2) { - return null - } - - const normalizedLine = text.replace(/\s+/g, ' ').trim().toLowerCase() - const normalizedTokenValue = (token?.value || '') - .replace(/\s+/g, ' ') - .trim() - .toLowerCase() - if (!normalizedTokenValue || !normalizedLine) { - return null - } - - const compressedLine = normalizedLine.replace(/\s+/g, '') - const compressedToken = normalizedTokenValue.replace(/\s+/g, '') - const tokenLooksLikeWholeLine = - compressedToken === compressedLine || - compressedToken.length >= Math.floor(compressedLine.length * 0.8) - if (!tokenLooksLikeWholeLine) { - return null - } - - const tokenStart = toTime(token?.start) - const tokenEnd = toTime(token?.end) - const lineStart = toTime(line?.start) - const lineEnd = toTime(line?.end) - - const baseStart = tokenStart ?? lineStart - const baseEnd = tokenEnd ?? lineEnd - if ( - baseStart == null || - baseEnd == null || - !Number.isFinite(baseStart) || - !Number.isFinite(baseEnd) || - baseEnd <= baseStart - ) { - return null - } - - const duration = baseEnd - baseStart - return chunks.map((chunk, idx) => ({ - start: baseStart + (duration * idx) / chunks.length, - end: baseStart + (duration * (idx + 1)) / chunks.length, - value: chunk, - role: typeof token?.role === 'string' ? token.role : '', - agentId: typeof token?.agentId === 'string' ? token.agentId : '', - agentName: typeof token?.agentName === 'string' ? token.agentName : '', - agentRole: typeof token?.agentRole === 'string' ? 
token.agentRole : '', - })) -} - export const hasCueTiming = (structuredLyric) => Boolean( structuredLyric && @@ -449,19 +394,6 @@ export const buildKaraokeLines = (structuredLyric) => { } return a.index - b.index }) - .map((line) => { - const nextLine = { ...line } - if (nextLine.tokens.length === 1) { - const syntheticTokens = buildSyntheticWordTokens( - nextLine, - nextLine.tokens[0], - ) - if (syntheticTokens) { - nextLine.tokens = syntheticTokens - } - } - return nextLine - }) for (let i = 0; i < normalized.length; i += 1) { if (normalized[i].end == null) { @@ -628,6 +560,17 @@ export const getActiveKaraokeState = (lines, currentTimeMs) => { return { lineIndex, tokenIndex } } +export const hasUsableKaraokeTiming = (lines) => + Array.isArray(lines) && + lines.some( + (line) => + toTime(line?.start) != null || + (Array.isArray(line?.tokens) && + line.tokens.some( + (token) => toTime(token?.start) != null || toTime(token?.end) != null, + )), + ) + export const findLayerLineIndexForMain = (mainLines, layerLines, mainIndex) => { if ( !Array.isArray(mainLines) || @@ -692,3 +635,8 @@ export const resolveLayerLineForMain = (mainLines, layerLines, mainIndex) => { line: index >= 0 ? layerLines[index] : null, } } + +export const buildHighlightedMainLine = (line) => line + +export const buildHighlightedAuxLine = (_referenceLine, auxiliaryLine) => + auxiliaryLine ?? 
null diff --git a/ui/src/audioplayer/lyrics.test.js b/ui/src/audioplayer/lyrics.test.js index 3a5f83b2d..2fcf1df40 100644 --- a/ui/src/audioplayer/lyrics.test.js +++ b/ui/src/audioplayer/lyrics.test.js @@ -1,8 +1,11 @@ import { + buildHighlightedAuxLine, + buildHighlightedMainLine, buildKaraokeLines, findLayerLineIndexForMain, getActiveKaraokeState, getPreferredLyricLanguage, + hasUsableKaraokeTiming, hasStructuredLyricContent, pickStructuredLyric, resolveKaraokeTokenWindow, @@ -201,6 +204,110 @@ describe('lyrics helpers', () => { ) }) + it('keeps translation lines line-level when they do not have real cue timing', () => { + const mainLine = { + index: 0, + start: 1000, + end: 2200, + value: '불을 질러라', + tokens: [ + { start: 1000, end: 1300, value: '불을 ' }, + { start: 1300, end: 1650, value: '질' }, + { start: 1650, end: 2200, value: '러라' }, + ], + } + const translationLine = { + index: 0, + start: 1000, + end: 2200, + value: 'Set it on fire', + tokens: [], + } + + const highlighted = buildHighlightedAuxLine(mainLine, translationLine, 2600) + + expect(highlighted).toBe(translationLine) + expect(highlighted.tokens).toEqual([]) + }) + + it('keeps pronunciation lines line-level when they do not have real cue timing', () => { + const mainLine = { + index: 0, + start: 1000, + end: 2200, + value: 'You もっと強く 素早く 吹き飛ばせ', + tokens: [], + } + const pronunciationLine = { + index: 0, + start: 1000, + end: 2200, + value: 'You motto tsuyoku subayaku fukitobase', + tokens: [], + } + + const highlighted = buildHighlightedAuxLine( + mainLine, + pronunciationLine, + 2600, + ) + + expect(highlighted).toBe(pronunciationLine) + expect(highlighted.tokens).toEqual([]) + }) + + it('keeps main lines line-level when they do not have real cue timing', () => { + const line = { + index: 0, + start: 1000, + end: 2200, + value: 'Youもっと強く 素早く 吹き飛ばせ', + tokens: [], + } + + const highlighted = buildHighlightedMainLine(line, 2600) + + expect(highlighted).toBe(line) + 
expect(highlighted.tokens).toEqual([]) + }) + + it('keeps auxiliary lines line-level when end time is missing and they lack cues', () => { + const mainLine = { + index: 0, + start: 1000, + end: null, + value: 'Hello there', + tokens: [], + } + const translationLine = { + index: 0, + start: 1000, + end: null, + value: 'Bonjour toi', + tokens: [], + } + + const highlighted = buildHighlightedAuxLine(mainLine, translationLine, 2400) + + expect(highlighted).toBe(translationLine) + expect(highlighted.tokens).toEqual([]) + }) + + it('keeps main lines line-level when end time is missing and they lack cues', () => { + const line = { + index: 0, + start: 1000, + end: null, + value: 'One more time', + tokens: [], + } + + const highlighted = buildHighlightedMainLine(line, 2400) + + expect(highlighted).toBe(line) + expect(highlighted.tokens).toEqual([]) + }) + it('returns no layer match when the nearest line is too far in time', () => { const mainLines = [ { index: 0, start: 1000, end: 1800, value: 'Line A', tokens: [] }, @@ -353,7 +460,7 @@ describe('lyrics helpers', () => { ]) }) - it('splits a single full-line token into synthetic word tokens', () => { + it('keeps a single full-line token unchanged instead of expanding it synthetically', () => { const lines = buildKaraokeLines({ lang: 'ko-Latn', synced: true, @@ -371,17 +478,13 @@ describe('lyrics helpers', () => { }) expect(lines).toHaveLength(1) - expect(lines[0].tokens).toHaveLength(2) - expect(lines[0].tokens[0].value).toBe('Da-la-lun, ') - expect(lines[0].tokens[1].value).toBe('dun') + expect(lines[0].tokens).toHaveLength(1) + expect(lines[0].tokens[0].value).toBe('Da-la-lun, dun') const firstWindow = resolveKaraokeTokenWindow(lines[0], 0) - const secondWindow = resolveKaraokeTokenWindow(lines[0], 1) expect(firstWindow.start).toBeCloseTo(1000) - expect(firstWindow.end).toBeCloseTo(1500) - expect(secondWindow.start).toBeCloseTo(1500) - expect(secondWindow.end).toBeCloseTo(2000) + expect(firstWindow.end).toBeCloseTo(2000) 
}) it('detects active line and token for karaoke timing', () => { @@ -509,4 +612,19 @@ describe('lyrics helpers', () => { }), ).toBe(true) }) + + it('detects when built karaoke lines have no usable timing', () => { + expect( + hasUsableKaraokeTiming([ + { index: 0, value: 'First line', tokens: [] }, + { index: 1, value: 'Second line', tokens: [] }, + ]), + ).toBe(false) + + expect( + hasUsableKaraokeTiming([ + { index: 0, start: 1000, value: 'Timed line', tokens: [] }, + ]), + ).toBe(true) + }) }) diff --git a/ui/src/audioplayer/lyricsOverlayState.js b/ui/src/audioplayer/lyricsOverlayState.js new file mode 100644 index 000000000..e8ff0e0a8 --- /dev/null +++ b/ui/src/audioplayer/lyricsOverlayState.js @@ -0,0 +1,27 @@ +export const resolveLyricsOverlayState = ({ + karaokeVisiblePreference, + translationPreference, + pronunciationPreference, + hasKaraokeLyric, + hasTranslationLyric, + hasPronunciationLyric, +}) => ({ + karaokeVisible: karaokeVisiblePreference && hasKaraokeLyric, + showTranslation: translationPreference && hasTranslationLyric, + showPronunciation: + (pronunciationPreference == null + ? hasPronunciationLyric + : pronunciationPreference) && hasPronunciationLyric, +}) + +export const togglePronunciationPreference = ( + previousPreference, + hasPronunciationLyric, +) => { + if (!hasPronunciationLyric) { + return false + } + const currentPreference = + previousPreference == null ? 
hasPronunciationLyric : previousPreference + return !currentPreference +} From aeae6d221706e392272b8c2697b886144e81b48e Mon Sep 17 00:00:00 2001 From: ranokay Date: Tue, 14 Apr 2026 02:19:09 +0300 Subject: [PATCH 11/14] feat(lyrics): require cue byte offsets --- core/lyrics/lyrics_test.go | 16 +- core/lyrics/sources_test.go | 14 ++ core/lyrics/ttml.go | 226 +++++++++++++++--- core/lyrics/ttml_test.go | 20 +- model/lyrics.go | 86 ++++--- model/lyrics_test.go | 34 ++- server/subsonic/helpers.go | 6 +- server/subsonic/media_retrieval_test.go | 164 +++++++++++-- server/subsonic/responses/responses.go | 10 +- ui/src/audioplayer/KaraokeLyricsOverlay.jsx | 67 ++++++ .../audioplayer/KaraokeLyricsOverlay.test.jsx | 68 ++++++ ui/src/audioplayer/lyrics.js | 80 +++++++ ui/src/audioplayer/lyrics.test.js | 56 +++++ ui/src/subsonic/index.test.js | 3 +- 14 files changed, 730 insertions(+), 120 deletions(-) diff --git a/core/lyrics/lyrics_test.go b/core/lyrics/lyrics_test.go index 917c530ac..822e975ce 100644 --- a/core/lyrics/lyrics_test.go +++ b/core/lyrics/lyrics_test.go @@ -56,14 +56,18 @@ var _ = Describe("sources", func() { Value: "Lead words", Cue: []model.Cue{ { - Start: gg.P(int64(1000)), - End: gg.P(int64(1500)), - Value: "Lead ", + Start: gg.P(int64(1000)), + End: gg.P(int64(1500)), + Value: "Lead ", + ByteStart: 0, + ByteEnd: 4, }, { - Start: gg.P(int64(1500)), - End: gg.P(int64(3000)), - Value: "words", + Start: gg.P(int64(1500)), + End: gg.P(int64(3000)), + Value: "words", + ByteStart: 5, + ByteEnd: 9, }, }, }, diff --git a/core/lyrics/sources_test.go b/core/lyrics/sources_test.go index a86c84cd0..5ba03336e 100644 --- a/core/lyrics/sources_test.go +++ b/core/lyrics/sources_test.go @@ -108,12 +108,18 @@ var _ = Describe("sources", func() { Expect(*lyrics[0].Line[0].Cue[0].Start).To(Equal(int64(1000))) Expect(lyrics[0].Line[0].Cue[0].Value).To(Equal("Some ")) Expect(lyrics[0].Line[0].Cue[0].End).To(Equal(gg.P(int64(1500)))) + 
Expect(lyrics[0].Line[0].Cue[0].ByteStart).To(Equal(0)) + Expect(lyrics[0].Line[0].Cue[0].ByteEnd).To(Equal(4)) Expect(*lyrics[0].Line[0].Cue[1].Start).To(Equal(int64(1500))) Expect(lyrics[0].Line[0].Cue[1].Value).To(Equal("lyrics ")) Expect(lyrics[0].Line[0].Cue[1].End).To(Equal(gg.P(int64(2000)))) + Expect(lyrics[0].Line[0].Cue[1].ByteStart).To(Equal(5)) + Expect(lyrics[0].Line[0].Cue[1].ByteEnd).To(Equal(11)) Expect(*lyrics[0].Line[0].Cue[2].Start).To(Equal(int64(2000))) Expect(lyrics[0].Line[0].Cue[2].Value).To(Equal("here")) Expect(lyrics[0].Line[0].Cue[2].End).To(Equal(gg.P(int64(3000)))) + Expect(lyrics[0].Line[0].Cue[2].ByteStart).To(Equal(12)) + Expect(lyrics[0].Line[0].Cue[2].ByteEnd).To(Equal(15)) // Line 2: has inline markers Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(3000)))) @@ -122,6 +128,10 @@ var _ = Describe("sources", func() { Expect(lyrics[0].Line[1].Cue).To(HaveLen(2)) Expect(lyrics[0].Line[1].Cue[0].End).To(Equal(gg.P(int64(3500)))) Expect(lyrics[0].Line[1].Cue[1].End).To(Equal(gg.P(int64(5000)))) + Expect(lyrics[0].Line[1].Cue[0].ByteStart).To(Equal(0)) + Expect(lyrics[0].Line[1].Cue[0].ByteEnd).To(Equal(4)) + Expect(lyrics[0].Line[1].Cue[1].ByteStart).To(Equal(5)) + Expect(lyrics[0].Line[1].Cue[1].ByteEnd).To(Equal(9)) // Line 3: plain line, no cues Expect(lyrics[0].Line[2].Start).To(Equal(gg.P(int64(5000)))) @@ -148,9 +158,13 @@ var _ = Describe("sources", func() { Expect(*lyrics[0].Line[0].Cue[0].Start).To(Equal(int64(1000))) Expect(lyrics[0].Line[0].Cue[0].Value).To(Equal("Lead ")) Expect(lyrics[0].Line[0].Cue[0].End).To(Equal(gg.P(int64(1500)))) + Expect(lyrics[0].Line[0].Cue[0].ByteStart).To(Equal(0)) + Expect(lyrics[0].Line[0].Cue[0].ByteEnd).To(Equal(4)) Expect(*lyrics[0].Line[0].Cue[1].Start).To(Equal(int64(1500))) Expect(lyrics[0].Line[0].Cue[1].Value).To(Equal("words")) Expect(lyrics[0].Line[0].Cue[1].End).To(Equal(gg.P(int64(3000)))) + Expect(lyrics[0].Line[0].Cue[1].ByteStart).To(Equal(5)) + 
Expect(lyrics[0].Line[0].Cue[1].ByteEnd).To(Equal(9)) Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(3000)))) Expect(lyrics[0].Line[1].Value).To(Equal("Fallback line")) diff --git a/core/lyrics/ttml.go b/core/lyrics/ttml.go index a02fa52d8..6e4ce9da3 100644 --- a/core/lyrics/ttml.go +++ b/core/lyrics/ttml.go @@ -10,6 +10,7 @@ import ( "sort" "strconv" "strings" + "unicode" "github.com/navidrome/navidrome/log" "github.com/navidrome/navidrome/model" @@ -78,6 +79,11 @@ type ttmlDefinedAgent struct { Name string } +type ttmlPiece struct { + raw string + cue *model.Cue +} + type ttmlParser struct { decoder *xml.Decoder params ttmlTimingParams @@ -294,7 +300,7 @@ func (p *ttmlParser) parseMetadataText(start xml.StartElement, parent ttmlTiming forKey, hasFor := attrValue(start.Attr, "for") forKey = strings.TrimSpace(forKey) - value, tokens, err := p.parseInlineElement(start, parent) + pieces, err := p.parseInlineElement(start, parent) if err != nil { return ttmlMetadataEntry{}, false, err } @@ -307,7 +313,8 @@ func (p *ttmlParser) parseMetadataText(start xml.StartElement, parent ttmlTiming return ttmlMetadataEntry{}, false, nil } - line := model.Line{Value: sanitizeTTMLText(value)} + value, tokens := buildTTMLLineFromPieces(pieces) + line := model.Line{Value: value} if ctx.hasBegin { startMs := ctx.begin line.Start = &startMs @@ -329,8 +336,7 @@ func (p *ttmlParser) parseMetadataText(start xml.StartElement, parent ttmlTiming } func (p *ttmlParser) parseParagraph(parent ttmlTimingContext) (string, []model.Cue, error) { - var text strings.Builder - var tokens []model.Cue + var pieces []ttmlPiece for { token, err := p.decoder.Token() @@ -340,26 +346,26 @@ func (p *ttmlParser) parseParagraph(parent ttmlTimingContext) (string, []model.C switch t := token.(type) { case xml.StartElement: - value, inlineTokens, err := p.parseInlineElement(t, parent) + inlinePieces, err := p.parseInlineElement(t, parent) if err != nil { return "", nil, err } - text.WriteString(value) - tokens 
= append(tokens, inlineTokens...) + pieces = append(pieces, inlinePieces...) case xml.EndElement: if strings.EqualFold(t.Name.Local, "p") { - return sanitizeTTMLText(text.String()), tokens, nil + value, tokens := buildTTMLLineFromPieces(pieces) + return value, tokens, nil } case xml.CharData: - text.WriteString(string(t)) + pieces = append(pieces, ttmlPiece{raw: string(t)}) } } } -func (p *ttmlParser) parseInlineElement(start xml.StartElement, parent ttmlTimingContext) (string, []model.Cue, error) { +func (p *ttmlParser) parseInlineElement(start xml.StartElement, parent ttmlTimingContext) ([]ttmlPiece, error) { local := strings.ToLower(start.Name.Local) if local == "br" { - return "\n", nil, nil + return []ttmlPiece{{raw: "\n"}}, nil } ctx := p.childContext(start.Attr, parent) @@ -368,53 +374,203 @@ func (p *ttmlParser) parseInlineElement(start xml.StartElement, parent ttmlTimin _, hasDur := attrValue(start.Attr, "dur") hasOwnTiming := hasBegin || hasEnd || hasDur - var text strings.Builder - var tokens []model.Cue + var pieces []ttmlPiece for { token, err := p.decoder.Token() if err != nil { - return "", nil, err + return nil, err } switch t := token.(type) { case xml.StartElement: - value, inlineTokens, err := p.parseInlineElement(t, ctx) + inlinePieces, err := p.parseInlineElement(t, ctx) if err != nil { - return "", nil, err + return nil, err } - text.WriteString(value) - tokens = append(tokens, inlineTokens...) + pieces = append(pieces, inlinePieces...) 
case xml.EndElement: if !strings.EqualFold(t.Name.Local, start.Name.Local) { continue } - value := text.String() - tokenText := sanitizeTTMLText(value) - if local == "span" && hasOwnTiming && !ctx.invalid && tokenText != "" && len(tokens) == 0 { - parsedToken := model.Cue{ - Value: tokenText, - AgentID: p.resolveCueAgentID(ctx), + if local == "span" && hasOwnTiming && !ctx.invalid && !ttmlPiecesContainCue(pieces) { + rawValue := concatTTMLPieceRaw(pieces) + tokenText := sanitizeTTMLText(rawValue) + if tokenText != "" { + parsedToken := model.Cue{ + AgentID: p.resolveCueAgentID(ctx), + } + if ctx.hasBegin { + startMs := ctx.begin + parsedToken.Start = &startMs + } + if ctx.hasEnd { + endMs := ctx.end + parsedToken.End = &endMs + } + + return []ttmlPiece{{ + raw: rawValue, + cue: &parsedToken, + }}, nil } - if ctx.hasBegin { - startMs := ctx.begin - parsedToken.Start = &startMs - } - if ctx.hasEnd { - endMs := ctx.end - parsedToken.End = &endMs - } - tokens = append(tokens, parsedToken) } - return value, tokens, nil + return pieces, nil case xml.CharData: - text.WriteString(string(t)) + pieces = append(pieces, ttmlPiece{raw: string(t)}) } } } +func buildTTMLLineFromPieces(pieces []ttmlPiece) (string, []model.Cue) { + finalized := finalizeTTMLLines(splitTTMLPiecesByNewline(pieces)) + for len(finalized) > 0 && finalized[0].text == "" && len(finalized[0].cues) == 0 { + finalized = finalized[1:] + } + for len(finalized) > 0 { + last := finalized[len(finalized)-1] + if last.text != "" || len(last.cues) > 0 { + break + } + finalized = finalized[:len(finalized)-1] + } + + var value strings.Builder + cues := make([]model.Cue, 0, 8) + byteOffset := 0 + for i, line := range finalized { + if i > 0 { + value.WriteByte('\n') + byteOffset++ + } + value.WriteString(line.text) + for _, cue := range line.cues { + cue.ByteStart += byteOffset + cue.ByteEnd += byteOffset + cues = append(cues, cue) + } + byteOffset += len(line.text) + } + + return value.String(), cues +} + +type 
ttmlFinalLine struct { + text string + cues []model.Cue +} + +func finalizeTTMLLines(lines [][]ttmlPiece) []ttmlFinalLine { + finalized := make([]ttmlFinalLine, 0, len(lines)) + for _, line := range lines { + text, cues := finalizeTTMLLogicalLine(line) + finalized = append(finalized, ttmlFinalLine{text: text, cues: cues}) + } + return finalized +} + +func splitTTMLPiecesByNewline(pieces []ttmlPiece) [][]ttmlPiece { + lines := [][]ttmlPiece{{}} + for _, piece := range pieces { + raw := normalizeTTMLPieceRaw(piece.raw) + if raw == "" { + continue + } + + start := 0 + for i := 0; i < len(raw); i++ { + if raw[i] != '\n' { + continue + } + if start < i { + lines[len(lines)-1] = append(lines[len(lines)-1], ttmlPiece{ + raw: raw[start:i], + cue: cloneTTMLCue(piece.cue), + }) + } + lines = append(lines, []ttmlPiece{}) + start = i + 1 + } + if start < len(raw) { + lines[len(lines)-1] = append(lines[len(lines)-1], ttmlPiece{ + raw: raw[start:], + cue: cloneTTMLCue(piece.cue), + }) + } + } + return lines +} + +func finalizeTTMLLogicalLine(line []ttmlPiece) (string, []model.Cue) { + rawLine := concatTTMLPieceRaw(line) + if rawLine == "" { + return "", nil + } + + leftTrimBytes := len(rawLine) - len(strings.TrimLeftFunc(rawLine, unicode.IsSpace)) + rightTrimBytes := len(rawLine) - len(strings.TrimRightFunc(rawLine, unicode.IsSpace)) + trimmedEnd := len(rawLine) - rightTrimBytes + if trimmedEnd < leftTrimBytes { + trimmedEnd = leftTrimBytes + } + + trimmed := strings.TrimSpace(rawLine) + cues := make([]model.Cue, 0, len(line)) + cursor := 0 + for _, piece := range line { + pieceEnd := cursor + len(piece.raw) + if piece.cue != nil { + byteStart := max(cursor, leftTrimBytes) + byteEnd := min(pieceEnd, trimmedEnd) + if byteStart < byteEnd { + cue := *piece.cue + cue.Value = rawLine[byteStart:byteEnd] + cue.ByteStart = byteStart - leftTrimBytes + cue.ByteEnd = byteEnd - leftTrimBytes - 1 + cues = append(cues, cue) + } + } + cursor = pieceEnd + } + + return trimmed, cues +} + +func 
normalizeTTMLPieceRaw(raw string) string { + raw = str.SanitizeText(raw) + raw = strings.ReplaceAll(raw, "\r\n", "\n") + raw = strings.ReplaceAll(raw, "\r", "\n") + return raw +} + +func concatTTMLPieceRaw(pieces []ttmlPiece) string { + var raw strings.Builder + for _, piece := range pieces { + raw.WriteString(normalizeTTMLPieceRaw(piece.raw)) + } + return raw.String() +} + +func ttmlPiecesContainCue(pieces []ttmlPiece) bool { + for _, piece := range pieces { + if piece.cue != nil { + return true + } + } + return false +} + +func cloneTTMLCue(cue *model.Cue) *model.Cue { + if cue == nil { + return nil + } + + cloned := *cue + return &cloned +} + func (p *ttmlParser) toLyricList() model.LyricList { res := make(model.LyricList, 0, len(p.mainLangOrder)+len(p.translationLangOrder)+len(p.pronunciationLangOrder)) for _, lang := range p.mainLangOrder { diff --git a/core/lyrics/ttml_test.go b/core/lyrics/ttml_test.go index 4e81197d4..5f9092e36 100644 --- a/core/lyrics/ttml_test.go +++ b/core/lyrics/ttml_test.go @@ -141,9 +141,9 @@ var _ = Describe("parseTTML", func() { Expect(line.End).To(Equal(gg.P(int64(3000)))) Expect(line.Cue).To(HaveLen(3)) - Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(1000)), End: gg.P(int64(1400)), Value: "He", AgentID: "main"})) - Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(1400)), End: gg.P(int64(1800)), Value: "llo", AgentID: "main"})) - Expect(line.Cue[2]).To(Equal(model.Cue{Start: gg.P(int64(2000)), End: gg.P(int64(2500)), Value: "echo", AgentID: "__nd_bg__|main"})) + Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(1000)), End: gg.P(int64(1400)), Value: "He", ByteStart: 0, ByteEnd: 1, AgentID: "main"})) + Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(1400)), End: gg.P(int64(1800)), Value: "llo", ByteStart: 2, ByteEnd: 4, AgentID: "main"})) + Expect(line.Cue[2]).To(Equal(model.Cue{Start: gg.P(int64(2000)), End: gg.P(int64(2500)), Value: "echo", ByteStart: 6, ByteEnd: 9, AgentID: "__nd_bg__|main"})) 
}) It("should parse named TTML agents into main, voice, and group roles", func() { @@ -241,8 +241,8 @@ var _ = Describe("parseTTML", func() { Expect(line.Value).To(Equal("go\ngo")) Expect(line.End).To(Equal(gg.P(int64(45570)))) Expect(line.Cue).To(HaveLen(2)) - Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(43444)), End: gg.P(int64(43716)), Value: "go"})) - Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(43716)), End: gg.P(int64(43887)), Value: "go"})) + Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(43444)), End: gg.P(int64(43716)), Value: "go", ByteStart: 0, ByteEnd: 1})) + Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(43716)), End: gg.P(int64(43887)), Value: "go", ByteStart: 3, ByteEnd: 4})) }) }) @@ -325,8 +325,8 @@ var _ = Describe("parseTTML", func() { Expect(pronunciation.Line[0].Value).To(Equal("konni")) Expect(pronunciation.Line[0].End).To(Equal(gg.P(int64(2600)))) Expect(pronunciation.Line[0].Cue).To(HaveLen(2)) - Expect(pronunciation.Line[0].Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(2000)), End: gg.P(int64(2300)), Value: "ko"})) - Expect(pronunciation.Line[0].Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(2300)), End: gg.P(int64(2600)), Value: "nni"})) + Expect(pronunciation.Line[0].Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(2000)), End: gg.P(int64(2300)), Value: "ko", ByteStart: 0, ByteEnd: 1})) + Expect(pronunciation.Line[0].Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(2300)), End: gg.P(int64(2600)), Value: "nni", ByteStart: 2, ByteEnd: 4})) }) }) @@ -369,9 +369,9 @@ var _ = Describe("parseTTML", func() { Expect(line.Start).To(Equal(gg.P(int64(2747)))) Expect(line.Value).To(Equal("I woke up")) Expect(line.Cue).To(HaveLen(3)) - Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(2747)), End: gg.P(int64(3018)), Value: "I"})) - Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(3018)), End: gg.P(int64(3179)), Value: "woke"})) - Expect(line.Cue[2]).To(Equal(model.Cue{Start: gg.P(int64(3179)), End: 
gg.P(int64(3582)), Value: "up"})) + Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(2747)), End: gg.P(int64(3018)), Value: "I", ByteStart: 0, ByteEnd: 0})) + Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(3018)), End: gg.P(int64(3179)), Value: "woke", ByteStart: 2, ByteEnd: 5})) + Expect(line.Cue[2]).To(Equal(model.Cue{Start: gg.P(int64(3179)), End: gg.P(int64(3582)), Value: "up", ByteStart: 7, ByteEnd: 8})) }) }) }) diff --git a/model/lyrics.go b/model/lyrics.go index ec0df9f34..9a57ebaad 100644 --- a/model/lyrics.go +++ b/model/lyrics.go @@ -6,16 +6,19 @@ import ( "slices" "strconv" "strings" + "unicode" "github.com/navidrome/navidrome/log" "github.com/navidrome/navidrome/utils/str" ) type Cue struct { - Start *int64 `structs:"start,omitempty" json:"start,omitempty"` - End *int64 `structs:"end,omitempty" json:"end,omitempty"` - Value string `structs:"value" json:"value"` - AgentID string `structs:"agentId,omitempty" json:"agentId,omitempty"` + Start *int64 `structs:"start,omitempty" json:"start,omitempty"` + End *int64 `structs:"end,omitempty" json:"end,omitempty"` + Value string `structs:"value" json:"value"` + ByteStart int `structs:"byteStart" json:"byteStart"` + ByteEnd int `structs:"byteEnd" json:"byteEnd"` + AgentID string `structs:"agentId,omitempty" json:"agentId,omitempty"` } type Agent struct { @@ -127,14 +130,10 @@ func ToLyrics(language, text string) (*Lyrics, error) { if validLine { for idx := range timestamps { - cues := parseEnhancedCues(priorLine) - value := priorLine - if cues != nil { - value = stripEnhancedMarkers(value) - } + value, cues := parseEnhancedLine(priorLine) structuredLines = append(structuredLines, Line{ Start: &timestamps[idx], - Value: strings.TrimSpace(value), + Value: value, Cue: cues, }) } @@ -181,14 +180,10 @@ func ToLyrics(language, text string) (*Lyrics, error) { if validLine { for idx := range timestamps { - cues := parseEnhancedCues(priorLine) - value := priorLine - if cues != nil { - value = 
stripEnhancedMarkers(value) - } + value, cues := parseEnhancedLine(priorLine) structuredLines = append(structuredLines, Line{ Start: &timestamps[idx], - Value: strings.TrimSpace(value), + Value: value, Cue: cues, }) } @@ -213,21 +208,22 @@ func ToLyrics(language, text string) (*Lyrics, error) { return &lyrics, nil } -// parseEnhancedCues extracts word-level timing cues from Enhanced LRC inline markers. -// Format: word word ... -// Returns nil if no inline markers are found. -func parseEnhancedCues(text string) []Cue { +// parseEnhancedLine extracts word-level timing cues from Enhanced LRC inline markers +// and computes UTF-8 byte offsets against the final stripped line value. +func parseEnhancedLine(text string) (string, []Cue) { matches := enhancedLRCRegex.FindAllStringSubmatchIndex(text, -1) if len(matches) == 0 { - return nil + return strings.TrimSpace(text), nil } type segment struct { - start int64 - text string + start int64 + rawStart int + rawEnd int } segments := make([]segment, 0, len(matches)) + var rawValue strings.Builder for i, match := range matches { timeMs, err := parseTime( // Rewrite <...> as [...] 
so parseTime can handle it with the same logic @@ -258,22 +254,46 @@ func parseEnhancedCues(text string) []Cue { if word == "" { continue } - segments = append(segments, segment{start: timeMs, text: word}) + + rawStart := rawValue.Len() + rawValue.WriteString(word) + segments = append(segments, segment{ + start: timeMs, + rawStart: rawStart, + rawEnd: rawValue.Len(), + }) } if len(segments) == 0 { - return nil + return strings.TrimSpace(stripEnhancedMarkers(text)), nil } - cues := make([]Cue, len(segments)) - for i, seg := range segments { - start := seg.start - cues[i] = Cue{ - Start: &start, - Value: seg.text, - } + finalRaw := rawValue.String() + leftTrimBytes := len(finalRaw) - len(strings.TrimLeftFunc(finalRaw, unicode.IsSpace)) + rightTrimBytes := len(finalRaw) - len(strings.TrimRightFunc(finalRaw, unicode.IsSpace)) + trimmedEnd := len(finalRaw) - rightTrimBytes + if trimmedEnd < leftTrimBytes { + trimmedEnd = leftTrimBytes } - return cues + + cues := make([]Cue, 0, len(segments)) + for _, seg := range segments { + start := seg.start + byteStart := max(seg.rawStart, leftTrimBytes) + byteEnd := min(seg.rawEnd, trimmedEnd) + if byteStart >= byteEnd { + continue + } + + cues = append(cues, Cue{ + Start: &start, + Value: finalRaw[byteStart:byteEnd], + ByteStart: byteStart - leftTrimBytes, + ByteEnd: byteEnd - leftTrimBytes - 1, + }) + } + + return strings.TrimSpace(finalRaw), cues } // adjustGroup remaps a capture group index from the original match to our rewritten "[...]" string. 
diff --git a/model/lyrics_test.go b/model/lyrics_test.go index 6f189f024..1fa82f258 100644 --- a/model/lyrics_test.go +++ b/model/lyrics_test.go @@ -130,9 +130,9 @@ var _ = Describe("ToLyrics", func() { Expect(line0.End).To(Equal(&t3000)) Expect(line0.Value).To(Equal("Some lyrics here")) Expect(line0.Cue).To(Equal([]Cue{ - {Start: &t1000, End: &t1500, Value: "Some "}, - {Start: &t1500, End: &t2000, Value: "lyrics "}, - {Start: &t2000, End: &t3000, Value: "here"}, + {Start: &t1000, End: &t1500, Value: "Some ", ByteStart: 0, ByteEnd: 4}, + {Start: &t1500, End: &t2000, Value: "lyrics ", ByteStart: 5, ByteEnd: 11}, + {Start: &t2000, End: &t3000, Value: "here", ByteStart: 12, ByteEnd: 15}, })) line1 := lyrics.Line[1] @@ -140,8 +140,8 @@ var _ = Describe("ToLyrics", func() { Expect(line1.End).To(Equal(&t3500)) Expect(line1.Value).To(Equal("More words")) Expect(line1.Cue).To(Equal([]Cue{ - {Start: &t3000, Value: "More "}, - {Start: &t3500, Value: "words"}, + {Start: &t3000, Value: "More ", ByteStart: 0, ByteEnd: 4}, + {Start: &t3500, Value: "words", ByteStart: 5, ByteEnd: 9}, })) Expect(line1.Cue[1].End).To(BeNil()) @@ -166,8 +166,8 @@ var _ = Describe("ToLyrics", func() { t3000 := int64(3000) Expect(lyrics.Line[0].Cue).To(Equal([]Cue{ - {Start: &t1000, End: &t1500, Value: "Some "}, - {Start: &t1500, End: &t3000, Value: "lyrics"}, + {Start: &t1000, End: &t1500, Value: "Some ", ByteStart: 0, ByteEnd: 4}, + {Start: &t1500, End: &t3000, Value: "lyrics", ByteStart: 5, ByteEnd: 10}, })) Expect(lyrics.Line[0].Value).To(Equal("Some lyrics")) Expect(lyrics.Line[0].End).To(Equal(&t3000)) @@ -176,9 +176,25 @@ var _ = Describe("ToLyrics", func() { Expect(lyrics.Line[1].Value).To(Equal("Plain line")) Expect(lyrics.Line[2].Cue).To(Equal([]Cue{ - {Start: &t5000, Value: "More "}, - {Start: &t5500, Value: "words"}, + {Start: &t5000, Value: "More ", ByteStart: 0, ByteEnd: 4}, + {Start: &t5500, Value: "words", ByteStart: 5, ByteEnd: 9}, })) Expect(lyrics.Line[2].Value).To(Equal("More 
words")) }) + + It("should preserve byte offsets for Enhanced LRC cues", func() { + lyrics, err := ToLyrics("xxx", "[00:00.00]<00:00.00>Oh <00:00.90>love<00:01.30> me <00:01.60>tonight") + Expect(err).ToNot(HaveOccurred()) + Expect(lyrics.Line).To(HaveLen(1)) + + t0, t900, t1300, t1600 := int64(0), int64(900), int64(1300), int64(1600) + line := lyrics.Line[0] + Expect(line.Value).To(Equal("Oh love me tonight")) + Expect(line.Cue).To(Equal([]Cue{ + {Start: &t0, Value: "Oh ", ByteStart: 0, ByteEnd: 2}, + {Start: &t900, Value: "love", ByteStart: 3, ByteEnd: 6}, + {Start: &t1300, Value: " me ", ByteStart: 7, ByteEnd: 10}, + {Start: &t1600, Value: "tonight", ByteStart: 11, ByteEnd: 17}, + })) + }) }) diff --git a/server/subsonic/helpers.go b/server/subsonic/helpers.go index ad769ee94..6a14aa4aa 100644 --- a/server/subsonic/helpers.go +++ b/server/subsonic/helpers.go @@ -619,8 +619,10 @@ func buildLyricCues(cues []model.Cue, lineEnd *int64) []responses.LyricCue { } cue := responses.LyricCue{ - Start: *cues[i].Start, - Value: cues[i].Value, + Start: *cues[i].Start, + Value: cues[i].Value, + ByteStart: cues[i].ByteStart, + ByteEnd: cues[i].ByteEnd, } if hasAnyEnd { end := cues[i].End diff --git a/server/subsonic/media_retrieval_test.go b/server/subsonic/media_retrieval_test.go index e4f6a21d4..faa90e375 100644 --- a/server/subsonic/media_retrieval_test.go +++ b/server/subsonic/media_retrieval_test.go @@ -277,6 +277,8 @@ var _ = Describe("MediaRetrievalController", func() { expectedCue := expectedCueLine.Cue[k] Expect(realCue.Value).To(Equal(expectedCue.Value)) Expect(realCue.Start).To(Equal(expectedCue.Start)) + Expect(realCue.ByteStart).To(Equal(expectedCue.ByteStart)) + Expect(realCue.ByteEnd).To(Equal(expectedCue.ByteEnd)) if expectedCue.End == nil { Expect(realCue.End).To(BeNil()) } else { @@ -514,14 +516,18 @@ var _ = Describe("MediaRetrievalController", func() { Value: "konni", Cue: []responses.LyricCue{ { - Start: tokenStartA, - End: &tokenEndA, - Value: "ko", + 
Start: tokenStartA, + End: &tokenEndA, + ByteStart: 0, + ByteEnd: 1, + Value: "ko", }, { - Start: tokenStartB, - End: &tokenEndB, - Value: "nni", + Start: tokenStartB, + End: &tokenEndB, + ByteStart: 2, + ByteEnd: 4, + Value: "nni", }, }, }, @@ -552,16 +558,20 @@ var _ = Describe("MediaRetrievalController", func() { Value: "Hello echo", Cue: []model.Cue{ { - Start: &tokenStartA, - End: &tokenEndA, - Value: "Hello", - AgentID: "lead", + Start: &tokenStartA, + End: &tokenEndA, + Value: "Hello", + ByteStart: 0, + ByteEnd: 4, + AgentID: "lead", }, { - Start: &tokenStartB, - End: &tokenEndB, - Value: "echo", - AgentID: "__nd_bg__|lead", + Start: &tokenStartB, + End: &tokenEndB, + Value: "echo", + ByteStart: 6, + ByteEnd: 9, + AgentID: "__nd_bg__|lead", }, }, }, @@ -608,9 +618,11 @@ var _ = Describe("MediaRetrievalController", func() { AgentID: "lead", Cue: []responses.LyricCue{ { - Start: tokenStartA, - End: &tokenEndA, - Value: "Hello", + Start: tokenStartA, + End: &tokenEndA, + ByteStart: 0, + ByteEnd: 4, + Value: "Hello", }, }, }, @@ -622,9 +634,11 @@ var _ = Describe("MediaRetrievalController", func() { AgentID: "__nd_bg__|lead", Cue: []responses.LyricCue{ { - Start: tokenStartB, - End: &tokenEndB, - Value: "echo", + Start: tokenStartB, + End: &tokenEndB, + ByteStart: 6, + ByteEnd: 9, + Value: "echo", }, }, }, @@ -633,6 +647,116 @@ var _ = Describe("MediaRetrievalController", func() { }, }) }) + + It("should return required cue byte offsets for ambiguous and multibyte cue lines", func() { + r := newGetRequest("id=1&enhanced=true") + + asciiLineStart := int64(0) + asciiLineEnd := int64(2400) + asciiCueStartA := int64(0) + asciiCueEndA := int64(300) + asciiCueStartB := int64(900) + asciiCueEndB := int64(1300) + asciiCueStartC := int64(1300) + asciiCueEndC := int64(1600) + asciiCueStartD := int64(1600) + + utfLineStart := int64(2747) + utfLineEnd := int64(6214) + utfCueStartA := int64(2747) + utfCueEndA := int64(3018) + utfCueStartB := int64(3018) + utfCueEndB := 
int64(3179) + utfCueStartC := int64(3582) + utfCueEndC := int64(4100) + utfCueStartD := int64(4500) + utfCueEndD := int64(6214) + + lyricsJSON, err := json.Marshal(model.LyricList{ + { + Lang: "eng", + Synced: true, + Line: []model.Line{ + { + Start: &asciiLineStart, + End: &asciiLineEnd, + Value: "Oh love love me tonight", + Cue: []model.Cue{ + {Start: &asciiCueStartA, End: &asciiCueEndA, Value: "Oh", ByteStart: 0, ByteEnd: 1}, + {Start: &asciiCueStartB, End: &asciiCueEndB, Value: "love", ByteStart: 8, ByteEnd: 11}, + {Start: &asciiCueStartC, End: &asciiCueEndC, Value: "me", ByteStart: 13, ByteEnd: 14}, + {Start: &asciiCueStartD, Value: "tonight", ByteStart: 16, ByteEnd: 22}, + }, + }, + { + Start: &utfLineStart, + End: &utfLineEnd, + Value: "눈을 뜬 순간", + Cue: []model.Cue{ + {Start: &utfCueStartA, End: &utfCueEndA, Value: "눈", ByteStart: 0, ByteEnd: 2}, + {Start: &utfCueStartB, End: &utfCueEndB, Value: "을", ByteStart: 3, ByteEnd: 5}, + {Start: &utfCueStartC, End: &utfCueEndC, Value: "뜬", ByteStart: 7, ByteEnd: 9}, + {Start: &utfCueStartD, End: &utfCueEndD, Value: "순간", ByteStart: 11, ByteEnd: 16}, + }, + }, + }, + }, + }) + Expect(err).ToNot(HaveOccurred()) + + mockRepo.SetData(model.MediaFiles{ + { + ID: "1", + Artist: "Rick Astley", + Title: "Never Gonna Give You Up", + Lyrics: string(lyricsJSON), + }, + }) + + response, err := router.GetLyricsBySongId(r) + Expect(err).ToNot(HaveOccurred()) + compareResponses(response.LyricsList, responses.LyricsList{ + StructuredLyrics: responses.StructuredLyrics{ + { + DisplayArtist: "Rick Astley", + DisplayTitle: "Never Gonna Give You Up", + Kind: "main", + Lang: "eng", + Synced: true, + Line: []responses.Line{ + {Start: &asciiLineStart, Value: "Oh love love me tonight"}, + {Start: &utfLineStart, Value: "눈을 뜬 순간"}, + }, + CueLine: []responses.CueLine{ + { + Index: 0, + Start: &asciiLineStart, + End: &asciiLineEnd, + Value: "Oh love love me tonight", + Cue: []responses.LyricCue{ + {Start: asciiCueStartA, End: &asciiCueEndA, 
Value: "Oh", ByteStart: 0, ByteEnd: 1}, + {Start: asciiCueStartB, End: &asciiCueEndB, Value: "love", ByteStart: 8, ByteEnd: 11}, + {Start: asciiCueStartC, End: &asciiCueEndC, Value: "me", ByteStart: 13, ByteEnd: 14}, + {Start: asciiCueStartD, End: &asciiLineEnd, Value: "tonight", ByteStart: 16, ByteEnd: 22}, + }, + }, + { + Index: 1, + Start: &utfLineStart, + End: &utfLineEnd, + Value: "눈을 뜬 순간", + Cue: []responses.LyricCue{ + {Start: utfCueStartA, End: &utfCueEndA, Value: "눈", ByteStart: 0, ByteEnd: 2}, + {Start: utfCueStartB, End: &utfCueEndB, Value: "을", ByteStart: 3, ByteEnd: 5}, + {Start: utfCueStartC, End: &utfCueEndC, Value: "뜬", ByteStart: 7, ByteEnd: 9}, + {Start: utfCueStartD, End: &utfCueEndD, Value: "순간", ByteStart: 11, ByteEnd: 16}, + }, + }, + }, + }, + }, + }) + }) }) }) diff --git a/server/subsonic/responses/responses.go b/server/subsonic/responses/responses.go index 344dd9999..d1ecdb307 100644 --- a/server/subsonic/responses/responses.go +++ b/server/subsonic/responses/responses.go @@ -538,9 +538,11 @@ type Line struct { } type LyricCue struct { - Start int64 `xml:"start,attr" json:"start"` - End *int64 `xml:"end,attr,omitempty" json:"end,omitempty"` - Value string `xml:",chardata" json:"value"` + Start int64 `xml:"start,attr" json:"start"` + End *int64 `xml:"end,attr,omitempty" json:"end,omitempty"` + ByteStart int `xml:"byteStart,attr" json:"byteStart"` + ByteEnd int `xml:"byteEnd,attr" json:"byteEnd"` + Value string `xml:",chardata" json:"value"` } type Agent struct { @@ -553,7 +555,7 @@ type CueLine struct { Index int32 `xml:"index,attr" json:"index"` Start *int64 `xml:"start,attr,omitempty" json:"start,omitempty"` End *int64 `xml:"end,attr,omitempty" json:"end,omitempty"` - Value string `xml:"value,attr,omitempty" json:"value,omitempty"` + Value string `xml:"value,attr" json:"value"` AgentID string `xml:"agentId,attr,omitempty" json:"agentId,omitempty"` Cue []LyricCue `xml:"cue,omitempty" json:"cue,omitempty"` } diff --git 
a/ui/src/audioplayer/KaraokeLyricsOverlay.jsx b/ui/src/audioplayer/KaraokeLyricsOverlay.jsx index cd1484e41..799f8bdc2 100644 --- a/ui/src/audioplayer/KaraokeLyricsOverlay.jsx +++ b/ui/src/audioplayer/KaraokeLyricsOverlay.jsx @@ -26,6 +26,7 @@ import { hasStructuredLyricContent, resolveKaraokeTokenWindow, resolveLayerLineForMain, + utf8ByteRangeToCodeUnitRange, } from './lyrics' const KARAOKE_RENDER_LEAD_MS = 24 @@ -635,6 +636,72 @@ const buildSegmentsFromLine = (line) => { } const text = line.value || '' + const exactSegments = (() => { + if (!text) { + return null + } + + const rangedTokens = line.tokens + .map((token, tokenIndex) => ({ + token, + tokenIndex, + range: utf8ByteRangeToCodeUnitRange( + text, + token?.byteStart, + token?.byteEnd, + ), + })) + .filter((entry) => entry.range != null) + + if ( + rangedTokens.length !== line.tokens.length || + rangedTokens.length === 0 + ) { + return null + } + + rangedTokens.sort( + (a, b) => + a.range.start - b.range.start || + a.range.end - b.range.end || + a.tokenIndex - b.tokenIndex, + ) + + const segments = [] + let cursor = 0 + for (const entry of rangedTokens) { + if (entry.range.start < cursor) { + return null + } + if (entry.range.start > cursor) { + segments.push({ + text: text.slice(cursor, entry.range.start), + token: null, + tokenIndex: -1, + }) + } + segments.push({ + text: entry.range.text, + token: entry.token, + tokenIndex: entry.tokenIndex, + }) + cursor = entry.range.end + } + + if (cursor < text.length) { + segments.push({ + text: text.slice(cursor), + token: null, + tokenIndex: -1, + }) + } + + return segments + })() + if (exactSegments) { + return exactSegments + } + const matchedSegments = [] const fallbackSegments = [] let cursor = 0 diff --git a/ui/src/audioplayer/KaraokeLyricsOverlay.test.jsx b/ui/src/audioplayer/KaraokeLyricsOverlay.test.jsx index 411116eae..412bc3946 100644 --- a/ui/src/audioplayer/KaraokeLyricsOverlay.test.jsx +++ b/ui/src/audioplayer/KaraokeLyricsOverlay.test.jsx @@ -177,6 
+177,74 @@ describe(' behavior', () => { expect(translationLine.querySelectorAll('span')).toHaveLength(1) }) + it('uses cue byte offsets to segment repeated words in the karaoke line', () => { + renderOverlay({ + mainLyric: { + kind: 'main', + lang: 'en', + synced: true, + line: [{ start: 0, end: 2400, value: 'Oh love love me tonight' }], + cueLine: [ + { + index: 0, + start: 0, + end: 2400, + value: 'Oh love love me tonight', + cue: [ + { start: 0, end: 300, value: 'Oh', byteStart: 0, byteEnd: 1 }, + { + start: 900, + end: 1300, + value: 'love', + byteStart: 8, + byteEnd: 11, + }, + { + start: 1300, + end: 1600, + value: 'me', + byteStart: 13, + byteEnd: 14, + }, + { + start: 1600, + end: 2400, + value: 'tonight', + byteStart: 16, + byteEnd: 22, + }, + ], + }, + ], + }, + translationLyric: null, + pronunciationLyric: null, + showTranslation: false, + showPronunciation: false, + translationEnabled: false, + pronunciationEnabled: false, + audioInstance: { + ...audioInstance, + currentTime: 1.0, + }, + }) + + const mainLine = screen.getByText('Oh').parentElement + const segments = Array.from(mainLine.querySelectorAll('span')).map( + (span) => span.textContent, + ) + + expect(segments).toEqual([ + 'Oh', + ' love ', + 'love', + ' ', + 'me', + ' ', + 'tonight', + ]) + }) + it('highlights line-timed pronunciation and translation rows with the active main line', () => { renderOverlay({ mainLyric: { diff --git a/ui/src/audioplayer/lyrics.js b/ui/src/audioplayer/lyrics.js index e9cd16d5a..6fa627ee5 100644 --- a/ui/src/audioplayer/lyrics.js +++ b/ui/src/audioplayer/lyrics.js @@ -19,6 +19,17 @@ const toTime = (value) => { return Number.isFinite(numeric) ? 
numeric : null } +const toByteOffset = (value) => { + if (value == null || value === '') { + return null + } + const numeric = Number(value) + if (!Number.isInteger(numeric) || numeric < 0) { + return null + } + return numeric +} + const compareNullableTime = (a, b) => { if (a == null && b == null) { return 0 @@ -78,10 +89,79 @@ const normalizeToken = (token) => { if (!value.trim()) { return null } + const byteStart = toByteOffset(token.byteStart) + const byteEnd = toByteOffset(token.byteEnd) return { start: toTime(token.start), end: toTime(token.end), value, + ...(byteStart != null ? { byteStart } : {}), + ...(byteEnd != null ? { byteEnd } : {}), + } +} + +const utf8BytesForCodePoint = (codePoint) => { + if (codePoint <= 0x7f) { + return 1 + } + if (codePoint <= 0x7ff) { + return 2 + } + if (codePoint <= 0xffff) { + return 3 + } + return 4 +} + +export const utf8ByteOffsetToCodeUnitIndex = (text, targetByteOffset) => { + if (typeof text !== 'string' || text.length === 0) { + return 0 + } + + const target = toByteOffset(targetByteOffset) + if (target == null || target <= 0) { + return 0 + } + + let byteOffset = 0 + let index = 0 + while (index < text.length) { + if (byteOffset >= target) { + return index + } + const codePoint = text.codePointAt(index) + byteOffset += utf8BytesForCodePoint(codePoint) + index += codePoint > 0xffff ? 
2 : 1 + } + + return text.length +} + +export const utf8ByteRangeToCodeUnitRange = (text, byteStart, byteEnd) => { + if (typeof text !== 'string') { + return null + } + + const start = toByteOffset(byteStart) + const end = toByteOffset(byteEnd) + if (start == null || end == null || end < start) { + return null + } + + const startIndex = utf8ByteOffsetToCodeUnitIndex(text, start) + const endIndex = utf8ByteOffsetToCodeUnitIndex(text, end + 1) + if ( + startIndex >= endIndex || + startIndex > text.length || + endIndex > text.length + ) { + return null + } + + return { + start: startIndex, + end: endIndex, + text: text.slice(startIndex, endIndex), } } diff --git a/ui/src/audioplayer/lyrics.test.js b/ui/src/audioplayer/lyrics.test.js index 2fcf1df40..961fdb10b 100644 --- a/ui/src/audioplayer/lyrics.test.js +++ b/ui/src/audioplayer/lyrics.test.js @@ -13,6 +13,8 @@ import { selectLyricLayers, structuredLyricsToLrc, structuredLyricToLrc, + utf8ByteOffsetToCodeUnitIndex, + utf8ByteRangeToCodeUnitRange, } from './lyrics' describe('lyrics helpers', () => { @@ -412,6 +414,60 @@ describe('lyrics helpers', () => { ]) }) + it('preserves cue byte offsets on karaoke tokens', () => { + const lines = buildKaraokeLines({ + lang: 'eng', + synced: true, + line: [{ start: 0, end: 2400, value: 'Oh love love me tonight' }], + cueLine: [ + { + index: 0, + start: 0, + end: 2400, + value: 'Oh love love me tonight', + cue: [ + { start: 0, end: 300, value: 'Oh', byteStart: 0, byteEnd: 1 }, + { start: 900, end: 1300, value: 'love', byteStart: 8, byteEnd: 11 }, + { start: 1300, end: 1600, value: 'me', byteStart: 13, byteEnd: 14 }, + { + start: 1600, + end: 2400, + value: 'tonight', + byteStart: 16, + byteEnd: 22, + }, + ], + }, + ], + }) + + expect( + lines[0].tokens.map((token) => [ + token.value, + token.byteStart, + token.byteEnd, + ]), + ).toEqual([ + ['Oh', 0, 1], + ['love', 8, 11], + ['me', 13, 14], + ['tonight', 16, 22], + ]) + }) + + it('maps UTF-8 byte offsets to string ranges for 
multibyte lyrics', () => { + const text = '눈을 뜬 순간' + + expect(utf8ByteOffsetToCodeUnitIndex(text, 0)).toBe(0) + expect(utf8ByteOffsetToCodeUnitIndex(text, 3)).toBe(1) + expect(utf8ByteOffsetToCodeUnitIndex(text, 7)).toBe(3) + expect(utf8ByteRangeToCodeUnitRange(text, 11, 16)).toEqual({ + start: 5, + end: 7, + text: '순간', + }) + }) + it('falls back to legacy cueLine role values when agents are absent', () => { const lines = buildKaraokeLines({ lang: 'eng', diff --git a/ui/src/subsonic/index.test.js b/ui/src/subsonic/index.test.js index 6910fdc8d..6bd5e08ee 100644 --- a/ui/src/subsonic/index.test.js +++ b/ui/src/subsonic/index.test.js @@ -1,5 +1,4 @@ import { vi } from 'vitest' -import { COVER_ART_SIZE } from '../consts' import { httpClient } from '../dataProvider' import subsonic from './index' @@ -7,6 +6,8 @@ vi.mock('../dataProvider', () => ({ httpClient: vi.fn(() => Promise.resolve({})), })) +const COVER_ART_SIZE = 600 + describe('getCoverArtUrl', () => { beforeEach(() => { // Mock window.location From cc15a2f820244fe395b3a3d099489851cacf8e4e Mon Sep 17 00:00:00 2001 From: ranokay Date: Tue, 14 Apr 2026 05:10:54 +0300 Subject: [PATCH 12/14] fix(lyrics): polish karaoke rendering and mobile layout --- core/lyrics/lyrics.go | 59 +++++++--- core/lyrics/lyrics_test.go | 23 ++++ core/lyrics/sources_test.go | 2 +- server/subsonic/media_retrieval.go | 48 ++++++--- server/subsonic/media_retrieval_test.go | 11 +- ui/src/audioplayer/KaraokeLyricsOverlay.jsx | 83 +++++++++++--- .../audioplayer/KaraokeLyricsOverlay.test.jsx | 102 ++++++++++++++++++ .../audioplayer/MobileKaraokeLyricsPortal.jsx | 65 +++++++++++ .../MobileKaraokeLyricsPortal.test.jsx | 55 ++++++++++ ui/src/audioplayer/Player.jsx | 78 ++++++++++---- ui/src/audioplayer/lyrics.js | 2 +- ui/src/audioplayer/lyrics.test.js | 37 +++++++ ui/src/audioplayer/styles.js | 18 ++++ 13 files changed, 512 insertions(+), 71 deletions(-) create mode 100644 ui/src/audioplayer/MobileKaraokeLyricsPortal.jsx create mode 100644 
ui/src/audioplayer/MobileKaraokeLyricsPortal.test.jsx diff --git a/core/lyrics/lyrics.go b/core/lyrics/lyrics.go index 758053042..cc3d574b3 100644 --- a/core/lyrics/lyrics.go +++ b/core/lyrics/lyrics.go @@ -14,6 +14,12 @@ type Lyrics interface { GetLyrics(ctx context.Context, mf *model.MediaFile) (model.LyricList, error) } +// BatchLyrics can resolve lyrics across multiple candidate media files while +// still honoring the configured source priority globally. +type BatchLyrics interface { + GetLyricsForMediaFiles(ctx context.Context, mediaFiles []model.MediaFile) (model.LyricList, error) +} + // PluginLoader discovers and loads lyrics provider plugins. type PluginLoader interface { LoadLyricsProvider(name string) (Lyrics, bool) @@ -32,28 +38,53 @@ func NewLyrics(pluginLoader PluginLoader) Lyrics { // GetLyrics returns lyrics for the given media file, trying sources in the // order specified by conf.Server.LyricsPriority. func (l *lyricsService) GetLyrics(ctx context.Context, mf *model.MediaFile) (model.LyricList, error) { - var lyricsList model.LyricList - var err error + return l.getLyricsForCandidates(ctx, []*model.MediaFile{mf}) +} +// GetLyricsForMediaFiles resolves lyrics across duplicate media files while +// preserving the configured source priority across the full candidate set. 
+func (l *lyricsService) GetLyricsForMediaFiles(ctx context.Context, mediaFiles []model.MediaFile) (model.LyricList, error) { + candidates := make([]*model.MediaFile, 0, len(mediaFiles)) + for i := range mediaFiles { + candidates = append(candidates, &mediaFiles[i]) + } + return l.getLyricsForCandidates(ctx, candidates) +} + +func (l *lyricsService) getLyricsForCandidates(ctx context.Context, mediaFiles []*model.MediaFile) (model.LyricList, error) { for pattern := range strings.SplitSeq(conf.Server.LyricsPriority, ",") { pattern = strings.TrimSpace(pattern) - switch { - case strings.EqualFold(pattern, "embedded"): - lyricsList, err = fromEmbedded(ctx, mf) - case strings.HasPrefix(pattern, "."): - lyricsList, err = fromExternalFile(ctx, mf, strings.ToLower(pattern)) - default: - lyricsList, err = l.fromPlugin(ctx, mf, pattern) + if pattern == "" { + continue } - if err != nil { - log.Error(ctx, "error getting lyrics", "source", pattern, err) - } + for _, mf := range mediaFiles { + if mf == nil { + continue + } - if len(lyricsList) > 0 { - return lyricsList, nil + lyricsList, err := l.getLyricsFromSource(ctx, mf, pattern) + if err != nil { + log.Error(ctx, "error getting lyrics", "source", pattern, err) + continue + } + + if len(lyricsList) > 0 { + return lyricsList, nil + } } } return nil, nil } + +func (l *lyricsService) getLyricsFromSource(ctx context.Context, mf *model.MediaFile, pattern string) (model.LyricList, error) { + switch { + case strings.EqualFold(pattern, "embedded"): + return fromEmbedded(ctx, mf) + case strings.HasPrefix(pattern, "."): + return fromExternalFile(ctx, mf, strings.ToLower(pattern)) + default: + return l.fromPlugin(ctx, mf, pattern) + } +} diff --git a/core/lyrics/lyrics_test.go b/core/lyrics/lyrics_test.go index 822e975ce..26bdd5aa9 100644 --- a/core/lyrics/lyrics_test.go +++ b/core/lyrics/lyrics_test.go @@ -169,6 +169,29 @@ var _ = Describe("sources", func() { Entry("txt > lrc > embedded", ".txt,.lrc,embedded", unsyncedLyrics), 
Entry("ttml > elrc > lrc > srt > embedded", ".ttml,.elrc,.lrc,.srt,embedded", ttmlLyrics)) + It("resolves source priority across duplicate media files", func() { + conf.Server.LyricsPriority = ".ttml,embedded" + embeddedJSON, err := json.Marshal(embeddedLyrics) + Expect(err).To(BeNil()) + + svc := lyrics.NewLyrics(nil) + batchSvc, ok := svc.(lyrics.BatchLyrics) + Expect(ok).To(BeTrue()) + + list, err := batchSvc.GetLyricsForMediaFiles(ctx, []model.MediaFile{ + { + Lyrics: string(embeddedJSON), + Path: "tests/fixtures/01 Invisible (RED) Edit Version.mp3", + }, + { + Lyrics: "[]", + Path: "tests/fixtures/test.mp3", + }, + }) + Expect(err).To(BeNil()) + Expect(list).To(Equal(ttmlLyrics)) + }) + Context("Errors", func() { var RegularUserContext = XContext var isRegularUser = os.Getuid() != 0 diff --git a/core/lyrics/sources_test.go b/core/lyrics/sources_test.go index 5ba03336e..1e98323ca 100644 --- a/core/lyrics/sources_test.go +++ b/core/lyrics/sources_test.go @@ -299,7 +299,7 @@ var _ = Describe("sources", func() { Expect(lyrics[0].Line[0].Value).To(Equal("BOM test line")) }) - It("should handle UTF-16 LE encoded TTML files", func() { + It("should handle UTF-16 BE encoded TTML files", func() { mf := model.MediaFile{Path: "tests/fixtures/bom-utf16-test.mp3"} lyrics, err := fromExternalFile(ctx, &mf, ".ttml") diff --git a/server/subsonic/media_retrieval.go b/server/subsonic/media_retrieval.go index 16d0d2666..c3c6d98ea 100644 --- a/server/subsonic/media_retrieval.go +++ b/server/subsonic/media_retrieval.go @@ -10,6 +10,7 @@ import ( "github.com/navidrome/navidrome/conf" "github.com/navidrome/navidrome/consts" + lyricssvc "github.com/navidrome/navidrome/core/lyrics" "github.com/navidrome/navidrome/log" "github.com/navidrome/navidrome/model" "github.com/navidrome/navidrome/resources" @@ -19,6 +20,8 @@ import ( "github.com/navidrome/navidrome/utils/req" ) +const maxLegacyLyricsCandidates = 10 + func (api *Router) GetAvatar(w http.ResponseWriter, r *http.Request) 
(*responses.Subsonic, error) { if !conf.Server.EnableGravatar { return api.getPlaceHolderAvatar(w, r) @@ -99,9 +102,9 @@ func (api *Router) GetLyrics(r *http.Request) (*responses.Subsonic, error) { lyricsResponse := responses.Lyrics{} response.Lyrics = &lyricsResponse opts := filter.SongsByArtistTitleWithLyricsFirst(artist, title) - // Keep the search exhaustive so an older duplicate can still supply the - // matching sidecar lyrics when the newest candidate only has embedded data. - opts.Max = 0 + // Search a bounded duplicate window so source-priority fallback can still + // reach older matches without turning legacy getLyrics into an unbounded scan. + opts.Max = maxLegacyLyricsCandidates mediaFiles, err := api.ds.MediaFile(r.Context()).GetAll(opts) if err != nil { @@ -112,26 +115,37 @@ func (api *Router) GetLyrics(r *http.Request) (*responses.Subsonic, error) { return response, nil } - for i := range mediaFiles { - structuredLyrics, err := api.lyrics.GetLyrics(r.Context(), &mediaFiles[i]) + var structuredLyrics model.LyricList + if batchLyrics, ok := api.lyrics.(lyricssvc.BatchLyrics); ok { + structuredLyrics, err = batchLyrics.GetLyricsForMediaFiles(r.Context(), mediaFiles) if err != nil { return nil, err } - if len(structuredLyrics) == 0 { - continue + } else { + for i := range mediaFiles { + structuredLyrics, err = api.lyrics.GetLyrics(r.Context(), &mediaFiles[i]) + if err != nil { + return nil, err + } + if len(structuredLyrics) > 0 { + break + } } - - lyricsResponse.Artist = artist - lyricsResponse.Title = title - - var lyricsText strings.Builder - for _, line := range structuredLyrics[0].Line { - lyricsText.WriteString(line.Value + "\n") - } - lyricsResponse.Value = lyricsText.String() - break } + if len(structuredLyrics) == 0 { + return response, nil + } + + lyricsResponse.Artist = artist + lyricsResponse.Title = title + + var lyricsText strings.Builder + for _, line := range structuredLyrics[0].Line { + lyricsText.WriteString(line.Value + "\n") + } + 
lyricsResponse.Value = lyricsText.String() + return response, nil } diff --git a/server/subsonic/media_retrieval_test.go b/server/subsonic/media_retrieval_test.go index faa90e375..d02d5b9bd 100644 --- a/server/subsonic/media_retrieval_test.go +++ b/server/subsonic/media_retrieval_test.go @@ -187,18 +187,22 @@ var _ = Describe("MediaRetrievalController", func() { Expect(response.Lyrics.Value).To(Equal("We're no strangers to love\nYou know the rules and so do I\n")) }) - It("should continue searching candidates for sidecar lyrics", func() { + It("should prefer higher-priority sidecar lyrics across duplicate candidates", func() { conf.Server.LyricsPriority = ".ttml,embedded" r := newGetRequest("artist=Rick+Astley", "title=Never+Gonna+Give+You+Up") baseTime := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC) + embedded, err := model.ToLyrics("eng", "Newest duplicate embedded lyrics") + Expect(err).ToNot(HaveOccurred()) + embeddedJSON, err := json.Marshal(model.LyricList{*embedded}) + Expect(err).ToNot(HaveOccurred()) mockRepo.SetData(model.MediaFiles{ { ID: "1", Path: "tests/fixtures/01 Invisible (RED) Edit Version.mp3", Artist: "Rick Astley", Title: "Never Gonna Give You Up", - Lyrics: "[]", - UpdatedAt: baseTime.Add(2 * time.Hour), // Newer, but no TTML sidecar + Lyrics: string(embeddedJSON), + UpdatedAt: baseTime.Add(2 * time.Hour), // Newer duplicate with embedded lyrics only }, { ID: "2", @@ -215,6 +219,7 @@ var _ = Describe("MediaRetrievalController", func() { Expect(response.Lyrics.Artist).To(Equal("Rick Astley")) Expect(response.Lyrics.Title).To(Equal("Never Gonna Give You Up")) Expect(response.Lyrics.Value).To(Equal("We're no strangers to love\nYou know the rules and so do I\n")) + Expect(mockRepo.Options.Max).To(Equal(maxLegacyLyricsCandidates)) }) }) diff --git a/ui/src/audioplayer/KaraokeLyricsOverlay.jsx b/ui/src/audioplayer/KaraokeLyricsOverlay.jsx index 799f8bdc2..c016df9e5 100644 --- a/ui/src/audioplayer/KaraokeLyricsOverlay.jsx +++ 
b/ui/src/audioplayer/KaraokeLyricsOverlay.jsx @@ -48,6 +48,7 @@ const KARAOKE_MAX_LINE_HEIGHT = 2.2 const KARAOKE_LINE_HEIGHT_STEP = 0.02 const KARAOKE_GROUP_SPACING_BASE_PX = 14 const KARAOKE_AUX_LINE_HEIGHT = 1.2 +const KARAOKE_MAIN_INACTIVE_FONT_FACTOR = 0.8 const TOKEN_DONE_ALPHA = 1 const TOKEN_FUTURE_ALPHA = 0.34 @@ -160,6 +161,21 @@ const useStyles = makeStyles((theme) => ({ maxHeight: '65vh', }, }, + overlayInline: { + position: 'absolute', + inset: 0, + width: '100%', + height: '100%', + minHeight: 0, + maxHeight: '100%', + transform: 'none', + borderRadius: 'inherit', + border: 'none', + boxShadow: 'none', + background: 'rgba(6, 8, 12, 0.92)', + backdropFilter: 'blur(12px)', + zIndex: 1, + }, resizeHandle: { height: 14, cursor: 'ns-resize', @@ -187,6 +203,10 @@ const useStyles = makeStyles((theme) => ({ gap: theme.spacing(1), padding: theme.spacing(0.3, 1.3, 0.4, 1.3), }, + headerInline: { + padding: theme.spacing(0.25, 0.65, 0.35, 0.65), + gap: theme.spacing(0.65), + }, headerLeft: { display: 'flex', alignItems: 'center', @@ -264,6 +284,8 @@ const useStyles = makeStyles((theme) => ({ }, inlineTr: { margin: 0, + display: 'inline-block', + maxWidth: '100%', textAlign: 'center', fontWeight: 400, lineHeight: KARAOKE_AUX_LINE_HEIGHT, @@ -272,6 +294,14 @@ const useStyles = makeStyles((theme) => ({ }, inlinePr: { margin: 0, + display: 'inline-flex', + alignItems: 'center', + justifyContent: 'center', + flexWrap: 'wrap', + alignSelf: 'center', + width: 'fit-content', + maxWidth: '100%', + boxSizing: 'border-box', textAlign: 'center', fontWeight: 400, lineHeight: KARAOKE_AUX_LINE_HEIGHT, @@ -300,6 +330,9 @@ const useStyles = makeStyles((theme) => ({ padding: theme.spacing(0.35, 1.2, 1.2, 1.2), }, }, + bodyInline: { + padding: theme.spacing(0.25, 0.8, 0.85, 0.8), + }, lines: { display: 'flex', flexDirection: 'column', @@ -308,12 +341,14 @@ const useStyles = makeStyles((theme) => ({ }, line: { margin: 0, + display: 'inline-block', + maxWidth: '100%', fontWeight: 
600, lineHeight: 1.24, letterSpacing: '0.01em', textAlign: 'center', color: 'rgba(255, 255, 255, 0.62)', - transition: `opacity ${KARAOKE_ANIMATION_MS}ms ease-in-out, color ${KARAOKE_ANIMATION_MS}ms ease-in-out, font-size 280ms ease-in-out`, + transition: `opacity ${KARAOKE_ANIMATION_MS}ms ease-in-out, color ${KARAOKE_ANIMATION_MS}ms ease-in-out, font-size 280ms ease-in-out, max-width 280ms ease-in-out`, }, token: { display: 'inline-block', @@ -858,7 +893,8 @@ const areLineStylesEqual = (prevStyle, nextStyle) => { a.color === b.color && a.fontSize === b.fontSize && a.fontWeight === b.fontWeight && - a.lineHeight === b.lineHeight + a.lineHeight === b.lineHeight && + a.maxWidth === b.maxWidth ) } @@ -1038,6 +1074,7 @@ const KaraokeLyricsOverlay = ({ onTogglePronunciation, audioInstance, onClose, + inline = false, }) => { const classes = useStyles() const [playbackMs, setPlaybackMs] = useState(0) @@ -1397,13 +1434,18 @@ const KaraokeLyricsOverlay = ({ } const baseFontSize = lyricsSettings.main.fontSize - const fontSize = isActive ? baseFontSize : Math.round(baseFontSize * 0.8) + const fontSize = isActive + ? baseFontSize + : Math.round(baseFontSize * KARAOKE_MAIN_INACTIVE_FONT_FACTOR) return { opacity, color, fontSize, lineHeight, + maxWidth: isActive + ? '100%' + : `${Math.round(KARAOKE_MAIN_INACTIVE_FONT_FACTOR * 100)}%`, } } @@ -1448,7 +1490,9 @@ const KaraokeLyricsOverlay = ({ } } - const overlayStyle = isCompact + const overlayStyle = inline + ? undefined + : isCompact ? undefined : { height: overlayHeight, @@ -1457,17 +1501,27 @@ const KaraokeLyricsOverlay = ({ return (
    event.stopPropagation() : undefined} > -
    + {!inline && ( +
    + )} -
    +
    {languageBadges.map((badge) => ( @@ -1536,7 +1590,12 @@ const KaraokeLyricsOverlay = ({
    -
    +
    {mainLines.map((line, idx) => { diff --git a/ui/src/audioplayer/KaraokeLyricsOverlay.test.jsx b/ui/src/audioplayer/KaraokeLyricsOverlay.test.jsx index 412bc3946..2fccf693a 100644 --- a/ui/src/audioplayer/KaraokeLyricsOverlay.test.jsx +++ b/ui/src/audioplayer/KaraokeLyricsOverlay.test.jsx @@ -71,6 +71,16 @@ describe(' behavior', () => { expect(await screen.findByText('Appearance')).toBeInTheDocument() }) + it('renders inline mode without the desktop resize handle', () => { + renderOverlay({ inline: true }) + + expect(screen.getByTestId('karaoke-lyrics-overlay')).toHaveAttribute( + 'data-inline', + 'true', + ) + expect(screen.queryByTestId('lyrics-resize-handle')).not.toBeInTheDocument() + }) + it('renders the appearance popup with Main label and default line height for older settings', async () => { localStorage.setItem( 'karaoke-lyrics-settings', @@ -245,6 +255,49 @@ describe(' behavior', () => { ]) }) + it('uses cue byte offsets to preserve explicit space cues in multibyte karaoke lines', () => { + renderOverlay({ + mainLyric: { + kind: 'main', + lang: 'ko', + synced: true, + line: [{ start: 0, end: 900, value: '눈을 뜬 순간' }], + cueLine: [ + { + index: 0, + start: 0, + end: 900, + value: '눈을 뜬 순간', + cue: [ + { start: 0, end: 150, value: '눈을', byteStart: 0, byteEnd: 5 }, + { start: 150, end: 250, value: ' ', byteStart: 6, byteEnd: 6 }, + { start: 250, end: 450, value: '뜬', byteStart: 7, byteEnd: 9 }, + { start: 450, end: 550, value: ' ', byteStart: 10, byteEnd: 10 }, + { start: 550, end: 900, value: '순간', byteStart: 11, byteEnd: 16 }, + ], + }, + ], + }, + translationLyric: null, + pronunciationLyric: null, + showTranslation: false, + showPronunciation: false, + translationEnabled: false, + pronunciationEnabled: false, + audioInstance: { + ...audioInstance, + currentTime: 0.3, + }, + }) + + const mainLine = screen.getByText('눈을').parentElement + const segments = Array.from(mainLine.querySelectorAll('span')).map( + (span) => span.textContent, + ) + + 
expect(segments).toEqual(['눈을', ' ', '뜬', ' ', '순간']) + }) + it('highlights line-timed pronunciation and translation rows with the active main line', () => { renderOverlay({ mainLyric: { @@ -295,6 +348,55 @@ describe(' behavior', () => { ) }) + it('pre-wraps inactive main lines so the active line keeps the same wrap shape', () => { + renderOverlay({ + mainLyric: { + kind: 'main', + lang: 'en', + synced: true, + line: [ + { start: 1000, end: 1800, value: 'First line that is getting focus' }, + { start: 2500, end: 3300, value: 'Second line waiting below' }, + ], + }, + translationLyric: null, + pronunciationLyric: null, + showTranslation: false, + showPronunciation: false, + translationEnabled: false, + pronunciationEnabled: false, + audioInstance: { + ...audioInstance, + currentTime: 1.2, + }, + }) + + const activeLine = screen.getByText('First line that is getting focus') + .parentElement + const inactiveLine = screen.getByText('Second line waiting below') + .parentElement + + expect(parseFloat(activeLine.style.fontSize)).toBeGreaterThan( + parseFloat(inactiveLine.style.fontSize), + ) + expect(activeLine.style.maxWidth).toBe('100%') + expect(inactiveLine.style.maxWidth).toBe('80%') + }) + + it('centers pronunciation text inside the pill container', () => { + renderOverlay({ + showTranslation: false, + showPronunciation: true, + }) + + const pronunciationLine = screen.getByText('konnichiwa').parentElement + const styles = window.getComputedStyle(pronunciationLine) + + expect(styles.display).toBe('inline-flex') + expect(styles.justifyContent).toBe('center') + expect(styles.alignItems).toBe('center') + }) + it('renders untimed text lyrics in manual reading mode without a pinned active line', () => { renderOverlay({ mainLyric: { diff --git a/ui/src/audioplayer/MobileKaraokeLyricsPortal.jsx b/ui/src/audioplayer/MobileKaraokeLyricsPortal.jsx new file mode 100644 index 000000000..636107184 --- /dev/null +++ b/ui/src/audioplayer/MobileKaraokeLyricsPortal.jsx @@ -0,0 +1,65 
@@ +import React, { useEffect, useState } from 'react' +import { createPortal } from 'react-dom' + +export const MOBILE_KARAOKE_LYRICS_HOST_SELECTOR = + '.react-jinke-music-player-mobile-cover' +export const MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS = 'nd-mobile-lyrics-active' + +const resolveMobileLyricsHost = () => { + if (typeof document === 'undefined') { + return null + } + return document.querySelector(MOBILE_KARAOKE_LYRICS_HOST_SELECTOR) +} + +const MobileKaraokeLyricsPortal = ({ active, children }) => { + const [host, setHost] = useState(() => + active ? resolveMobileLyricsHost() : null, + ) + + useEffect(() => { + if (typeof document === 'undefined') { + setHost(null) + return undefined + } + + if (!active) { + setHost(null) + return undefined + } + + const syncHost = () => { + setHost(resolveMobileLyricsHost()) + } + + syncHost() + + const observer = new MutationObserver(syncHost) + observer.observe(document.body, { + childList: true, + subtree: true, + }) + + return () => observer.disconnect() + }, [active]) + + useEffect(() => { + if (!host) { + return undefined + } + + host.classList.toggle(MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS, active) + + return () => { + host.classList.remove(MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS) + } + }, [active, host]) + + if (!active || !host) { + return null + } + + return createPortal(children, host) +} + +export default MobileKaraokeLyricsPortal diff --git a/ui/src/audioplayer/MobileKaraokeLyricsPortal.test.jsx b/ui/src/audioplayer/MobileKaraokeLyricsPortal.test.jsx new file mode 100644 index 000000000..8b237e184 --- /dev/null +++ b/ui/src/audioplayer/MobileKaraokeLyricsPortal.test.jsx @@ -0,0 +1,55 @@ +import React from 'react' +import { cleanup, render, screen, waitFor } from '@testing-library/react' +import MobileKaraokeLyricsPortal, { + MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS, +} from './MobileKaraokeLyricsPortal' + +const HOST_CLASS = 'react-jinke-music-player-mobile-cover' + +describe('', () => { + afterEach(() => { + cleanup() + 
document.body.innerHTML = '' + }) + + it('renders lyrics into the mobile cover host and toggles the active class', () => { + const host = document.createElement('div') + host.className = HOST_CLASS + document.body.appendChild(host) + + const { rerender } = render( + +
    Lyrics
    +
    , + ) + + expect(host).toContainElement(screen.getByTestId('mobile-inline-lyrics')) + expect(host).toHaveClass(MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS) + + rerender( + +
    Lyrics
    +
    , + ) + + expect(screen.queryByTestId('mobile-inline-lyrics')).not.toBeInTheDocument() + expect(host).not.toHaveClass(MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS) + }) + + it('attaches when the mobile cover host appears after mount', async () => { + render( + +
    Lyrics
    +
    , + ) + + const host = document.createElement('div') + host.className = HOST_CLASS + document.body.appendChild(host) + + await waitFor(() => + expect(host).toContainElement(screen.getByTestId('mobile-inline-lyrics')), + ) + expect(host).toHaveClass(MOBILE_KARAOKE_LYRICS_ACTIVE_CLASS) + }) +}) diff --git a/ui/src/audioplayer/Player.jsx b/ui/src/audioplayer/Player.jsx index c6e73c916..9a60655cd 100644 --- a/ui/src/audioplayer/Player.jsx +++ b/ui/src/audioplayer/Player.jsx @@ -45,6 +45,7 @@ import { togglePronunciationPreference, } from './lyricsOverlayState' import KaraokeLyricsOverlay from './KaraokeLyricsOverlay' +import MobileKaraokeLyricsPortal from './MobileKaraokeLyricsPortal' const emptyLyricLayers = { main: null, @@ -172,6 +173,7 @@ const Player = () => { hasTranslationLyric, hasPronunciationLyric, }) + const useInlineMobileLyrics = karaokeVisible && !isDesktop const applyLyricToRuntimePlayer = useCallback((trackId, lyric) => { if (!trackId) { @@ -535,10 +537,13 @@ const Player = () => { ) const onCoverClick = useCallback((mode, audioLists, audioInfo) => { + if (!isDesktop && karaokeVisible) { + return + } if (mode === 'full' && audioInfo?.song?.albumId) { window.location.href = `#/album/${audioInfo.song.albumId}/show` } - }, []) + }, [isDesktop, karaokeVisible]) const onAudioError = useCallback( (error, currentPlayId, audioLists, audioInfo) => { @@ -603,28 +608,55 @@ const Player = () => { onBeforeDestroy={onBeforeDestroy} getAudioInstance={setAudioInstance} /> - - setTranslationPreference((previous) => - hasTranslationLyric ? !previous : false, - ) - } - onTogglePronunciation={() => - setPronunciationPreference((previous) => - togglePronunciationPreference(previous, hasPronunciationLyric), - ) - } - audioInstance={audioInstance} - onClose={() => setKaraokeVisiblePreference(false)} - /> + {isDesktop && ( + + setTranslationPreference((previous) => + hasTranslationLyric ? 
!previous : false, + ) + } + onTogglePronunciation={() => + setPronunciationPreference((previous) => + togglePronunciationPreference(previous, hasPronunciationLyric), + ) + } + audioInstance={audioInstance} + onClose={() => setKaraokeVisiblePreference(false)} + /> + )} + + + setTranslationPreference((previous) => + hasTranslationLyric ? !previous : false, + ) + } + onTogglePronunciation={() => + setPronunciationPreference((previous) => + togglePronunciationPreference(previous, hasPronunciationLyric), + ) + } + audioInstance={audioInstance} + onClose={() => setKaraokeVisiblePreference(false)} + /> + ) diff --git a/ui/src/audioplayer/lyrics.js b/ui/src/audioplayer/lyrics.js index 6fa627ee5..ae49c89e5 100644 --- a/ui/src/audioplayer/lyrics.js +++ b/ui/src/audioplayer/lyrics.js @@ -86,7 +86,7 @@ const normalizeToken = (token) => { return null } const value = typeof token.value === 'string' ? token.value : '' - if (!value.trim()) { + if (value.length === 0) { return null } const byteStart = toByteOffset(token.byteStart) diff --git a/ui/src/audioplayer/lyrics.test.js b/ui/src/audioplayer/lyrics.test.js index 961fdb10b..ae5fb5a66 100644 --- a/ui/src/audioplayer/lyrics.test.js +++ b/ui/src/audioplayer/lyrics.test.js @@ -455,6 +455,43 @@ describe('lyrics helpers', () => { ]) }) + it('preserves whitespace-only cues for exact byte-range rendering', () => { + const lines = buildKaraokeLines({ + lang: 'kor', + synced: true, + line: [{ start: 0, end: 900, value: '눈을 뜬 순간' }], + cueLine: [ + { + index: 0, + start: 0, + end: 900, + value: '눈을 뜬 순간', + cue: [ + { start: 0, end: 150, value: '눈을', byteStart: 0, byteEnd: 5 }, + { start: 150, end: 250, value: ' ', byteStart: 6, byteEnd: 6 }, + { start: 250, end: 450, value: '뜬', byteStart: 7, byteEnd: 9 }, + { start: 450, end: 550, value: ' ', byteStart: 10, byteEnd: 10 }, + { start: 550, end: 900, value: '순간', byteStart: 11, byteEnd: 16 }, + ], + }, + ], + }) + + expect( + lines[0].tokens.map((token) => [ + token.value, + 
token.byteStart, + token.byteEnd, + ]), + ).toEqual([ + ['눈을', 0, 5], + [' ', 6, 6], + ['뜬', 7, 9], + [' ', 10, 10], + ['순간', 11, 16], + ]) + }) + it('maps UTF-8 byte offsets to string ranges for multibyte lyrics', () => { const text = '눈을 뜬 순간' diff --git a/ui/src/audioplayer/styles.js b/ui/src/audioplayer/styles.js index 30a14d4db..09ccb8fcf 100644 --- a/ui/src/audioplayer/styles.js +++ b/ui/src/audioplayer/styles.js @@ -62,12 +62,30 @@ const useStyle = makeStyles( // Fix cover display when image is not square aspectRatio: '1/1', display: 'flex', + position: 'relative', + }, + '& .react-jinke-music-player-mobile .react-jinke-music-player-mobile-cover.nd-mobile-lyrics-active': + { + width: 'calc(100% - 40px)', + maxWidth: 'none', + height: 'clamp(280px, 42vh, 460px)', + aspectRatio: 'auto', + borderRadius: 24, + border: '1px solid rgba(255, 255, 255, 0.1)', + boxShadow: '0 18px 40px rgba(0, 0, 0, 0.32)', + background: 'rgba(6, 8, 12, 0.82)', + cursor: 'default', }, '& .react-jinke-music-player-mobile .react-jinke-music-player-mobile-cover img.cover': { animationDuration: (props) => !props.enableCoverAnimation && '0s', objectFit: 'contain', // Fix cover display when image is not square }, + '& .react-jinke-music-player-mobile .react-jinke-music-player-mobile-cover.nd-mobile-lyrics-active img.cover': + { + opacity: 0, + pointerEvents: 'none', + }, // Hide old singer display '& .react-jinke-music-player-mobile .react-jinke-music-player-mobile-singer': { From 45fac622859848641dfb979cd76a456e989aa133 Mon Sep 17 00:00:00 2001 From: ranokay Date: Tue, 14 Apr 2026 16:17:38 +0300 Subject: [PATCH 13/14] refactor(lyrics): clean up karaoke parsing and edge cases --- core/lyrics/ttml.go | 20 ++-- core/lyrics/ttml_test.go | 30 ++++++ server/subsonic/media_retrieval_test.go | 51 ++++++++++ ui/src/audioplayer/lyrics.js | 127 ++++++++++++------------ ui/src/audioplayer/lyrics.test.js | 63 ++++++++++++ 5 files changed, 218 insertions(+), 73 deletions(-) diff --git 
a/core/lyrics/ttml.go b/core/lyrics/ttml.go index 6e4ce9da3..576d2ca3d 100644 --- a/core/lyrics/ttml.go +++ b/core/lyrics/ttml.go @@ -664,7 +664,6 @@ func (p *ttmlParser) buildMetadataLyrics(kind string, langOrder []string, entrie } func (p *ttmlParser) finalizeLyrics(lyrics model.Lyrics) model.Lyrics { - lyrics.Line = model.NormalizeCueLines(lyrics.Line) lyrics.Line, lyrics.Agents = p.resolveAgents(lyrics.Line) return model.NormalizeLyrics(lyrics) } @@ -674,14 +673,13 @@ func (p *ttmlParser) resolveAgents(lines []model.Line) ([]model.Line, []model.Ag return lines, nil } - normalized := model.NormalizeCueLines(lines) usedOrder := make([]string, 0, 4) usedSet := make(map[string]struct{}, 4) sawEmptyCue := false - for i := range normalized { - for j := range normalized[i].Cue { - agentID := strings.TrimSpace(normalized[i].Cue[j].AgentID) + for i := range lines { + for j := range lines[i].Cue { + agentID := strings.TrimSpace(lines[i].Cue[j].AgentID) if agentID == "" { sawEmptyCue = true continue @@ -694,7 +692,7 @@ func (p *ttmlParser) resolveAgents(lines []model.Line) ([]model.Line, []model.Ag } if len(usedOrder) == 0 { - return normalized, nil + return lines, nil } mainID := "" @@ -725,10 +723,10 @@ func (p *ttmlParser) resolveAgents(lines []model.Line) ([]model.Line, []model.Ag usedOrder = append([]string{mainID}, usedOrder...) 
} - for i := range normalized { - for j := range normalized[i].Cue { - if strings.TrimSpace(normalized[i].Cue[j].AgentID) == "" { - normalized[i].Cue[j].AgentID = mainID + for i := range lines { + for j := range lines[i].Cue { + if strings.TrimSpace(lines[i].Cue[j].AgentID) == "" { + lines[i].Cue[j].AgentID = mainID } } } @@ -747,7 +745,7 @@ func (p *ttmlParser) resolveAgents(lines []model.Line) ([]model.Line, []model.Ag agents = append(agents, agent) } - return normalized, agents + return lines, agents } func (p *ttmlParser) resolveCueAgentID(ctx ttmlTimingContext) string { diff --git a/core/lyrics/ttml_test.go b/core/lyrics/ttml_test.go index 5f9092e36..14676975d 100644 --- a/core/lyrics/ttml_test.go +++ b/core/lyrics/ttml_test.go @@ -215,6 +215,36 @@ var _ = Describe("parseTTML", func() { Expect(list[0].Line[1].Cue).To(HaveLen(1)) Expect(list[0].Line[1].Cue[0].AgentID).To(Equal("lead__bg")) }) + + It("should fill missing cue agent ids with the resolved main agent", func() { + content := []byte(` + + + + Guest Vocal + + + +
    +

    + Lead + Guest +

    +
    + +
    `) + + list, err := parseTTML(content) + Expect(err).ToNot(HaveOccurred()) + Expect(list).To(HaveLen(1)) + Expect(list[0].Agents).To(Equal([]model.Agent{ + {ID: "guest", Role: "main", Name: "Guest Vocal"}, + })) + Expect(list[0].Line).To(HaveLen(1)) + Expect(list[0].Line[0].Cue).To(HaveLen(2)) + Expect(list[0].Line[0].Cue[0].AgentID).To(Equal("guest")) + Expect(list[0].Line[0].Cue[1].AgentID).To(Equal("guest")) + }) }) Describe("Ambiguous decimal timing", func() { diff --git a/server/subsonic/media_retrieval_test.go b/server/subsonic/media_retrieval_test.go index d02d5b9bd..aedf08ff7 100644 --- a/server/subsonic/media_retrieval_test.go +++ b/server/subsonic/media_retrieval_test.go @@ -653,6 +653,57 @@ var _ = Describe("MediaRetrievalController", func() { }) }) + It("should keep enhanced line-level lyrics when no cue data is available", func() { + r := newGetRequest("id=1&enhanced=true") + + lineStart := int64(1000) + lineEnd := int64(3000) + lyricsJSON, err := json.Marshal(model.LyricList{ + { + Kind: "main", + Lang: "eng", + Synced: true, + Line: []model.Line{ + { + Start: &lineStart, + End: &lineEnd, + Value: "Line without word timing", + }, + }, + }, + }) + Expect(err).ToNot(HaveOccurred()) + + mockRepo.SetData(model.MediaFiles{ + { + ID: "1", + Artist: "Rick Astley", + Title: "Never Gonna Give You Up", + Lyrics: string(lyricsJSON), + }, + }) + + response, err := router.GetLyricsBySongId(r) + Expect(err).ToNot(HaveOccurred()) + compareResponses(response.LyricsList, responses.LyricsList{ + StructuredLyrics: responses.StructuredLyrics{ + { + DisplayArtist: "Rick Astley", + DisplayTitle: "Never Gonna Give You Up", + Kind: "main", + Lang: "eng", + Synced: true, + Line: []responses.Line{ + { + Start: &lineStart, + Value: "Line without word timing", + }, + }, + }, + }, + }) + }) + It("should return required cue byte offsets for ambiguous and multibyte cue lines", func() { r := newGetRequest("id=1&enhanced=true") diff --git a/ui/src/audioplayer/lyrics.js 
b/ui/src/audioplayer/lyrics.js index ae49c89e5..b44e4d9f0 100644 --- a/ui/src/audioplayer/lyrics.js +++ b/ui/src/audioplayer/lyrics.js @@ -1,6 +1,7 @@ const normalizeLanguageTag = (language) => (language || '').toLowerCase().replace('_', '-') +// Roughly one 60fps frame; keeps line/token switching stable near tight boundaries. const KARAOKE_SWITCH_EPSILON_MS = 18 const LYRIC_KIND_MAIN = 'main' const LYRIC_KIND_TRANSLATION = 'translation' @@ -379,6 +380,68 @@ export const structuredLyricsToLrc = (structuredLyrics, preferredLanguage) => { return structuredLyricToLrc(selected) } +const buildBaseKaraokeLines = (baseLines) => + baseLines.map((line, index) => ({ + index, + start: toTime(line.start), + end: toTime(line.end), + value: typeof line.value === 'string' ? line.value : '', + tokens: [], + })) + +export const buildKaraokeLinesFromCueLines = ( + rawCueLines, + baseLines, + agentLookup, +) => { + const normalizedCueLines = rawCueLines.map((cueLine, fallbackIndex) => { + const normalized = normalizeCueLine(cueLine, fallbackIndex, agentLookup) + return { + ...normalized, + tokens: normalized.tokens.map((token) => ({ + ...token, + role: normalized.role, + agentId: normalized.agentId, + agentName: normalized.agentName, + agentRole: normalized.agentRole, + })), + } + }) + + const byIndex = new Map() + for (const cueLine of normalizedCueLines) { + if (!byIndex.has(cueLine.index)) { + byIndex.set(cueLine.index, []) + } + byIndex.get(cueLine.index).push(cueLine) + } + + return Array.from(byIndex.entries()).map(([index, group]) => { + const first = group[0] + const baseLine = baseLines[index] || {} + const tokens = sortTokensByStart(group.flatMap((cueLine) => cueLine.tokens)) + const fallbackStart = + tokens.find((token) => token.start != null)?.start ?? null + const fallbackEnd = + [...tokens].reverse().find((token) => token.end != null)?.end ?? null + const value = + first.value || + (typeof baseLine.value === 'string' ? 
baseLine.value : '') || + tokens.map((token) => token.value).join('') + + return { + index, + start: first.start ?? toTime(baseLine.start) ?? fallbackStart, + end: first.end ?? toTime(baseLine.end) ?? fallbackEnd, + value, + agentId: first.agentId, + agentName: first.agentName, + agentRole: first.agentRole, + tokens, + } + }) +} + export const buildKaraokeLines = (structuredLyric) => { if (!structuredLyric) { return [] @@ -394,68 +457,8 @@ export const buildKaraokeLines = (structuredLyric) => { const lines = rawCueLines.length > 0 - ? (() => { - const normalizedCueLines = rawCueLines.map( - (cueLine, fallbackIndex) => { - const normalized = normalizeCueLine( - cueLine, - fallbackIndex, - agentLookup, - ) - return { - ...normalized, - tokens: normalized.tokens.map((token) => ({ - ...token, - role: normalized.role, - agentId: normalized.agentId, - agentName: normalized.agentName, - agentRole: normalized.agentRole, - })), - } - }, - ) - - const byIndex = new Map() - for (const cl of normalizedCueLines) { - if (!byIndex.has(cl.index)) { - byIndex.set(cl.index, []) - } - byIndex.get(cl.index).push(cl) - } - - return Array.from(byIndex.entries()).map(([index, group]) => { - const first = group[0] - const baseLine = baseLines[index] || {} - const tokens = sortTokensByStart(group.flatMap((cl) => cl.tokens)) - const fallbackStart = - tokens.find((token) => token.start != null)?.start ?? null - const fallbackEnd = - [...tokens].reverse().find((token) => token.end != null)?.end ?? - null - const value = - first.value || - (typeof baseLine.value === 'string' ? baseLine.value : '') || - tokens.map((token) => token.value).join('') - - return { - index, - start: first.start ?? toTime(baseLine.start) ?? fallbackStart, - end: first.end ?? toTime(baseLine.end) ?? 
fallbackEnd, - value, - agentId: first.agentId, - agentName: first.agentName, - agentRole: first.agentRole, - tokens, - } - }) - })() - : baseLines.map((line, index) => ({ - index, - start: toTime(line.start), - end: toTime(line.end), - value: typeof line.value === 'string' ? line.value : '', - tokens: [], - })) + ? buildKaraokeLinesFromCueLines(rawCueLines, baseLines, agentLookup) + : buildBaseKaraokeLines(baseLines) const normalized = lines .filter((line) => line.value || line.tokens.length > 0) diff --git a/ui/src/audioplayer/lyrics.test.js b/ui/src/audioplayer/lyrics.test.js index ae5fb5a66..1abea57a5 100644 --- a/ui/src/audioplayer/lyrics.test.js +++ b/ui/src/audioplayer/lyrics.test.js @@ -2,6 +2,7 @@ import { buildHighlightedAuxLine, buildHighlightedMainLine, buildKaraokeLines, + buildKaraokeLinesFromCueLines, findLayerLineIndexForMain, getActiveKaraokeState, getPreferredLyricLanguage, @@ -414,6 +415,68 @@ describe('lyrics helpers', () => { ]) }) + it('builds grouped karaoke lines directly from cue lines', () => { + const agentLookup = new Map([ + ['lead', { id: 'lead', role: 'main', name: 'Lead Vocal' }], + ['backing', { id: 'backing', role: 'bg', name: '' }], + ]) + + const lines = buildKaraokeLinesFromCueLines( + [ + { + index: 0, + start: 1000, + end: 3000, + value: 'Hello world', + agentId: 'lead', + cue: [{ start: 1000, end: 1500, value: 'Hello' }], + }, + { + index: 0, + start: 1000, + end: 3000, + value: 'Hello world', + agentId: 'backing', + cue: [{ start: 2000, end: 2500, value: 'world' }], + }, + ], + [{ start: 1000, end: 3000, value: 'Hello world' }], + agentLookup, + ) + + expect(lines).toEqual([ + { + agentId: 'lead', + agentName: 'Lead Vocal', + agentRole: 'main', + index: 0, + start: 1000, + end: 3000, + value: 'Hello world', + tokens: [ + { + start: 1000, + end: 1500, + value: 'Hello', + role: '', + agentId: 'lead', + agentName: 'Lead Vocal', + agentRole: 'main', + }, + { + start: 2000, + end: 2500, + value: 'world', + role: 'bg', + agentId: 
'backing', + agentName: '', + agentRole: 'bg', + }, + ], + }, + ]) + }) + it('preserves cue byte offsets on karaoke tokens', () => { const lines = buildKaraokeLines({ lang: 'eng', From 7c6ecd0cf683d279201e79fd9e502c139bb78a17 Mon Sep 17 00:00:00 2001 From: ranokay Date: Tue, 14 Apr 2026 23:17:31 +0300 Subject: [PATCH 14/14] refine karaoke lyrics overlay UI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove glow effect, keep gradient text highlight with soft wipe edge - Theme-adaptive: use theme.palette.background.default (85% opacity) for overlay bg - Light theme support: dark text, borders, badges, settings panel colors - Auto-switch main lyrics color (white↔black) on theme change - Add black color preset, default for light themes - Merge TR/PR toggle buttons into language badges (clickable, with tooltips) - Fade edges via CSS mask-image (theme-independent, no pseudo-elements) - Rising effect (font-size scaling) for TR/PR lines matching main - Smooth scroll: custom rAF ease-out cubic animation (400ms) - Mobile: full-width panel, backdrop blur, transparent background - Timing: KARAOKE_RENDER_LEAD_MS=80, KARAOKE_SWITCH_EPSILON_MS=50 - Hide TR/PR badges when no data available - Badge/pill vertical centering with lineHeight:1 - Remove unused Button import, layerControls/layerToggle styles --- ui/src/audioplayer/KaraokeLyricsOverlay.jsx | 338 ++++++++++++-------- ui/src/audioplayer/lyrics.js | 2 +- ui/src/audioplayer/styles.js | 10 +- 3 files changed, 207 insertions(+), 143 deletions(-) diff --git a/ui/src/audioplayer/KaraokeLyricsOverlay.jsx b/ui/src/audioplayer/KaraokeLyricsOverlay.jsx index c016df9e5..aefb0127e 100644 --- a/ui/src/audioplayer/KaraokeLyricsOverlay.jsx +++ b/ui/src/audioplayer/KaraokeLyricsOverlay.jsx @@ -1,8 +1,7 @@ -import Button from '@material-ui/core/Button' import IconButton from '@material-ui/core/IconButton' import Popover from '@material-ui/core/Popover' import Slider from 
'@material-ui/core/Slider' -import { makeStyles } from '@material-ui/core/styles' +import { makeStyles, useTheme } from '@material-ui/core/styles' import Tooltip from '@material-ui/core/Tooltip' import Typography from '@material-ui/core/Typography' import CloseIcon from '@material-ui/icons/Close' @@ -29,7 +28,7 @@ import { utf8ByteRangeToCodeUnitRange, } from './lyrics' -const KARAOKE_RENDER_LEAD_MS = 24 +const KARAOKE_RENDER_LEAD_MS = 80 const KARAOKE_CLOCK_DRIFT_RESET_MS = 140 const KARAOKE_CLOCK_RESET_THRESHOLD_MS = 320 const KARAOKE_MONOTONIC_JITTER_MS = 60 @@ -49,15 +48,17 @@ const KARAOKE_LINE_HEIGHT_STEP = 0.02 const KARAOKE_GROUP_SPACING_BASE_PX = 14 const KARAOKE_AUX_LINE_HEIGHT = 1.2 const KARAOKE_MAIN_INACTIVE_FONT_FACTOR = 0.8 +const KARAOKE_AUX_INACTIVE_FONT_FACTOR = 0.88 const TOKEN_DONE_ALPHA = 1 const TOKEN_FUTURE_ALPHA = 0.34 const TOKEN_ACTIVE_ALPHA = 1 +const TOKEN_WIPE_SOFT_SPREAD_PCT = 12 const TOKEN_WIPE_EDGE_PCT = 8 -const TOKEN_WIPE_GLOW_PCT = 16 const COLOR_PRESETS = [ { key: 'white', label: 'White', value: 'rgba(255, 255, 255, 0.92)' }, + { key: 'black', label: 'Black', value: 'rgba(0, 0, 0, 0.87)' }, { key: 'blue', label: 'Blue', value: 'rgba(120, 160, 220, 0.75)' }, { key: 'green', label: 'Green', value: 'rgba(100, 200, 130, 0.7)' }, { key: 'pink', label: 'Pink', value: 'rgba(240, 140, 170, 0.75)' }, @@ -77,11 +78,11 @@ const DEFAULT_LYRICS_SETTINGS = { const SETTINGS_STORAGE_KEY = 'karaoke-lyrics-settings' -const createDefaultLyricsSettings = () => ({ +const createDefaultLyricsSettings = (isDark = true) => ({ lineHeight: KARAOKE_DEFAULT_LINE_HEIGHT, overlayHeight: KARAOKE_DEFAULT_HEIGHT_PX, tr: { ...DEFAULT_LYRICS_SETTINGS.tr }, - main: { ...DEFAULT_LYRICS_SETTINGS.main }, + main: { ...DEFAULT_LYRICS_SETTINGS.main, colorKey: isDark ? 
'white' : 'black' }, pr: { ...DEFAULT_LYRICS_SETTINGS.pr }, }) @@ -135,7 +136,30 @@ const saveLyricsSettings = (settings) => { const getColorValue = (colorKey) => COLOR_PRESETS.find((c) => c.key === colorKey)?.value || COLOR_PRESETS[0].value -const useStyles = makeStyles((theme) => ({ +const hexToRgba = (hex, alpha) => { + const m = (hex || '').match(/#([0-9a-f]{2})([0-9a-f]{2})([0-9a-f]{2})/i) + if (m) return `rgba(${parseInt(m[1], 16)}, ${parseInt(m[2], 16)}, ${parseInt(m[3], 16)}, ${alpha})` + const rm = (hex || '').match(/rgba?\((\d+),\s*(\d+),\s*(\d+)/) + if (rm) return `rgba(${rm[1]}, ${rm[2]}, ${rm[3]}, ${alpha})` + return `rgba(48, 48, 48, ${alpha})` +} + +const useStyles = makeStyles((theme) => { + const isDark = theme.palette.type === 'dark' + const overlayBg = hexToRgba(theme.palette.background.default, 0.85) + const primaryMain = theme.palette.primary.main + const primaryRgb = (() => { + const m = (primaryMain || '').match(/#([0-9a-f]{2})([0-9a-f]{2})([0-9a-f]{2})/i) + if (m) return [parseInt(m[1], 16), parseInt(m[2], 16), parseInt(m[3], 16)] + const rm = (primaryMain || '').match(/rgba?\((\d+),\s*(\d+),\s*(\d+)/) + if (rm) return [parseInt(rm[1]), parseInt(rm[2]), parseInt(rm[3])] + return [144, 202, 249] + })() + const textPrimary = isDark ? 'rgba(255, 255, 255, 0.92)' : 'rgba(0, 0, 0, 0.87)' + const textSecondary = isDark ? 'rgba(255, 255, 255, 0.55)' : 'rgba(0, 0, 0, 0.54)' + const borderSubtle = isDark ? 
'rgba(255, 255, 255, 0.12)' : 'rgba(0, 0, 0, 0.12)' + + return ({ overlay: { position: 'fixed', left: '50%', @@ -144,19 +168,19 @@ const useStyles = makeStyles((theme) => ({ zIndex: 1400, width: 'min(1000px, calc(100vw - 32px))', minHeight: KARAOKE_MIN_HEIGHT_PX, - background: 'rgba(6, 8, 12, 0.9)', + background: overlayBg, borderRadius: 12, - border: '1px solid rgba(255, 255, 255, 0.12)', + border: `1px solid ${borderSubtle}`, boxShadow: '0 18px 48px rgba(0, 0, 0, 0.42)', - backdropFilter: 'blur(10px)', - color: theme.palette.common.white, + backdropFilter: 'blur(20px)', + color: textPrimary, display: 'flex', flexDirection: 'column', overflow: 'hidden', '@media (max-width:810px)': { bottom: 78, width: 'calc(100vw - 12px)', - borderRadius: 8, + borderRadius: 12, minHeight: 180, maxHeight: '65vh', }, @@ -172,8 +196,9 @@ const useStyles = makeStyles((theme) => ({ borderRadius: 'inherit', border: 'none', boxShadow: 'none', - background: 'rgba(6, 8, 12, 0.92)', - backdropFilter: 'blur(12px)', + background: 'transparent', + backdropFilter: 'blur(16px)', + WebkitBackdropFilter: 'blur(16px)', zIndex: 1, }, resizeHandle: { @@ -190,7 +215,7 @@ const useStyles = makeStyles((theme) => ({ width: 56, height: 3, borderRadius: 999, - background: 'rgba(255, 255, 255, 0.22)', + background: `rgba(${primaryRgb.join(', ')}, 0.22)`, }, '@media (max-width:810px)': { display: 'none', @@ -223,20 +248,31 @@ const useStyles = makeStyles((theme) => ({ languageBadge: { display: 'inline-flex', alignItems: 'center', + justifyContent: 'center', gap: theme.spacing(0.35), padding: theme.spacing(0.2, 0.7), borderRadius: 999, - border: '1px solid rgba(148, 163, 184, 0.28)', - background: 'rgba(15, 23, 42, 0.42)', - color: 'rgba(226, 232, 240, 0.8)', + border: `1px solid ${borderSubtle}`, + background: isDark ? 'rgba(15, 23, 42, 0.42)' : 'rgba(0, 0, 0, 0.06)', + color: isDark ? 
'rgba(226, 232, 240, 0.8)' : 'rgba(0, 0, 0, 0.6)', fontSize: 10, + lineHeight: 1, letterSpacing: '0.04em', whiteSpace: 'nowrap', + transition: `all ${KARAOKE_ANIMATION_MS}ms ease-in-out`, + userSelect: 'none', + }, + languageBadgeToggle: { + cursor: 'pointer', + '&:hover': { + borderColor: `rgba(${primaryRgb.join(', ')}, 0.35)`, + background: isDark ? 'rgba(15, 23, 42, 0.56)' : 'rgba(0, 0, 0, 0.1)', + }, }, languageBadgeActive: { - borderColor: 'rgba(148, 163, 184, 0.46)', - background: 'rgba(30, 41, 59, 0.56)', - color: 'rgba(248, 250, 252, 0.94)', + borderColor: `rgba(${primaryRgb.join(', ')}, 0.46)`, + background: `rgba(${primaryRgb.join(', ')}, 0.18)`, + color: isDark ? 'rgba(248, 250, 252, 0.94)' : 'rgba(0, 0, 0, 0.87)', }, languageBadgeLabel: { fontWeight: 700, @@ -246,35 +282,8 @@ const useStyles = makeStyles((theme) => ({ languageBadgeValue: { opacity: 0.9, }, - layerControls: { - display: 'flex', - alignItems: 'center', - gap: theme.spacing(0.5), - }, - layerToggle: { - minWidth: 34, - minHeight: 24, - padding: theme.spacing(0, 0.8), - fontSize: 10, - letterSpacing: '0.08em', - borderRadius: 999, - color: 'rgba(203, 213, 225, 0.95)', - background: 'rgba(100, 116, 139, 0.26)', - border: '1px solid rgba(148, 163, 184, 0.45)', - transition: `all ${KARAOKE_ANIMATION_MS}ms ease-in-out`, - '&.Mui-disabled': { - color: 'rgba(148, 163, 184, 0.45)', - borderColor: 'rgba(100, 116, 139, 0.3)', - background: 'rgba(71, 85, 105, 0.2)', - }, - }, - layerToggleActive: { - color: 'rgba(220, 252, 231, 0.98)', - borderColor: 'rgba(34, 197, 94, 0.96)', - background: 'rgba(34, 197, 94, 0.28)', - }, closeButton: { - color: 'rgba(255, 255, 255, 0.72)', + color: textSecondary, }, lineGroup: { display: 'flex', @@ -290,7 +299,7 @@ const useStyles = makeStyles((theme) => ({ fontWeight: 400, lineHeight: KARAOKE_AUX_LINE_HEIGHT, letterSpacing: '0.01em', - transition: `opacity ${KARAOKE_ANIMATION_MS}ms ease-in-out, font-size ${KARAOKE_ANIMATION_MS}ms ease-in-out`, + transition: 
`opacity ${KARAOKE_ANIMATION_MS}ms ease-in-out, color ${KARAOKE_ANIMATION_MS}ms ease-in-out, font-size 280ms ease-in-out, max-width 280ms ease-in-out`, }, inlinePr: { margin: 0, @@ -304,23 +313,29 @@ const useStyles = makeStyles((theme) => ({ boxSizing: 'border-box', textAlign: 'center', fontWeight: 400, - lineHeight: KARAOKE_AUX_LINE_HEIGHT, + lineHeight: 1, letterSpacing: '0.01em', - transition: `opacity ${KARAOKE_ANIMATION_MS}ms ease-in-out, font-size ${KARAOKE_ANIMATION_MS}ms ease-in-out`, + transition: `opacity ${KARAOKE_ANIMATION_MS}ms ease-in-out, color ${KARAOKE_ANIMATION_MS}ms ease-in-out, font-size 280ms ease-in-out, max-width 280ms ease-in-out`, padding: theme.spacing(0.15, 0.9), borderRadius: 999, - background: 'rgba(255, 255, 255, 0.08)', - border: '1px solid rgba(255, 255, 255, 0.12)', + background: isDark ? 'rgba(255, 255, 255, 0.08)' : 'rgba(0, 0, 0, 0.05)', + border: `1px solid ${borderSubtle}`, + }, + bodyWrapper: { + position: 'relative', + flex: 1, + overflow: 'hidden', }, body: { padding: theme.spacing(0.5, 2, 1.4, 2), overflowY: 'auto', overflowX: 'hidden', - scrollBehavior: 'smooth', - flex: 1, + height: '100%', overscrollBehavior: 'contain', scrollbarWidth: 'none', msOverflowStyle: 'none', + maskImage: 'linear-gradient(to bottom, transparent 0%, black 8%, black 92%, transparent 100%)', + WebkitMaskImage: 'linear-gradient(to bottom, transparent 0%, black 8%, black 92%, transparent 100%)', '&::-webkit-scrollbar': { display: 'none', width: 0, @@ -347,7 +362,7 @@ const useStyles = makeStyles((theme) => ({ lineHeight: 1.24, letterSpacing: '0.01em', textAlign: 'center', - color: 'rgba(255, 255, 255, 0.62)', + color: isDark ? 
'rgba(255, 255, 255, 0.62)' : 'rgba(0, 0, 0, 0.52)', transition: `opacity ${KARAOKE_ANIMATION_MS}ms ease-in-out, color ${KARAOKE_ANIMATION_MS}ms ease-in-out, font-size 280ms ease-in-out, max-width 280ms ease-in-out`, }, token: { @@ -356,15 +371,15 @@ const useStyles = makeStyles((theme) => ({ transition: `color ${KARAOKE_ANIMATION_MS}ms ease-in-out, text-shadow ${KARAOKE_ANIMATION_MS}ms ease-in-out`, }, settingsButton: { - color: 'rgba(255, 255, 255, 0.55)', + color: textSecondary, padding: 4, '&:hover': { - color: 'rgba(255, 255, 255, 0.85)', + color: textPrimary, }, }, settingsPanel: { - background: 'rgba(12, 14, 20, 0.96)', - border: '1px solid rgba(255, 255, 255, 0.12)', + background: isDark ? 'rgba(12, 14, 20, 0.96)' : 'rgba(255, 255, 255, 0.96)', + border: `1px solid ${borderSubtle}`, borderRadius: 10, padding: theme.spacing(1.5, 2), width: 278, @@ -388,14 +403,14 @@ const useStyles = makeStyles((theme) => ({ fontWeight: 700, letterSpacing: '0.08em', textTransform: 'uppercase', - color: 'rgba(255, 255, 255, 0.78)', + color: isDark ? 'rgba(255, 255, 255, 0.78)' : 'rgba(0, 0, 0, 0.72)', }, settingsLabel: { fontSize: 10, fontWeight: 600, letterSpacing: '0.1em', textTransform: 'uppercase', - color: 'rgba(255, 255, 255, 0.55)', + color: isDark ? 'rgba(255, 255, 255, 0.55)' : 'rgba(0, 0, 0, 0.5)', marginBottom: 4, }, settingsRow: { @@ -405,7 +420,7 @@ const useStyles = makeStyles((theme) => ({ }, settingsSlider: { flex: 1, - color: 'rgba(255, 255, 255, 0.6)', + color: `rgba(${primaryRgb.join(', ')}, 0.6)`, '& .MuiSlider-thumb': { width: 12, height: 12, @@ -416,7 +431,7 @@ const useStyles = makeStyles((theme) => ({ }, settingsSliderValue: { fontSize: 11, - color: 'rgba(255, 255, 255, 0.5)', + color: isDark ? 
'rgba(255, 255, 255, 0.5)' : 'rgba(0, 0, 0, 0.45)', minWidth: 22, textAlign: 'right', }, @@ -424,15 +439,15 @@ const useStyles = makeStyles((theme) => ({ fontSize: 10, letterSpacing: '0.06em', textTransform: 'uppercase', - color: 'rgba(255, 255, 255, 0.45)', + color: isDark ? 'rgba(255, 255, 255, 0.45)' : 'rgba(0, 0, 0, 0.42)', minWidth: 72, whiteSpace: 'nowrap', }, resetButton: { - color: 'rgba(255, 255, 255, 0.58)', + color: textSecondary, padding: 4, '&:hover': { - color: 'rgba(255, 255, 255, 0.9)', + color: textPrimary, }, }, colorDots: { @@ -452,9 +467,9 @@ const useStyles = makeStyles((theme) => ({ }, }, colorDotActive: { - borderColor: 'rgba(255, 255, 255, 0.85)', + borderColor: isDark ? 'rgba(255, 255, 255, 0.85)' : 'rgba(0, 0, 0, 0.7)', }, -})) +})}) const clamp = (v, min, max) => Math.max(min, Math.min(max, v)) const lerp = (from, to, t) => from + (to - from) * t @@ -479,6 +494,8 @@ const buildLanguageBadges = ({ pronunciationLyric, showTranslation, showPronunciation, + translationEnabled, + pronunciationEnabled, }) => [ { @@ -486,20 +503,25 @@ const buildLanguageBadges = ({ label: 'Main', lang: mainLyric?.lang, active: true, + toggleable: false, }, - { + pronunciationEnabled && { key: 'pr', label: 'PR', lang: pronunciationLyric?.lang, active: showPronunciation, + toggleable: true, + tooltip: showPronunciation ? 'Hide pronunciation' : 'Show pronunciation', }, - { + translationEnabled && { key: 'tr', label: 'TR', lang: translationLyric?.lang, active: showTranslation, + toggleable: true, + tooltip: showTranslation ? 
'Hide translation' : 'Show translation', }, - ].filter((badge) => badge.lang) + ].filter((badge) => badge && badge.lang) const SettingsSection = ({ label, layer, settings, onChange, classes }) => { const s = settings[layer] @@ -913,22 +935,20 @@ const buildTokenWipeStyle = ({ const fillPct = clamp(fillProgress, 0, 1) * 100 const doneColor = `rgba(${r}, ${g}, ${b}, ${clamp(highlightAlpha, TOKEN_DONE_ALPHA, TOKEN_ACTIVE_ALPHA)})` const futureColor = `rgba(${r}, ${g}, ${b}, ${futureAlpha})` - const activeShadow = `0 0 8px rgba(${r}, ${g}, ${b}, 0.34)` if (fillPct <= 0) { return { color: futureColor, textShadow: 'none' } } const edgeStart = clamp(fillPct - TOKEN_WIPE_EDGE_PCT, 0, 100) - const glowStop = clamp(fillPct + TOKEN_WIPE_GLOW_PCT, 0, 100) - const glowColor = `rgba(${r}, ${g}, ${b}, ${clamp(highlightAlpha + 0.18, TOKEN_DONE_ALPHA, TOKEN_ACTIVE_ALPHA)})` + const softEnd = clamp(fillPct + TOKEN_WIPE_SOFT_SPREAD_PCT, 0, 100) return { color: 'transparent', WebkitTextFillColor: 'transparent', - backgroundImage: `linear-gradient(90deg, ${doneColor} 0%, ${doneColor} ${edgeStart}%, ${glowColor} ${fillPct}%, ${futureColor} ${glowStop}%, ${futureColor} 100%)`, + backgroundImage: `linear-gradient(90deg, ${doneColor} 0%, ${doneColor} ${edgeStart}%, ${doneColor} ${fillPct}%, ${futureColor} ${softEnd}%, ${futureColor} 100%)`, backgroundClip: 'text', WebkitBackgroundClip: 'text', - textShadow: activeShadow, + textShadow: 'none', } } @@ -1077,6 +1097,8 @@ const KaraokeLyricsOverlay = ({ inline = false, }) => { const classes = useStyles() + const theme = useTheme() + const isDark = theme.palette.type === 'dark' const [playbackMs, setPlaybackMs] = useState(0) const [maxHeightPx, setMaxHeightPx] = useState(getMaxHeightPx()) const [bodyViewportHeight, setBodyViewportHeight] = useState(0) @@ -1092,10 +1114,10 @@ const KaraokeLyricsOverlay = ({ }, []) const handleResetAppearance = useCallback(() => { - const defaults = createDefaultLyricsSettings() + const defaults = 
createDefaultLyricsSettings(isDark) setLyricsSettings(defaults) saveLyricsSettings(defaults) - }, []) + }, [isDark]) const bodyRef = useRef(null) const activeLineRef = useRef(null) @@ -1127,6 +1149,23 @@ const KaraokeLyricsOverlay = ({ return () => window.removeEventListener('resize', onResize) }, []) + useEffect(() => { + setLyricsSettings((prev) => { + const currentColor = prev.main.colorKey + const shouldSwap = + (isDark && currentColor === 'black') || + (!isDark && currentColor === 'white') + if (!shouldSwap) return prev + const newColorKey = isDark ? 'white' : 'black' + const updated = { + ...prev, + main: { ...prev.main, colorKey: newColorKey }, + } + saveLyricsSettings(updated) + return updated + }) + }, [isDark]) + useEffect(() => { const body = bodyRef.current if (!body) { @@ -1308,6 +1347,8 @@ const KaraokeLyricsOverlay = ({ pronunciationLyric, showTranslation, showPronunciation, + translationEnabled, + pronunciationEnabled, }) const trByMainIndex = useMemo(() => { @@ -1354,7 +1395,10 @@ const KaraokeLyricsOverlay = ({ return } - const rafId = window.requestAnimationFrame(() => { + let animFrameId = null + let scrollAnimId = null + + animFrameId = window.requestAnimationFrame(() => { const body = bodyRef.current const activeNode = activeLineRef.current if (!body || !activeNode) { @@ -1368,23 +1412,36 @@ const KaraokeLyricsOverlay = ({ bodyRect.top - (body.clientHeight - activeRect.height) / 2 const maxTop = Math.max(0, body.scrollHeight - body.clientHeight) - const centeredTop = clamp(body.scrollTop + deltaWithinBody, 0, maxTop) + const targetTop = clamp(body.scrollTop + deltaWithinBody, 0, maxTop) + const distance = targetTop - body.scrollTop - if (Math.abs(body.scrollTop - centeredTop) < 2) { + if (Math.abs(distance) < 2) { return } - if (typeof body.scrollTo === 'function') { - body.scrollTo({ - top: centeredTop, - behavior: 'smooth', - }) - } else { - body.scrollTop = centeredTop + const startTop = body.scrollTop + const duration = 400 + const 
startTime = performance.now() + + const easeOutCubic = (t) => 1 - Math.pow(1 - t, 3) + + const step = (now) => { + const elapsed = now - startTime + const progress = Math.min(elapsed / duration, 1) + const eased = easeOutCubic(progress) + body.scrollTop = startTop + distance * eased + if (progress < 1) { + scrollAnimId = window.requestAnimationFrame(step) + } } + + scrollAnimId = window.requestAnimationFrame(step) }) - return () => window.cancelAnimationFrame(rafId) + return () => { + if (animFrameId) window.cancelAnimationFrame(animFrameId) + if (scrollAnimId) window.cancelAnimationFrame(scrollAnimId) + } }, [ centerSpacerPx, hasTimedMainLines, @@ -1453,10 +1510,11 @@ const KaraokeLyricsOverlay = ({ const [r, g, b] = parseColorRGB( getColorValue(lyricsSettings[layerKey].colorKey), ) + const baseFontSize = lyricsSettings[layerKey].fontSize if (!hasTimedMainLines) { return { opacity: 0.94, - fontSize: lyricsSettings[layerKey].fontSize, + fontSize: baseFontSize, color: `rgba(${r}, ${g}, ${b}, 0.94)`, lineHeight: KARAOKE_AUX_LINE_HEIGHT, } @@ -1482,11 +1540,18 @@ const KaraokeLyricsOverlay = ({ opacity = Math.max(0.22, 0.5 - level * 0.08) } + const fontSize = isActive + ? baseFontSize + : Math.round(baseFontSize * KARAOKE_AUX_INACTIVE_FONT_FACTOR) + return { opacity, - fontSize: lyricsSettings[layerKey].fontSize, + fontSize, color, lineHeight: KARAOKE_AUX_LINE_HEIGHT, + maxWidth: isActive + ? '100%' + : `${Math.round(KARAOKE_AUX_INACTIVE_FONT_FACTOR * 100)}%`, } } @@ -1524,52 +1589,49 @@ const KaraokeLyricsOverlay = ({ >
    - {languageBadges.map((badge) => ( -
    - - {badge.label} - - {badge.lang} -
    - ))} -
    -
    - - - - - - - - - - + + {badge.label} + + {badge.lang} +
    + ) + return badge.toggleable ? ( + + {badgeEl} + + ) : badgeEl + })}
    @@ -1590,6 +1652,7 @@ const KaraokeLyricsOverlay = ({
    +
    +
    ) } diff --git a/ui/src/audioplayer/lyrics.js b/ui/src/audioplayer/lyrics.js index b44e4d9f0..98c638ab3 100644 --- a/ui/src/audioplayer/lyrics.js +++ b/ui/src/audioplayer/lyrics.js @@ -2,7 +2,7 @@ const normalizeLanguageTag = (language) => (language || '').toLowerCase().replace('_', '-') // Roughly one 60fps frame; keeps line/token switching stable near tight boundaries. -const KARAOKE_SWITCH_EPSILON_MS = 18 +const KARAOKE_SWITCH_EPSILON_MS = 50 const LYRIC_KIND_MAIN = 'main' const LYRIC_KIND_TRANSLATION = 'translation' const LYRIC_KIND_PRONUNCIATION = 'pronunciation' diff --git a/ui/src/audioplayer/styles.js b/ui/src/audioplayer/styles.js index 09ccb8fcf..30ccf7afb 100644 --- a/ui/src/audioplayer/styles.js +++ b/ui/src/audioplayer/styles.js @@ -66,14 +66,14 @@ const useStyle = makeStyles( }, '& .react-jinke-music-player-mobile .react-jinke-music-player-mobile-cover.nd-mobile-lyrics-active': { - width: 'calc(100% - 40px)', + width: '100%', maxWidth: 'none', height: 'clamp(280px, 42vh, 460px)', aspectRatio: 'auto', - borderRadius: 24, - border: '1px solid rgba(255, 255, 255, 0.1)', - boxShadow: '0 18px 40px rgba(0, 0, 0, 0.32)', - background: 'rgba(6, 8, 12, 0.82)', + borderRadius: 12, + border: 'none', + boxShadow: 'none', + background: 'transparent', cursor: 'default', }, '& .react-jinke-music-player-mobile .react-jinke-music-player-mobile-cover img.cover':