feat: add TTML lyrics support with token-level karaoke and translation/pronunciation layers

Add a full TTML (Timed Text Markup Language) sidecar lyrics parser that extracts
word/syllable-level timing from <span> elements, plus translation and pronunciation
(transliteration) tracks from Apple Music TTML metadata sections.

Backend changes:
- TTML parser (core/lyrics/ttml.go) with support for all TTML time formats,
  nested timing contexts, and bare decimal second offsets
- Translation/pronunciation tracks resolved via key-based metadata linking
- Line timing hydration from token-level start/end values
- 'kind' field added to Lyrics model and StructuredLyric API response
  (main/translation/pronunciation)
- 'tokenLine' array in API response for word-level timing data
- UTF-8 BOM and UTF-16 LE encoding support for TTML files
- Fix for ambiguous time resolution in pronunciation spans (pre-1-minute)

Frontend changes:
- KaraokeLyricsOverlay rewritten with scrollable multi-line layout,
  word-level wipe highlighting with eased alpha transitions,
  rAF-driven playback clock with drift correction
- Inline translation (above) and pronunciation (below) rendered for each main line,
  with smart filtering to hide redundant lines (same normalized text)
- TR/PR toggle buttons and layer selection via selectLyricLayers()
- Click-to-seek: click any lyric line to jump to that position
- Customization popover with font-size sliders and color presets
  for each line type (TR/Default/PR), persisted to localStorage
- Smooth font-size transition between active and inactive lines
- Resizable overlay height via drag handle
- lyrics.js: resolveKaraokeTokenWindow, buildSyntheticWordTokens,
  findLayerLineIndexForMain, token sorting, collapsed timing detection

API extension (non-breaking, additive):
- tokenLine[].token[] provides per-word start/end timing (ms)
- tokenLine[].index maps back to the corresponding line[] entry
- kind field: 'main', 'translation', 'pronunciation'
- Clients ignoring tokenLine/kind continue to work unchanged
This commit is contained in:
ranokay 2026-02-20 16:54:45 +02:00
parent ccee33f474
commit c77e0de976
No known key found for this signature in database
30 changed files with 4644 additions and 59 deletions

View File

@ -52,6 +52,7 @@ A share of the revenue helps fund the development of Navidrome at no additional
- **Multi-platform**, runs on macOS, Linux and Windows. **Docker** images are also provided
- Ready to use binaries for all major platforms, including **Raspberry Pi**
- Automatically **monitors your library** for changes, importing new files and reloading new metadata
- Supports synchronized lyrics from sidecar **.lrc** and **.ttml** files (via `lyricspriority`)
- **Themeable**, modern and responsive **Web interface** based on [Material UI](https://material-ui.com)
- **Compatible** with all Subsonic/Madsonic/Airsonic [clients](https://www.navidrome.org/docs/overview/#apps)
- **Transcoding** on the fly. Can be set per user/player. **Opus encoding is supported**

View File

@ -677,7 +677,7 @@ func setViperDefaults() {
viper.SetDefault("coverartquality", 75)
viper.SetDefault("artistartpriority", "artist.*, album/artist.*, external")
viper.SetDefault("discartpriority", "disc*.*, cd*.*, cover.*, folder.*, front.*, discsubtitle, embedded")
viper.SetDefault("lyricspriority", ".lrc,.txt,embedded")
viper.SetDefault("lyricspriority", ".lrc,.ttml,.txt,embedded")
viper.SetDefault("enablegravatar", false)
viper.SetDefault("enablefavourites", true)
viper.SetDefault("enablestarrating", true)

View File

@ -44,6 +44,35 @@ var _ = Describe("sources", func() {
},
}
ttmlLyrics := model.LyricList{
model.Lyrics{
Kind: "main",
Lang: "eng",
Line: []model.Line{
{
Start: gg.P(int64(18800)),
Value: "We're no strangers to love",
},
{
Start: gg.P(int64(22800)),
Value: "You know the rules and so do I",
},
},
Synced: true,
},
model.Lyrics{
Kind: "main",
Lang: "por",
Line: []model.Line{
{
Start: gg.P(int64(18800)),
Value: "Nao somos estranhos ao amor",
},
},
Synced: true,
},
}
unsyncedLyrics := model.LyricList{
model.Lyrics{
Lang: "xxx",
@ -80,7 +109,8 @@ var _ = Describe("sources", func() {
},
Entry("embedded > lrc > txt", "embedded,.lrc,.txt", embeddedLyrics),
Entry("lrc > embedded > txt", ".lrc,embedded,.txt", syncedLyrics),
Entry("txt > lrc > embedded", ".txt,.lrc,embedded", unsyncedLyrics))
Entry("txt > lrc > embedded", ".txt,.lrc,embedded", unsyncedLyrics),
Entry("ttml > lrc > embedded", ".ttml,.lrc,embedded", ttmlLyrics))
Context("Errors", func() {
var RegularUserContext = XContext

View File

@ -5,6 +5,7 @@ import (
"errors"
"os"
"path"
"strings"
"github.com/navidrome/navidrome/log"
"github.com/navidrome/navidrome/model"
@ -36,18 +37,31 @@ func fromExternalFile(ctx context.Context, mf *model.MediaFile, suffix string) (
return nil, err
}
lyrics, err := model.ToLyrics("xxx", string(contents))
if err != nil {
log.Error(ctx, "error parsing lyric external file", "path", externalLyric, err)
return nil, err
} else if lyrics == nil {
var list model.LyricList
if strings.EqualFold(suffix, ".ttml") {
list, err = parseTTML(contents)
if err != nil {
log.Error(ctx, "error parsing ttml external file", "path", externalLyric, err)
return nil, err
}
} else {
lyrics, err := model.ToLyrics("xxx", string(contents))
if err != nil {
log.Error(ctx, "error parsing lyric external file", "path", externalLyric, err)
return nil, err
}
if lyrics != nil {
list = model.LyricList{*lyrics}
}
}
if len(list) == 0 {
log.Trace(ctx, "empty lyrics from external file", "path", externalLyric)
return nil, nil
}
log.Trace(ctx, "retrieved lyrics from external file", "path", externalLyric)
return model.LyricList{*lyrics}, nil
return list, nil
}
// fromPlugin attempts to load lyrics from a plugin with the given name.

View File

@ -109,6 +109,41 @@ var _ = Describe("sources", func() {
}))
})
It("should return synchronized multilingual lyrics from a TTML file", func() {
mf := model.MediaFile{Path: "tests/fixtures/test.mp3"}
lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
Expect(err).To(BeNil())
Expect(lyrics).To(Equal(model.LyricList{
{
Kind: "main",
Lang: "eng",
Line: []model.Line{
{
Start: gg.P(int64(18800)),
Value: "We're no strangers to love",
},
{
Start: gg.P(int64(22800)),
Value: "You know the rules and so do I",
},
},
Synced: true,
},
{
Kind: "main",
Lang: "por",
Line: []model.Line{
{
Start: gg.P(int64(18800)),
Value: "Nao somos estranhos ao amor",
},
},
Synced: true,
},
}))
})
It("should handle LRC files with UTF-8 BOM marker (issue #4631)", func() {
// The function looks for <basePath-without-ext><suffix>, so we need to pass
// a MediaFile with .mp3 path and look for .lrc suffix
@ -142,5 +177,33 @@ var _ = Describe("sources", func() {
Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(22801))))
Expect(lyrics[0].Line[1].Value).To(Equal("You know the rules and so do I"))
})
It("should handle TTML files with UTF-8 BOM marker", func() {
mf := model.MediaFile{Path: "tests/fixtures/bom-test.mp3"}
lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
Expect(err).To(BeNil())
Expect(lyrics).To(HaveLen(1))
Expect(lyrics[0].Kind).To(Equal("main"))
Expect(lyrics[0].Synced).To(BeTrue())
Expect(lyrics[0].Line).To(HaveLen(1))
Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(0))))
Expect(lyrics[0].Line[0].Value).To(Equal("BOM test line"))
})
It("should handle UTF-16 LE encoded TTML files", func() {
mf := model.MediaFile{Path: "tests/fixtures/bom-utf16-test.mp3"}
lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
Expect(err).To(BeNil())
Expect(lyrics).To(HaveLen(1))
Expect(lyrics[0].Kind).To(Equal("main"))
Expect(lyrics[0].Synced).To(BeTrue())
Expect(lyrics[0].Line).To(HaveLen(2))
Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(18800))))
Expect(lyrics[0].Line[0].Value).To(Equal("UTF16 line one"))
Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(22801))))
Expect(lyrics[0].Line[1].Value).To(Equal("UTF16 line two"))
})
})
})

View File

@ -0,0 +1,92 @@
package lyrics
import (
"context"
"os"
"path/filepath"
"testing"
"github.com/navidrome/navidrome/model"
)
// TestFromExternalFileTTML asks fromExternalFile for the ".ttml" sidecar of
// the test.mp3 fixture and checks the number of tracks, the language of the
// first track, its line count, and the start time of its first line.
func TestFromExternalFileTTML(t *testing.T) {
	mediaFile := model.MediaFile{Path: fixturePath("test.mp3")}

	tracks, err := fromExternalFile(context.Background(), &mediaFile, ".ttml")
	if err != nil {
		t.Fatalf("fromExternalFile returned error: %v", err)
	}
	if n := len(tracks); n != 2 {
		t.Fatalf("expected 2 lyric tracks, got %d", n)
	}

	if lang := tracks[0].Lang; lang != "eng" {
		t.Fatalf("expected first language 'eng', got %q", lang)
	}
	if n := len(tracks[0].Line); n != 2 {
		t.Fatalf("expected 2 english lines, got %d", n)
	}
	if start := tracks[0].Line[0].Start; start == nil || *start != 18800 {
		t.Fatalf("expected first english line start to be 18800, got %v", start)
	}
}
// TestFromExternalFileTTMLWithUTF8BOM loads the bom-test.ttml fixture and
// expects a single synced track with one line starting at 0 ms.
func TestFromExternalFileTTMLWithUTF8BOM(t *testing.T) {
	mediaFile := model.MediaFile{Path: fixturePath("bom-test.ttml")}

	tracks, err := fromExternalFile(context.Background(), &mediaFile, ".ttml")
	if err != nil {
		t.Fatalf("fromExternalFile returned error: %v", err)
	}
	if n := len(tracks); n != 1 {
		t.Fatalf("expected 1 lyric track, got %d", n)
	}

	if synced := tracks[0].Synced; !synced {
		t.Fatal("expected BOM TTML lyrics to be synced")
	}
	if n := len(tracks[0].Line); n != 1 {
		t.Fatalf("expected 1 lyric line, got %d", n)
	}
	if start := tracks[0].Line[0].Start; start == nil || *start != 0 {
		t.Fatalf("expected first line start 0, got %v", start)
	}
}
// TestFromExternalFileTTMLUTF16 loads the bom-utf16-test.ttml fixture and
// expects a single synced track whose two lines start at 18800 ms and
// 22801 ms.
func TestFromExternalFileTTMLUTF16(t *testing.T) {
	mediaFile := model.MediaFile{Path: fixturePath("bom-utf16-test.ttml")}

	tracks, err := fromExternalFile(context.Background(), &mediaFile, ".ttml")
	if err != nil {
		t.Fatalf("fromExternalFile returned error: %v", err)
	}
	if n := len(tracks); n != 1 {
		t.Fatalf("expected 1 lyric track, got %d", n)
	}

	if synced := tracks[0].Synced; !synced {
		t.Fatal("expected UTF16 TTML lyrics to be synced")
	}
	if n := len(tracks[0].Line); n != 2 {
		t.Fatalf("expected 2 lyric lines, got %d", n)
	}
	if start := tracks[0].Line[0].Start; start == nil || *start != 18800 {
		t.Fatalf("expected first line start 18800, got %v", start)
	}
	if start := tracks[0].Line[1].Start; start == nil || *start != 22801 {
		t.Fatalf("expected second line start 22801, got %v", start)
	}
}
// fixturePath returns the path of the named fixture. It probes
// tests/fixtures relative to the working directory first and then relative
// to two directories up, returning the first location that exists. When
// neither exists the working-directory-relative path is returned anyway so
// the caller fails with a meaningful file name.
func fixturePath(name string) string {
	local := filepath.Join("tests", "fixtures", name)
	fromPackageDir := filepath.Join("..", "..", "tests", "fixtures", name)

	for _, location := range [...]string{local, fromPackageDir} {
		_, statErr := os.Stat(location)
		if statErr == nil {
			return location
		}
	}
	return local
}

886
core/lyrics/ttml.go Normal file
View File

@ -0,0 +1,886 @@
package lyrics
import (
"bytes"
"encoding/xml"
"errors"
"io"
"math"
"regexp"
"sort"
"strconv"
"strings"
"github.com/navidrome/navidrome/log"
"github.com/navidrome/navidrome/model"
"github.com/navidrome/navidrome/utils/str"
)
const (
// Timing parameters assumed until the <tt> root element overrides them
// through its frameRate / subFrameRate / tickRate attributes.
defaultTTMLFrameRate = 30.0
defaultTTMLSubFrameRate = 1.0
defaultTTMLTickRate = 1.0
// Values stored in model.Lyrics.Kind for the tracks produced by the parser.
ttmlLyricKindMain = "main"
ttmlLyricKindTranslation = "translation"
ttmlLyricKindPronunciation = "pronunciation"
)
// offsetTimeRegex matches an offset-time expression: a non-negative decimal
// number immediately followed by one of the units h, m, s, ms, f or t.
// It is applied to the lower-cased expression.
var offsetTimeRegex = regexp.MustCompile(`^([0-9]+(?:\.[0-9]+)?)(h|m|s|ms|f|t)$`)
// xmlEncodingRegex matches an XML declaration that carries an encoding
// pseudo-attribute. parseTTML uses it to rewrite the declared encoding to
// UTF-8 before decoding.
// NOTE(review): presumably the bytes have already been converted to UTF-8 by
// the caller at that point - confirm against fromExternalFile.
var xmlEncodingRegex = regexp.MustCompile(`(?i)<\?xml([^>]*?)encoding\s*=\s*["'][^"']+["']([^>]*)\?>`)
// ttmlTimeKind classifies a parsed time expression so that resolveTTMLTime
// knows how to combine it with the enclosing timing context.
type ttmlTimeKind int
const (
// ttmlTimeAbsolute is produced for clock times and frame times (values
// containing colons); the value is used as-is.
ttmlTimeAbsolute ttmlTimeKind = iota
// ttmlTimeOffset is produced for offset times with a unit suffix; the value
// is added to a base time.
ttmlTimeOffset
// ttmlTimeAmbiguous is produced for bare decimal values without a unit;
// resolveTTMLTime picks between the absolute and offset readings.
ttmlTimeAmbiguous
)
// ttmlTimingParams holds the document-level rates used to convert frame
// ("f"), sub-frame and tick ("t") based time expressions into seconds.
type ttmlTimingParams struct {
frameRate float64 // divisor for frame counts
subFrameRate float64 // sub-frames per frame, used by frame-time expressions
tickRate float64 // divisor for tick counts
}
// ttmlTimingContext is the state inherited from ancestor elements while
// walking the document. childContext derives a new context for each element
// from its attributes and its parent's context.
type ttmlTimingContext struct {
lang string // normalized language in effect ("xxx" when unknown)
role string // space-separated roles accumulated from ancestors
begin int64 // resolved begin time in milliseconds; meaningful only if hasBegin
hasBegin bool
end int64 // resolved end time in milliseconds; meaningful only if hasEnd
hasEnd bool
invalid bool // set when a begin/end/dur attribute could not be parsed
}
// ttmlLineRef records a keyed main line so that translation/pronunciation
// entries can later be matched to it by key.
type ttmlLineRef struct {
order int // value of the parser's main-line counter when the line was added
line model.Line // the main line as stored at that moment
}
// ttmlMetadataEntry is one <text for="..."> item collected from a
// translation or transliteration section, before it is linked to a main line.
type ttmlMetadataEntry struct {
key string // trimmed value of the "for" attribute
line model.Line // text, tokens and any timing found on the entry itself
seq int // parser-wide insertion sequence, assigned by addMetadataEntry
}
// ttmlResolvedMetadataLine is a metadata entry that has been matched to a
// main line; buildMetadataLyrics sorts these by order and then by seq.
type ttmlResolvedMetadataLine struct {
order int // order of the referenced main line
seq int // insertion sequence of the metadata entry
line model.Line // entry line with timing filled in from the main line where missing
}
// ttmlParser carries the XML decoder and everything accumulated while
// walking a TTML document. The *LangOrder slices remember the order in which
// languages were first seen so that output order does not depend on map
// iteration.
type ttmlParser struct {
decoder *xml.Decoder
params ttmlTimingParams
// Main lyric lines, grouped by language.
mainLangOrder []string
mainLinesByLang map[string][]model.Line
// Keyed main lines, used to resolve metadata entries; the first line seen
// for a key wins.
mainLineRefsByKey map[string]ttmlLineRef
mainLineOrder int // incremented for every main line added
// Entries collected from <translation> sections, grouped by language.
translationLangOrder []string
translationEntriesByLg map[string][]ttmlMetadataEntry
// Entries collected from <transliteration> sections, grouped by language.
pronunciationLangOrder []string
pronunciationEntriesByLg map[string][]ttmlMetadataEntry
metadataSeq int // incremented for every metadata entry added
}
// parseTTML decodes a TTML document and returns its lyric tracks: main
// tracks first (one per language, in order of first appearance), followed by
// translation tracks and then pronunciation tracks.
//
// Any encoding named in the XML declaration is rewritten to UTF-8 before the
// document is handed to the decoder. A decoder error aborts parsing and is
// returned unchanged.
func parseTTML(contents []byte) (model.LyricList, error) {
	utf8Declared := xmlEncodingRegex.ReplaceAll(contents, []byte(`<?xml$1encoding="UTF-8"$2?>`))

	parser := &ttmlParser{
		decoder: xml.NewDecoder(bytes.NewReader(utf8Declared)),
		params: ttmlTimingParams{
			frameRate:    defaultTTMLFrameRate,
			subFrameRate: defaultTTMLSubFrameRate,
			tickRate:     defaultTTMLTickRate,
		},
		mainLinesByLang:          make(map[string][]model.Line),
		mainLineRefsByKey:        make(map[string]ttmlLineRef),
		translationEntriesByLg:   make(map[string][]ttmlMetadataEntry),
		pronunciationEntriesByLg: make(map[string][]ttmlMetadataEntry),
	}

	// Top-level context: no timing, unknown language.
	rootCtx := ttmlTimingContext{lang: "xxx"}

	for {
		tok, err := parser.decoder.Token()
		switch {
		case errors.Is(err, io.EOF):
			return parser.toLyricList(), nil
		case err != nil:
			return nil, err
		}

		// Only element starts matter at the top level; parseElement
		// consumes everything up to the matching end tag.
		if elem, isStart := tok.(xml.StartElement); isStart {
			if err := parser.parseElement(elem, rootCtx); err != nil {
				return nil, err
			}
		}
	}
}
// parseElement handles one element whose start tag has already been read
// and consumes decoder tokens up to its matching end tag.
//
// Dispatch is by lower-cased local name:
//   - "tt" first updates the timing parameters from its attributes and is
//     then traversed like any other container;
//   - "translation" and "transliteration" are delegated to parseMetadataTrack;
//   - "p" is read with parseParagraph and stored as a main lyric line, unless
//     its timing context is invalid or its text is empty;
//   - every other element is treated as a container whose child elements are
//     parsed recursively.
//
// parent is the timing context of the enclosing element. Any error reported
// by the decoder is returned as-is.
func (p *ttmlParser) parseElement(start xml.StartElement, parent ttmlTimingContext) error {
local := strings.ToLower(start.Name.Local)
if local == "tt" {
p.updateTimingParams(start.Attr)
}
switch local {
case "translation":
return p.parseMetadataTrack(start, parent, ttmlLyricKindTranslation)
case "transliteration":
return p.parseMetadataTrack(start, parent, ttmlLyricKindPronunciation)
}
ctx := p.childContext(start.Attr, parent)
if local == "p" {
// The paragraph is always consumed, even when it ends up being dropped,
// so that the decoder stays positioned after its end tag.
lineText, tokens, err := p.parseParagraph(ctx)
if err != nil {
return err
}
if ctx.invalid || lineText == "" {
return nil
}
parsedLine := model.Line{Value: lineText}
if ctx.hasBegin {
startMs := ctx.begin
parsedLine.Start = &startMs
}
if ctx.hasEnd {
endMs := ctx.end
parsedLine.End = &endMs
}
if len(tokens) > 0 {
parsedLine.Token = tokens
}
// Fill in a missing line start/end from the word-level tokens.
parsedLine = hydrateLineTimingFromTokens(parsedLine)
// The error-free second result only says whether "key" was present; an
// absent key yields "" and addMainLine then skips key registration.
lineKey, _ := attrValue(start.Attr, "key")
p.addMainLine(ctx.lang, lineKey, parsedLine)
return nil
}
for {
token, err := p.decoder.Token()
if err != nil {
return err
}
switch t := token.(type) {
case xml.StartElement:
nextParent := ctx
if ctx.invalid {
// Best effort: ignore invalid timing in container elements, and
// continue traversing descendants with parent context.
nextParent = parent
}
if err := p.parseElement(t, nextParent); err != nil {
return err
}
case xml.EndElement:
// Nested elements consume their own end tags during recursion, so an
// end tag with this element's name closes this element.
if strings.EqualFold(t.Name.Local, start.Name.Local) {
return nil
}
}
}
}
// parseMetadataTrack consumes a <translation> or <transliteration> element
// (start has already been read) and records every usable <text> child as a
// metadata entry of the given kind, under the language in effect for the
// track element. Child elements other than <text> are traversed with
// parseElement. Decoder errors are returned as-is.
func (p *ttmlParser) parseMetadataTrack(start xml.StartElement, parent ttmlTimingContext, kind string) error {
ctx := p.childContext(start.Attr, parent)
lang := normalizeTTMLLang(ctx.lang)
for {
token, err := p.decoder.Token()
if err != nil {
return err
}
switch t := token.(type) {
case xml.StartElement:
if strings.EqualFold(t.Name.Local, "text") {
// ok is false when the entry has no usable "for" key, has invalid
// timing, or is empty; such entries are silently dropped.
entry, ok, err := p.parseMetadataText(t, ctx)
if err != nil {
return err
}
if ok {
p.addMetadataEntry(kind, lang, entry)
}
continue
}
nextParent := ctx
if ctx.invalid {
// Same best-effort rule as parseElement: an element with invalid
// timing does not pass its context on to its descendants.
nextParent = parent
}
if err := p.parseElement(t, nextParent); err != nil {
return err
}
case xml.EndElement:
if strings.EqualFold(t.Name.Local, start.Name.Local) {
return nil
}
}
}
}
// parseMetadataText consumes one <text> element of a metadata track (start
// has already been read) and converts it into a metadata entry.
//
// The boolean result is false, with a nil error, when the entry must be
// skipped: the "for" attribute is missing or blank, the element's timing is
// invalid, or it carries neither text nor tokens. A decoder error is returned
// as-is.
func (p *ttmlParser) parseMetadataText(start xml.StartElement, parent ttmlTimingContext) (ttmlMetadataEntry, bool, error) {
	var skipped ttmlMetadataEntry

	rawKey, keyPresent := attrValue(start.Attr, "for")
	key := strings.TrimSpace(rawKey)

	// The element body is consumed unconditionally so the decoder ends up
	// after the closing tag even when the entry is discarded below.
	rawText, spanTokens, err := p.parseInlineElement(start, parent)
	if err != nil {
		return skipped, false, err
	}
	if !keyPresent || key == "" {
		return skipped, false, nil
	}

	timing := p.childContext(start.Attr, parent)
	if timing.invalid {
		return skipped, false, nil
	}

	entryLine := model.Line{Value: sanitizeTTMLText(rawText)}
	if timing.hasBegin {
		begin := timing.begin
		entryLine.Start = &begin
	}
	if timing.hasEnd {
		end := timing.end
		entryLine.End = &end
	}
	if len(spanTokens) > 0 {
		entryLine.Token = spanTokens
	}
	entryLine = hydrateLineTimingFromTokens(entryLine)

	if entryLine.Value == "" && len(entryLine.Token) == 0 {
		return skipped, false, nil
	}
	return ttmlMetadataEntry{key: key, line: entryLine}, true, nil
}
// parseParagraph consumes the content of a <p> element whose start tag has
// already been read. It returns the sanitized paragraph text together with
// the timed tokens collected from nested inline elements. parent is the
// paragraph's own timing context and is handed down to those elements.
func (p *ttmlParser) parseParagraph(parent ttmlTimingContext) (string, []model.Token, error) {
	var (
		rawText   strings.Builder
		collected []model.Token
	)

	for {
		tok, err := p.decoder.Token()
		if err != nil {
			return "", nil, err
		}

		if chars, isText := tok.(xml.CharData); isText {
			rawText.Write(chars)
			continue
		}

		if closing, isEnd := tok.(xml.EndElement); isEnd {
			if strings.EqualFold(closing.Name.Local, "p") {
				return sanitizeTTMLText(rawText.String()), collected, nil
			}
			continue
		}

		if child, isStart := tok.(xml.StartElement); isStart {
			childText, childTokens, err := p.parseInlineElement(child, parent)
			if err != nil {
				return "", nil, err
			}
			rawText.WriteString(childText)
			collected = append(collected, childTokens...)
		}
	}
}
// parseInlineElement consumes one inline element (start has already been
// read) and returns its raw, unsanitized text plus the timed tokens found
// inside it.
//
// A <br> yields a newline and no tokens. For any other element the character
// data and the text of nested elements are concatenated in document order,
// and tokens produced by nested elements are passed through. The element
// itself becomes a token only when it is a <span> that carries its own
// begin/end/dur attribute, has valid timing, has non-empty sanitized text and
// produced no nested tokens.
func (p *ttmlParser) parseInlineElement(start xml.StartElement, parent ttmlTimingContext) (string, []model.Token, error) {
local := strings.ToLower(start.Name.Local)
if local == "br" {
// NOTE(review): returns without reading further tokens; the end tag of
// <br> is left for the caller's loop, which skips unrelated end tags.
return "\n", nil, nil
}
ctx := p.childContext(start.Attr, parent)
// Timing inherited from ancestors does not make a span a token on its own;
// only attributes present on this very element count.
_, hasBegin := attrValue(start.Attr, "begin")
_, hasEnd := attrValue(start.Attr, "end")
_, hasDur := attrValue(start.Attr, "dur")
hasOwnTiming := hasBegin || hasEnd || hasDur
var text strings.Builder
var tokens []model.Token
for {
token, err := p.decoder.Token()
if err != nil {
return "", nil, err
}
switch t := token.(type) {
case xml.StartElement:
value, inlineTokens, err := p.parseInlineElement(t, ctx)
if err != nil {
return "", nil, err
}
text.WriteString(value)
tokens = append(tokens, inlineTokens...)
case xml.EndElement:
if !strings.EqualFold(t.Name.Local, start.Name.Local) {
continue
}
value := text.String()
tokenText := sanitizeTTMLText(value)
// len(tokens) == 0 keeps a timed wrapper span from duplicating the
// tokens already emitted by its timed children.
if local == "span" && hasOwnTiming && !ctx.invalid && tokenText != "" && len(tokens) == 0 {
parsedToken := model.Token{
Value: tokenText,
Role: ctx.role,
}
if ctx.hasBegin {
startMs := ctx.begin
parsedToken.Start = &startMs
}
if ctx.hasEnd {
endMs := ctx.end
parsedToken.End = &endMs
}
tokens = append(tokens, parsedToken)
}
// The raw text is returned so the caller can sanitize the whole line
// once, after all fragments have been joined.
return value, tokens, nil
case xml.CharData:
text.WriteString(string(t))
}
}
}
// toLyricList assembles the parser's accumulated state into the final list:
// one "main" track per language in order of first appearance (languages
// without lines are omitted), then the translation tracks, then the
// pronunciation tracks.
func (p *ttmlParser) toLyricList() model.LyricList {
	capacity := len(p.mainLangOrder) + len(p.translationLangOrder) + len(p.pronunciationLangOrder)
	tracks := make(model.LyricList, 0, capacity)

	for _, lang := range p.mainLangOrder {
		if lines := p.mainLinesByLang[lang]; len(lines) > 0 {
			tracks = append(tracks, model.Lyrics{
				Kind:   ttmlLyricKindMain,
				Lang:   lang,
				Line:   lines,
				Synced: linesAreSynced(lines),
			})
		}
	}

	translations := p.buildMetadataLyrics(ttmlLyricKindTranslation, p.translationLangOrder, p.translationEntriesByLg)
	tracks = append(tracks, translations...)

	pronunciations := p.buildMetadataLyrics(ttmlLyricKindPronunciation, p.pronunciationLangOrder, p.pronunciationEntriesByLg)
	return append(tracks, pronunciations...)
}
// buildMetadataLyrics turns the collected metadata entries of one kind into
// lyric tracks, one per language in langOrder.
//
// Within a language only the first entry for a given key is considered.
// Entries whose key matches no keyed main line are skipped with a warning.
// An entry without its own start or end inherits the missing value from the
// referenced main line, and entries that end up with neither text nor tokens
// are dropped. The surviving lines are ordered by the position of their main
// line, then by the order in which the entries were collected. Languages with
// no surviving line produce no track.
func (p *ttmlParser) buildMetadataLyrics(kind string, langOrder []string, entriesByLang map[string][]ttmlMetadataEntry) model.LyricList {
	tracks := make(model.LyricList, 0, len(langOrder))

	for _, lang := range langOrder {
		candidates := entriesByLang[lang]
		if len(candidates) == 0 {
			continue
		}

		visited := make(map[string]struct{}, len(candidates))
		matched := make([]ttmlResolvedMetadataLine, 0, len(candidates))

		for _, candidate := range candidates {
			if _, duplicate := visited[candidate.key]; duplicate {
				continue
			}
			visited[candidate.key] = struct{}{}

			mainRef, found := p.mainLineRefsByKey[candidate.key]
			if !found {
				log.Warn("Skipping TTML metadata line without matching key", "kind", kind, "lang", lang, "key", candidate.key)
				continue
			}

			resolvedLine := candidate.line
			if resolvedLine.Start == nil && mainRef.line.Start != nil {
				inheritedStart := *mainRef.line.Start
				resolvedLine.Start = &inheritedStart
			}
			if resolvedLine.End == nil && mainRef.line.End != nil {
				inheritedEnd := *mainRef.line.End
				resolvedLine.End = &inheritedEnd
			}
			resolvedLine = hydrateLineTimingFromTokens(resolvedLine)

			if resolvedLine.Value == "" && len(resolvedLine.Token) == 0 {
				continue
			}
			matched = append(matched, ttmlResolvedMetadataLine{
				order: mainRef.order,
				seq:   candidate.seq,
				line:  resolvedLine,
			})
		}

		if len(matched) == 0 {
			continue
		}

		sort.SliceStable(matched, func(a, b int) bool {
			left, right := matched[a], matched[b]
			if left.order == right.order {
				return left.seq < right.seq
			}
			return left.order < right.order
		})

		lines := make([]model.Line, 0, len(matched))
		for _, m := range matched {
			lines = append(lines, m.line)
		}

		tracks = append(tracks, model.Lyrics{
			Kind:   kind,
			Lang:   lang,
			Line:   lines,
			Synced: linesAreSynced(lines),
		})
	}

	return tracks
}
// addMainLine appends line to the main track of the given language,
// registering the language on first use. When lineKey is non-blank and not
// yet registered, the line is also remembered under that key together with
// its position among all main lines, so metadata entries can refer to it.
// The position counter advances for every line, keyed or not.
func (p *ttmlParser) addMainLine(lang string, lineKey string, line model.Line) {
	position := p.mainLineOrder
	p.mainLineOrder++

	normalized := normalizeTTMLLang(lang)
	existing, known := p.mainLinesByLang[normalized]
	if !known {
		p.mainLangOrder = append(p.mainLangOrder, normalized)
	}
	p.mainLinesByLang[normalized] = append(existing, line)

	key := strings.TrimSpace(lineKey)
	if key == "" {
		return
	}
	// First line registered for a key wins.
	if _, taken := p.mainLineRefsByKey[key]; taken {
		return
	}
	p.mainLineRefsByKey[key] = ttmlLineRef{order: position, line: line}
}
// addMetadataEntry stamps entry with the next metadata sequence number and
// files it under the normalized language in the collection that belongs to
// kind (translation or pronunciation), registering the language on first
// use. For any other kind the sequence number is still consumed but the
// entry is not stored.
func (p *ttmlParser) addMetadataEntry(kind string, lang string, entry ttmlMetadataEntry) {
	lang = normalizeTTMLLang(lang)

	entry.seq = p.metadataSeq
	p.metadataSeq++

	var (
		byLang    map[string][]ttmlMetadataEntry
		langOrder *[]string
	)
	switch kind {
	case ttmlLyricKindTranslation:
		byLang, langOrder = p.translationEntriesByLg, &p.translationLangOrder
	case ttmlLyricKindPronunciation:
		byLang, langOrder = p.pronunciationEntriesByLg, &p.pronunciationLangOrder
	default:
		return
	}

	if _, known := byLang[lang]; !known {
		*langOrder = append(*langOrder, lang)
	}
	byLang[lang] = append(byLang[lang], entry)
}
// childContext derives the timing context of an element from its attributes
// and the context of its parent.
//
// Language: a "lang" attribute replaces the inherited language.
//
// Role: every whitespace-separated role in a "role" attribute is appended to
// the inherited role list unless that exact role is already present. Roles
// are compared as whole tokens, so "x" is not mistaken for being part of
// "x-bg".
//
// Timing (all values in milliseconds):
//   - begin: parsed and resolved against the parent's begin (0 when the
//     parent has none); inherited from the parent when absent.
//   - end: parsed and resolved with resolveTTMLTime.
//   - dur: only applied when a begin is known; when both end and dur are
//     given, the earlier of the two end times wins.
//   - with neither end nor a usable dur, the parent's end is inherited.
//
// If begin, end or dur cannot be parsed the returned context has invalid set
// and its timing fields must not be relied on.
func (p *ttmlParser) childContext(attrs []xml.Attr, parent ttmlTimingContext) ttmlTimingContext {
	ctx := parent

	if lang, ok := attrValue(attrs, "lang"); ok {
		ctx.lang = normalizeTTMLLang(lang)
	}
	if role, ok := attrValue(attrs, "role"); ok {
		ctx.role = mergeTTMLRoles(ctx.role, role)
	}

	beginExpr, hasBegin := attrValue(attrs, "begin")
	endExpr, hasEnd := attrValue(attrs, "end")
	durExpr, hasDur := attrValue(attrs, "dur")

	if hasBegin {
		begin, kind, ok := parseTTMLTimeExpression(beginExpr, p.params)
		if !ok {
			ctx.invalid = true
			return ctx
		}
		base := int64(0)
		if parent.hasBegin {
			base = parent.begin
		}
		ctx.begin = resolveTTMLTime(begin, kind, base, parent)
		ctx.hasBegin = true
	} else {
		ctx.begin = parent.begin
		ctx.hasBegin = parent.hasBegin
	}

	var calculatedEnd int64
	calculatedHasEnd := false

	if hasEnd {
		end, kind, ok := parseTTMLTimeExpression(endExpr, p.params)
		if !ok {
			ctx.invalid = true
			return ctx
		}
		// NOTE(review): an offset-style end is resolved against this
		// element's own begin rather than the parent's begin. Confirm this
		// is intended; it is left unchanged here.
		base := ctx.begin
		if !ctx.hasBegin {
			base = parent.begin
		}
		calculatedEnd = resolveTTMLTime(end, kind, base, parent)
		calculatedHasEnd = true
	}

	if hasDur {
		dur, ok := parseTTMLDurationExpression(durExpr, p.params)
		if !ok {
			ctx.invalid = true
			return ctx
		}
		if ctx.hasBegin {
			durEnd := ctx.begin + dur
			if !calculatedHasEnd || durEnd < calculatedEnd {
				calculatedEnd = durEnd
				calculatedHasEnd = true
			}
		}
	}

	if !calculatedHasEnd && parent.hasEnd {
		calculatedEnd = parent.end
		calculatedHasEnd = true
	}

	ctx.end = calculatedEnd
	ctx.hasEnd = calculatedHasEnd
	return ctx
}

// mergeTTMLRoles returns inherited extended by every whitespace-separated
// role in added that is not already one of inherited's roles. Roles are
// joined with a single space.
func mergeTTMLRoles(inherited string, added string) string {
	for _, role := range strings.Fields(added) {
		present := false
		for _, existing := range strings.Fields(inherited) {
			if existing == role {
				present = true
				break
			}
		}
		if present {
			continue
		}
		if inherited == "" {
			inherited = role
		} else {
			inherited += " " + role
		}
	}
	return inherited
}
// updateTimingParams refreshes the parser's frame, sub-frame and tick rates
// from the attributes of the <tt> element.
//
// Each of frameRate, subFrameRate and tickRate replaces the current value
// only when it parses as a number greater than zero. frameRateMultiplier,
// given as "numerator denominator" with a positive denominator, scales the
// frame rate. A resulting rate that is not positive falls back to the
// package default.
func (p *ttmlParser) updateTimingParams(attrs []xml.Attr) {
	// positiveAttr returns the named attribute as a float when it is present
	// and greater than zero, and current otherwise.
	positiveAttr := func(name string, current float64) float64 {
		raw, found := attrValue(attrs, name)
		if !found {
			return current
		}
		if parsed, err := strconv.ParseFloat(raw, 64); err == nil && parsed > 0 {
			return parsed
		}
		return current
	}

	frameRate := positiveAttr("frameRate", p.params.frameRate)
	if raw, found := attrValue(attrs, "frameRateMultiplier"); found {
		if fields := strings.Fields(raw); len(fields) == 2 {
			numerator, numErr := strconv.ParseFloat(fields[0], 64)
			denominator, denErr := strconv.ParseFloat(fields[1], 64)
			if numErr == nil && denErr == nil && denominator > 0 {
				frameRate *= numerator / denominator
			}
		}
	}
	subFrameRate := positiveAttr("subFrameRate", p.params.subFrameRate)
	tickRate := positiveAttr("tickRate", p.params.tickRate)

	// max here is the package-local helper: it keeps a positive value and
	// substitutes the default otherwise.
	p.params.frameRate = max(frameRate, defaultTTMLFrameRate)
	p.params.subFrameRate = max(subFrameRate, defaultTTMLSubFrameRate)
	p.params.tickRate = max(tickRate, defaultTTMLTickRate)
}
// parseTTMLDurationExpression parses a "dur" attribute value into
// milliseconds. It accepts the same syntax as parseTTMLTimeExpression and
// ignores the absolute/offset classification, which has no meaning for a
// duration. The boolean result is false when the expression is not valid.
func parseTTMLDurationExpression(expr string, params ttmlTimingParams) (int64, bool) {
	millis, _, valid := parseTTMLTimeExpression(expr, params)
	return millis, valid
}
// resolveTTMLTime turns a parsed time value into a timeline position in
// milliseconds.
//
// Absolute values are returned unchanged and offsets are added to base. For
// ambiguous values (bare numbers) both readings are computed and one is
// chosen heuristically:
//
//  1. with no parent timing at all and a non-zero base, prefer absolute;
//  2. when the parent has both begin and end and exactly one reading falls
//     inside that window, prefer that reading;
//  3. when the parent has a begin: prefer offset if only the offset reading
//     reaches the parent's begin, prefer absolute if it already lies at or
//     after the parent's begin and the offset reading would be later;
//  4. otherwise use the offset reading.
func resolveTTMLTime(value int64, kind ttmlTimeKind, base int64, parent ttmlTimingContext) int64 {
	asOffset := base + value

	if kind == ttmlTimeAbsolute {
		return value
	}
	if kind != ttmlTimeAmbiguous {
		return asOffset
	}

	asAbsolute := value

	// No parent timing context: there is no reference frame for offsets.
	if !(parent.hasBegin || parent.hasEnd) && base != 0 {
		return asAbsolute
	}

	if parent.hasBegin && parent.hasEnd {
		insideParent := func(ms int64) bool {
			return ms >= parent.begin && ms <= parent.end
		}
		switch absoluteFits, offsetFits := insideParent(asAbsolute), insideParent(asOffset); {
		case absoluteFits && !offsetFits:
			return asAbsolute
		case offsetFits && !absoluteFits:
			return asOffset
		}
	}

	if parent.hasBegin {
		switch {
		case asAbsolute < parent.begin && asOffset >= parent.begin:
			return asOffset
		case asAbsolute >= parent.begin && asOffset > asAbsolute:
			return asAbsolute
		}
	}

	return asOffset
}
// parseTTMLTimeExpression parses a TTML time expression into milliseconds
// and reports how the value has to be interpreted.
//
// Supported forms:
//   - bare non-negative number, read as seconds (non-standard): ambiguous
//   - offset time "<number><h|m|s|ms|f|t>": offset
//   - clock time "mm:ss(.fff)" or "hh:mm:ss(.fff)": absolute
//   - frame time "hh:mm:ss:ff(.sub)": absolute
//
// wallclock() values and event-based values (".begin"/".end") are not
// supported. The boolean result is false for unsupported or malformed
// expressions, and for values that do not fit into an int64 number of
// milliseconds (for example "inf", which strconv.ParseFloat accepts).
func parseTTMLTimeExpression(expr string, params ttmlTimingParams) (int64, ttmlTimeKind, bool) {
	expr = strings.TrimSpace(expr)
	if expr == "" {
		return 0, ttmlTimeOffset, false
	}

	lower := strings.ToLower(expr)
	if strings.Contains(lower, "wallclock(") ||
		strings.Contains(lower, ".begin") ||
		strings.Contains(lower, ".end") {
		log.Warn("Unsupported TTML time expression", "value", expr)
		return 0, ttmlTimeOffset, false
	}

	// Best-effort support for non-standard TTML seen in the wild where a
	// bare decimal value is used (implicitly seconds), e.g. "0.170".
	if value, err := strconv.ParseFloat(lower, 64); err == nil && value >= 0 {
		ms, ok := ttmlSecondsToMillis(value)
		if !ok {
			// ParseFloat also accepts "inf"/"infinity" and very large
			// exponents; those cannot be represented as milliseconds.
			log.Warn("Unsupported TTML time expression", "value", expr)
			return 0, ttmlTimeOffset, false
		}
		return ms, ttmlTimeAmbiguous, true
	}

	if matches := offsetTimeRegex.FindStringSubmatch(lower); len(matches) == 3 {
		value, err := strconv.ParseFloat(matches[1], 64)
		if err != nil {
			return 0, ttmlTimeOffset, false
		}
		unit := matches[2]
		seconds := 0.0
		switch unit {
		case "h":
			seconds = value * 60 * 60
		case "m":
			seconds = value * 60
		case "s":
			seconds = value
		case "ms":
			seconds = value / 1000
		case "f":
			seconds = value / params.frameRate
		case "t":
			seconds = value / params.tickRate
		default:
			return 0, ttmlTimeOffset, false
		}
		ms, ok := ttmlSecondsToMillis(seconds)
		if !ok {
			return 0, ttmlTimeOffset, false
		}
		return ms, ttmlTimeOffset, true
	}

	// Anything left must be colon-separated: the number of colons decides
	// between clock time and frame time.
	colonCount := strings.Count(expr, ":")
	switch colonCount {
	case 1, 2:
		clockMs, ok := parseTTMLClockTime(expr)
		if !ok {
			return 0, ttmlTimeAbsolute, false
		}
		return clockMs, ttmlTimeAbsolute, true
	case 3:
		framesMs, ok := parseTTMLFrameTime(expr, params)
		if !ok {
			return 0, ttmlTimeAbsolute, false
		}
		return framesMs, ttmlTimeAbsolute, true
	default:
		log.Warn("Unsupported TTML time expression", "value", expr)
		return 0, ttmlTimeOffset, false
	}
}

// ttmlSecondsToMillis converts seconds to whole milliseconds, rounding to
// the nearest value. It reports false when the result is negative, NaN, or
// too large for an int64.
func ttmlSecondsToMillis(seconds float64) (int64, bool) {
	const limit = float64(math.MaxInt64)
	ms := math.Round(seconds * 1000)
	if !(ms >= 0) || ms >= limit {
		return 0, false
	}
	return int64(ms), true
}
// parseTTMLClockTime parses a clock time of the form "mm:ss(.fff)" or
// "hh:mm:ss(.fff)" and returns it in milliseconds, rounded to the nearest
// millisecond.
//
// The boolean result is false when the value does not have two or three
// colon-separated parts, when a part is not numeric, or when a part is
// negative or not finite ("-1", "inf" and "nan" are all accepted by the
// strconv parsers but are not valid clock components).
func parseTTMLClockTime(value string) (int64, bool) {
	parts := strings.Split(value, ":")
	if len(parts) != 2 && len(parts) != 3 {
		return 0, false
	}

	hours := int64(0)
	minutesIdx := 0
	if len(parts) == 3 {
		h, err := strconv.ParseInt(parts[0], 10, 64)
		if err != nil || h < 0 {
			return 0, false
		}
		hours = h
		minutesIdx = 1
	}

	minutes, err := strconv.ParseInt(parts[minutesIdx], 10, 64)
	if err != nil || minutes < 0 {
		return 0, false
	}

	seconds, err := strconv.ParseFloat(parts[minutesIdx+1], 64)
	// !(seconds >= 0) also catches NaN.
	if err != nil || !(seconds >= 0) || math.IsInf(seconds, 0) {
		return 0, false
	}

	totalSeconds := float64(hours*60*60+minutes*60) + seconds
	return int64(math.Round(totalSeconds * 1000)), true
}
// parseTTMLFrameTime parses a frame-based time "hh:mm:ss:ff" with an
// optional ".sub" sub-frame suffix and returns it in milliseconds, rounded
// to the nearest millisecond. Frames are converted using params.frameRate
// and sub-frames using params.subFrameRate * params.frameRate.
//
// The boolean result is false when the value does not have four
// colon-separated parts, when a part is not numeric, when a part is negative
// or not finite, or when the frame or sub-frame rate is not positive (which
// would otherwise cause a division by zero).
func parseTTMLFrameTime(value string, params ttmlTimingParams) (int64, bool) {
	parts := strings.Split(value, ":")
	if len(parts) != 4 {
		return 0, false
	}
	if !(params.frameRate > 0) || !(params.subFrameRate > 0) {
		return 0, false
	}

	hours, err := strconv.ParseInt(parts[0], 10, 64)
	if err != nil || hours < 0 {
		return 0, false
	}
	minutes, err := strconv.ParseInt(parts[1], 10, 64)
	if err != nil || minutes < 0 {
		return 0, false
	}
	seconds, err := strconv.ParseInt(parts[2], 10, 64)
	if err != nil || seconds < 0 {
		return 0, false
	}

	// The last part is "frames" or "frames.subframes".
	frameParts := strings.SplitN(parts[3], ".", 2)
	frames, err := strconv.ParseFloat(frameParts[0], 64)
	// !(x >= 0) also catches NaN.
	if err != nil || !(frames >= 0) || math.IsInf(frames, 0) {
		return 0, false
	}
	subFrames := 0.0
	if len(frameParts) == 2 {
		subFrames, err = strconv.ParseFloat(frameParts[1], 64)
		if err != nil || !(subFrames >= 0) || math.IsInf(subFrames, 0) {
			return 0, false
		}
	}

	totalSeconds := float64(hours*60*60 + minutes*60 + seconds)
	totalSeconds += frames / params.frameRate
	totalSeconds += subFrames / (params.subFrameRate * params.frameRate)
	return int64(math.Round(totalSeconds * 1000)), true
}
// attrValue looks up an attribute by local name, ignoring case and any
// namespace prefix, and returns the trimmed value of the first match. The
// boolean result is false when no attribute has that local name.
func attrValue(attrs []xml.Attr, key string) (string, bool) {
	for i := range attrs {
		if !strings.EqualFold(attrs[i].Name.Local, key) {
			continue
		}
		return strings.TrimSpace(attrs[i].Value), true
	}
	return "", false
}
// normalizeTTMLLang trims and lower-cases a language tag. A blank tag is
// replaced by "xxx", the placeholder used for an unknown language.
func normalizeTTMLLang(lang string) string {
	if cleaned := strings.ToLower(strings.TrimSpace(lang)); cleaned != "" {
		return cleaned
	}
	return "xxx"
}
// sanitizeTTMLText cleans text extracted from a TTML element: it runs the
// shared str.SanitizeText helper, converts CRLF and lone CR line endings to
// LF, trims every line individually, and finally trims the result as a
// whole so that leading and trailing blank lines disappear.
func sanitizeTTMLText(raw string) string {
	unified := strings.NewReplacer("\r\n", "\n", "\r", "\n").Replace(str.SanitizeText(raw))

	parts := strings.Split(unified, "\n")
	for i, part := range parts {
		parts[i] = strings.TrimSpace(part)
	}
	return strings.TrimSpace(strings.Join(parts, "\n"))
}
// linesAreSynced reports whether any timing information is present: it is
// true as soon as one line, or one token of a line, has a start time.
func linesAreSynced(lines []model.Line) bool {
	for _, line := range lines {
		if line.Start != nil {
			return true
		}
		for _, token := range line.Token {
			if token.Start != nil {
				return true
			}
		}
	}
	return false
}
// hydrateLineTimingFromTokens fills in a line's missing start and/or end
// from its tokens and returns the updated line. The start becomes the
// earliest token start; the end becomes the latest token end, where a token
// without an end contributes its start instead. Timing already present on
// the line is never overwritten, and a line without tokens is returned
// unchanged.
func hydrateLineTimingFromTokens(line model.Line) model.Line {
	if len(line.Token) == 0 {
		return line
	}

	var (
		minStart, maxEnd   int64
		haveStart, haveEnd bool
	)
	for i := range line.Token {
		tok := &line.Token[i]

		if tok.Start != nil && (!haveStart || *tok.Start < minStart) {
			minStart, haveStart = *tok.Start, true
		}

		endSource := tok.End
		if endSource == nil {
			endSource = tok.Start
		}
		if endSource != nil && (!haveEnd || *endSource > maxEnd) {
			maxEnd, haveEnd = *endSource, true
		}
	}

	if haveStart && line.Start == nil {
		line.Start = &minStart
	}
	if haveEnd && line.End == nil {
		line.End = &maxEnd
	}
	return line
}
// max returns v when it is positive and fallback otherwise.
//
// Despite its name this is NOT a maximum: max(24, 30) is 24. It shadows the
// Go builtin of the same name within this package, so the builtin's
// semantics must not be assumed at call sites.
func max(v float64, fallback float64) float64 {
	switch {
	case v <= 0:
		return fallback
	default:
		return v
	}
}

398
core/lyrics/ttml_test.go Normal file
View File

@ -0,0 +1,398 @@
package lyrics
import (
"testing"
"github.com/navidrome/navidrome/model"
)
// TestParseTTML_MultiLanguageAndTiming covers language grouping and the
// different time formats in one document:
//   - "eng" div begins at 1s, so <p begin="2s"> resolves to 3000 ms;
//   - the frame time 00:00:04:15.1 at frameRate 30 / subFrameRate 2 resolves
//     to 4517 ms, and <br/> becomes a newline in the line text;
//   - "45t" at tickRate 10 resolves to 4500 ms in the "por" track.
//
// Line counts are checked before indexing so that a parser regression shows
// up as a test failure instead of an index-out-of-range panic.
func TestParseTTML_MultiLanguageAndTiming(t *testing.T) {
	content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
<tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttp="http://www.w3.org/ns/ttml#parameter" ttp:frameRate="30" ttp:subFrameRate="2" ttp:tickRate="10">
<body>
<div xml:lang="eng" begin="1s">
<p begin="2s">Line one</p>
<p begin="00:00:04:15.1"><span>Line two</span><br/>with break</p>
</div>
<div xml:lang="por">
<p begin="45t">Linha</p>
</div>
</body>
</tt>`)

	list, err := parseTTML(content)
	if err != nil {
		t.Fatalf("parseTTML returned error: %v", err)
	}
	if len(list) != 2 {
		t.Fatalf("expected 2 lyric tracks, got %d", len(list))
	}

	eng := list[0]
	if eng.Lang != "eng" {
		t.Fatalf("expected first track language 'eng', got %q", eng.Lang)
	}
	if !eng.Synced {
		t.Fatal("expected first track to be synced")
	}
	if len(eng.Line) != 2 {
		t.Fatalf("expected 2 lines in first track, got %d", len(eng.Line))
	}
	assertTimedLine(t, eng.Line[0], 3000, "Line one")
	assertTimedLine(t, eng.Line[1], 4517, "Line two\nwith break")

	por := list[1]
	if por.Lang != "por" {
		t.Fatalf("expected second track language 'por', got %q", por.Lang)
	}
	if len(por.Line) != 1 {
		t.Fatalf("expected 1 line in second track, got %d", len(por.Line))
	}
	assertTimedLine(t, por.Line[0], 4500, "Linha")
}
// TestParseTTML_UnsupportedCueSkipped checks that a <p> whose begin is a
// wallclock() expression is dropped, while its sibling with begin="1s" is
// kept and resolves to 1000 ms.
func TestParseTTML_UnsupportedCueSkipped(t *testing.T) {
	const doc = `<?xml version="1.0" encoding="UTF-8"?>
<tt xmlns="http://www.w3.org/ns/ttml">
<body xml:lang="eng">
<div>
<p begin="wallclock(2026-01-01T00:00:00Z)">Skip me</p>
<p begin="1s">Keep me</p>
</div>
</body>
</tt>`

	tracks, err := parseTTML([]byte(doc))
	if err != nil {
		t.Fatalf("parseTTML returned error: %v", err)
	}
	if n := len(tracks); n != 1 {
		t.Fatalf("expected 1 lyric track, got %d", n)
	}

	lines := tracks[0].Line
	if n := len(lines); n != 1 {
		t.Fatalf("expected 1 line in lyric track, got %d", n)
	}
	assertTimedLine(t, lines[0], 1000, "Keep me")
}
// TestParseTTML_BeginEndDurWithInheritance checks nested timing: with
// body begin="10s" and div begin="5s", the <p> cues at begin="1s" and
// begin="3s" are expected at 16000 and 18000, i.e. the sum of the body, div
// and p begin offsets. The language is expected to come from xml:lang on <body>.
func TestParseTTML_BeginEndDurWithInheritance(t *testing.T) {
content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
<tt xmlns="http://www.w3.org/ns/ttml">
<body xml:lang="eng" begin="10s">
<div begin="5s" dur="8s">
<p begin="1s" dur="2s">First line</p>
<p begin="3s" end="5s">Second line</p>
</div>
</body>
</tt>`)
list, err := parseTTML(content)
if err != nil {
t.Fatalf("parseTTML returned error: %v", err)
}
if len(list) != 1 {
t.Fatalf("expected 1 lyric track, got %d", len(list))
}
if list[0].Lang != "eng" {
t.Fatalf("expected language 'eng', got %q", list[0].Lang)
}
if len(list[0].Line) != 2 {
t.Fatalf("expected 2 lines, got %d", len(list[0].Line))
}
// 10s (body) + 5s (div) + 1s / 3s (p).
assertTimedLine(t, list[0].Line[0], 16000, "First line")
assertTimedLine(t, list[0].Line[1], 18000, "Second line")
}
// TestParseTTML_NonStandardBareSecondOffsets checks that begin values written
// as bare numbers without a unit ("10", "0.170", "3.710") are accepted and
// read as seconds: the expected line starts are 10170 and 13710.
func TestParseTTML_NonStandardBareSecondOffsets(t *testing.T) {
content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
<tt xmlns="http://www.w3.org/ns/ttml">
<body xml:lang="eng" begin="10">
<div>
<p begin="0.170">First line</p>
<p begin="3.710">Second line</p>
</div>
</body>
</tt>`)
list, err := parseTTML(content)
if err != nil {
t.Fatalf("parseTTML returned error: %v", err)
}
if len(list) != 1 {
t.Fatalf("expected 1 lyric track, got %d", len(list))
}
if len(list[0].Line) != 2 {
t.Fatalf("expected 2 lines, got %d", len(list[0].Line))
}
// body begin "10" plus the p begin values.
assertTimedLine(t, list[0].Line[0], 10170, "First line")
assertTimedLine(t, list[0].Line[1], 13710, "Second line")
}
// TestParseTTML_WordTimingTokens checks token extraction from timed <span>
// elements inside a <p>: the two adjacent spans and the span nested under a
// ttm:role="x-bg" wrapper are expected as three tokens, the last one carrying
// the role "x-bg". The line is expected to keep the <p> begin/end (1000/3000)
// and to have the value "Hello\necho".
func TestParseTTML_WordTimingTokens(t *testing.T) {
content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
<tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttm="http://www.w3.org/ns/ttml#metadata">
<body xml:lang="eng">
<div>
<p begin="00:01.000" end="00:03.000">
<span begin="00:01.000" end="00:01.400">He</span><span begin="00:01.400" end="00:01.800">llo</span>
<span ttm:role="x-bg"><span begin="00:02.000" end="00:02.500">echo</span></span>
</p>
</div>
</body>
</tt>`)
list, err := parseTTML(content)
if err != nil {
t.Fatalf("parseTTML returned error: %v", err)
}
if len(list) != 1 {
t.Fatalf("expected 1 lyric track, got %d", len(list))
}
if len(list[0].Line) != 1 {
t.Fatalf("expected 1 line, got %d", len(list[0].Line))
}
line := list[0].Line[0]
assertTimedLine(t, line, 1000, "Hello\necho")
if line.End == nil || *line.End != 3000 {
t.Fatalf("expected line end 3000, got %v", line.End)
}
if len(line.Token) != 3 {
t.Fatalf("expected 3 timed tokens, got %d", len(line.Token))
}
assertToken(t, line.Token[0], 1000, 1400, "He", "")
assertToken(t, line.Token[1], 1400, 1800, "llo", "")
// The inner span inherits the role of its ttm:role="x-bg" wrapper.
assertToken(t, line.Token[2], 2000, 2500, "echo", "x-bg")
}
// TestParseTTML_AmbiguousDecimalTimingPrefersAbsoluteWhenInsideParentWindow
// uses bare decimal times on <div>, <p> and <span> where the child values
// already lie inside the parent's begin/end window. The expected results
// (line at 43444-45570, tokens at 43444-43716 and 43716-43887) are the raw
// values in milliseconds, i.e. the child times are not added to the parent
// begin.
func TestParseTTML_AmbiguousDecimalTimingPrefersAbsoluteWhenInsideParentWindow(t *testing.T) {
content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
<tt xmlns="http://www.w3.org/ns/ttml">
<body xml:lang="eng">
<div begin="37.870" end="45.570">
<p begin="43.444" end="45.570">
<span begin="43.444" end="43.716">go</span>
<span begin="43.716" end="43.887">go</span>
</p>
</div>
</body>
</tt>`)
list, err := parseTTML(content)
if err != nil {
t.Fatalf("parseTTML returned error: %v", err)
}
if len(list) != 1 || len(list[0].Line) != 1 {
t.Fatalf("expected one parsed lyric line, got %#v", list)
}
line := list[0].Line[0]
assertTimedLine(t, line, 43444, "go\ngo")
if line.End == nil || *line.End != 45570 {
t.Fatalf("expected line end 45570, got %v", line.End)
}
if len(line.Token) != 2 {
t.Fatalf("expected 2 timed tokens, got %d", len(line.Token))
}
assertToken(t, line.Token[0], 43444, 43716, "go", "")
assertToken(t, line.Token[1], 43716, 43887, "go", "")
}
// TestParseTTML_UnsyncedFallback parses a document with no timing attributes
// and no xml:lang. It expects a single track with the default language "xxx",
// Synced == false, and one line whose Start is nil.
func TestParseTTML_UnsyncedFallback(t *testing.T) {
content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
<tt xmlns="http://www.w3.org/ns/ttml">
<body>
<div>
<p>No timing here</p>
</div>
</body>
</tt>`)
list, err := parseTTML(content)
if err != nil {
t.Fatalf("parseTTML returned error: %v", err)
}
if len(list) != 1 {
t.Fatalf("expected 1 lyric track, got %d", len(list))
}
if list[0].Lang != "xxx" {
t.Fatalf("expected default language 'xxx', got %q", list[0].Lang)
}
if list[0].Synced {
t.Fatal("expected lyric track to be unsynced")
}
if len(list[0].Line) != 1 {
t.Fatalf("expected 1 line, got %d", len(list[0].Line))
}
if list[0].Line[0].Start != nil {
t.Fatalf("expected line start to be nil, got %v", *list[0].Line[0].Start)
}
if list[0].Line[0].Value != "No timing here" {
t.Fatalf("expected line value %q, got %q", "No timing here", list[0].Line[0].Value)
}
}
// TestParseTTML_MetadataTracksByKey covers translation and transliteration
// entries in <iTunesMetadata> that refer to body lines through for="..." /
// itunes:key="...". It expects three tracks in this order: main ("ja"),
// translation ("es") and pronunciation ("ja-latn", lower-cased from the
// fixture's "ja-Latn").
//
// Expectations encoded below:
//   - the <text for="MISSING"> entry yields no translation line;
//   - the untimed translation "Hola" takes start/end (1000/1500) from the
//     main line keyed "L1";
//   - the pronunciation line starts at 2000 and ends at 2600, the end of its
//     last token, not at the 2700 end of the main line keyed "L2".
func TestParseTTML_MetadataTracksByKey(t *testing.T) {
content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
<tt xmlns="http://www.w3.org/ns/ttml" xmlns:itunes="http://music.apple.com/lyric-ttml-internal">
<head>
<metadata>
<iTunesMetadata xmlns="http://music.apple.com/lyric-ttml-internal">
<translations>
<translation xml:lang="es">
<text for="L1">Hola</text>
<text for="MISSING">Skip me</text>
</translation>
</translations>
<transliterations>
<transliteration xml:lang="ja-Latn">
<text for="L2"><span begin="00:02.000" end="00:02.300" xmlns="http://www.w3.org/ns/ttml">ko</span><span begin="00:02.300" end="00:02.600" xmlns="http://www.w3.org/ns/ttml">nni</span></text>
</transliteration>
</transliterations>
</iTunesMetadata>
</metadata>
</head>
<body xml:lang="ja">
<div>
<p begin="00:01.000" end="00:01.500" itunes:key="L1">こんにちは</p>
<p begin="00:02.000" end="00:02.700" itunes:key="L2">こんばんは</p>
</div>
</body>
</tt>`)
list, err := parseTTML(content)
if err != nil {
t.Fatalf("parseTTML returned error: %v", err)
}
if len(list) != 3 {
t.Fatalf("expected 3 lyric tracks, got %d", len(list))
}
// Track 0: the main lyrics from <body>.
main := list[0]
if main.Kind != "main" {
t.Fatalf("expected main track kind %q, got %q", "main", main.Kind)
}
if main.Lang != "ja" {
t.Fatalf("expected main track language %q, got %q", "ja", main.Lang)
}
if len(main.Line) != 2 {
t.Fatalf("expected 2 lines in main track, got %d", len(main.Line))
}
// Track 1: the translation; only the entry keyed "L1" is expected.
translation := list[1]
if translation.Kind != "translation" {
t.Fatalf("expected translation kind %q, got %q", "translation", translation.Kind)
}
if translation.Lang != "es" {
t.Fatalf("expected translation language %q, got %q", "es", translation.Lang)
}
if len(translation.Line) != 1 {
t.Fatalf("expected 1 translation line, got %d", len(translation.Line))
}
assertTimedLine(t, translation.Line[0], 1000, "Hola")
if translation.Line[0].End == nil || *translation.Line[0].End != 1500 {
t.Fatalf("expected translation line end %d, got %v", 1500, translation.Line[0].End)
}
// Track 2: the pronunciation (transliteration) with per-span tokens.
pronunciation := list[2]
if pronunciation.Kind != "pronunciation" {
t.Fatalf("expected pronunciation kind %q, got %q", "pronunciation", pronunciation.Kind)
}
if pronunciation.Lang != "ja-latn" {
t.Fatalf("expected pronunciation language %q, got %q", "ja-latn", pronunciation.Lang)
}
if len(pronunciation.Line) != 1 {
t.Fatalf("expected 1 pronunciation line, got %d", len(pronunciation.Line))
}
assertTimedLine(t, pronunciation.Line[0], 2000, "konni")
if pronunciation.Line[0].End == nil || *pronunciation.Line[0].End != 2600 {
t.Fatalf("expected pronunciation line end %d, got %v", 2600, pronunciation.Line[0].End)
}
if len(pronunciation.Line[0].Token) != 2 {
t.Fatalf("expected 2 pronunciation tokens, got %d", len(pronunciation.Line[0].Token))
}
assertToken(t, pronunciation.Line[0].Token[0], 2000, 2300, "ko", "")
assertToken(t, pronunciation.Line[0].Token[1], 2300, 2600, "nni", "")
}
// TestParseTTML_PronunciationBareDecimalEndTimes covers transliteration spans
// whose begin/end are bare decimal seconds below one minute ("2.747",
// "3.018", ...). It expects the token times to be exactly those values in
// milliseconds and the line value to keep the spaces between the spans
// ("I woke up").
func TestParseTTML_PronunciationBareDecimalEndTimes(t *testing.T) {
content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
<tt xmlns="http://www.w3.org/ns/ttml" xmlns:itunes="http://music.apple.com/lyric-ttml-internal">
<head>
<metadata>
<iTunesMetadata xmlns="http://music.apple.com/lyric-ttml-internal">
<transliterations>
<transliteration xml:lang="ja-Latn">
<text for="L1"><span begin="2.747" end="3.018" xmlns="http://www.w3.org/ns/ttml">I</span> <span begin="3.018" end="3.179" xmlns="http://www.w3.org/ns/ttml">woke</span> <span begin="3.179" end="3.582" xmlns="http://www.w3.org/ns/ttml">up</span></text>
</transliteration>
</transliterations>
</iTunesMetadata>
</metadata>
</head>
<body xml:lang="ja">
<div>
<p begin="00:02.747" end="00:04.000" itunes:key="L1">起きた</p>
</div>
</body>
</tt>`)
list, err := parseTTML(content)
if err != nil {
t.Fatalf("parseTTML returned error: %v", err)
}
// Locate the pronunciation track by kind rather than by position.
var pronunciation *model.Lyrics
for i := range list {
if list[i].Kind == "pronunciation" {
pronunciation = &list[i]
break
}
}
if pronunciation == nil {
t.Fatal("expected a pronunciation track")
}
if len(pronunciation.Line) != 1 {
t.Fatalf("expected 1 pronunciation line, got %d", len(pronunciation.Line))
}
line := pronunciation.Line[0]
assertTimedLine(t, line, 2747, "I woke up")
if len(line.Token) != 3 {
t.Fatalf("expected 3 tokens, got %d", len(line.Token))
}
assertToken(t, line.Token[0], 2747, 3018, "I", "")
assertToken(t, line.Token[1], 3018, 3179, "woke", "")
assertToken(t, line.Token[2], 3179, 3582, "up", "")
}
// assertTimedLine aborts the calling test unless line has a non-nil Start
// equal to expectedStart and a Value equal to expectedValue. The checks run
// in that order and only the first mismatch is reported.
func assertTimedLine(t *testing.T, line model.Line, expectedStart int64, expectedValue string) {
	t.Helper()
	switch {
	case line.Start == nil:
		t.Fatal("expected line start to be set, got nil")
	case *line.Start != expectedStart:
		t.Fatalf("expected line start %d, got %d", expectedStart, *line.Start)
	case line.Value != expectedValue:
		t.Fatalf("expected line value %q, got %q", expectedValue, line.Value)
	}
}
// assertToken aborts the calling test unless token has non-nil Start and End
// equal to expectedStart and expectedEnd, and Value and Role equal to
// expectedValue and expectedRole. Checks run in the order start, end, value,
// role, and only the first mismatch is reported.
func assertToken(t *testing.T, token model.Token, expectedStart int64, expectedEnd int64, expectedValue string, expectedRole string) {
	t.Helper()
	switch {
	case token.Start == nil:
		t.Fatal("expected token start to be set, got nil")
	case *token.Start != expectedStart:
		t.Fatalf("expected token start %d, got %d", expectedStart, *token.Start)
	case token.End == nil:
		t.Fatal("expected token end to be set, got nil")
	case *token.End != expectedEnd:
		t.Fatalf("expected token end %d, got %d", expectedEnd, *token.End)
	case token.Value != expectedValue:
		t.Fatalf("expected token value %q, got %q", expectedValue, token.Value)
	case token.Role != expectedRole:
		t.Fatalf("expected token role %q, got %q", expectedRole, token.Role)
	}
}

View File

@ -11,14 +11,24 @@ import (
"github.com/navidrome/navidrome/utils/str"
)
type Line struct {
// Token is a timed fragment of a lyric line, such as a word or syllable.
type Token struct {
// Start and End are optional token times; the TTML parser tests expect
// millisecond values (e.g. 1000 for "00:01.000"). Omitted when nil.
Start *int64 `structs:"start,omitempty" json:"start,omitempty"`
End *int64 `structs:"end,omitempty" json:"end,omitempty"`
// Value is the token text.
Value string `structs:"value" json:"value"`
// Role is an optional role label for the token (the TTML tests use
// "x-bg"); omitted when empty.
Role string `structs:"role,omitempty" json:"role,omitempty"`
}
// Line is a single lyric line, optionally timed and optionally split into
// timed tokens.
type Line struct {
// Start and End are optional line times; both are omitted when nil.
Start *int64 `structs:"start,omitempty" json:"start,omitempty"`
End *int64 `structs:"end,omitempty" json:"end,omitempty"`
// Value is the full text of the line.
Value string `structs:"value" json:"value"`
// Token holds the timed fragments of the line, when available.
Token []Token `structs:"token,omitempty" json:"token,omitempty"`
}
type Lyrics struct {
DisplayArtist string `structs:"displayArtist,omitempty" json:"displayArtist,omitempty"`
DisplayTitle string `structs:"displayTitle,omitempty" json:"displayTitle,omitempty"`
Kind string `structs:"kind,omitempty" json:"kind,omitempty"`
Lang string `structs:"lang" json:"lang"`
Line []Line `structs:"line" json:"line"`
Offset *int64 `structs:"offset,omitempty" json:"offset,omitempty"`

View File

@ -478,19 +478,47 @@ func mapExplicitStatus(explicitStatus string) string {
func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics) responses.StructuredLyric {
lines := make([]responses.Line, len(lyrics.Line))
tokenLines := make([]responses.TokenLine, 0, len(lyrics.Line))
for i, line := range lyrics.Line {
lines[i] = responses.Line{
Start: line.Start,
Value: line.Value,
}
if len(line.Token) == 0 {
continue
}
tokens := make([]responses.LyricToken, len(line.Token))
for j, token := range line.Token {
tokens[j] = responses.LyricToken{
Start: token.Start,
End: token.End,
Value: token.Value,
Role: token.Role,
}
}
tokenLines = append(tokenLines, responses.TokenLine{
Index: int32(i),
Start: line.Start,
End: line.End,
Value: line.Value,
Token: tokens,
})
}
kind := strings.TrimSpace(lyrics.Kind)
if kind == "" {
kind = "main"
}
structured := responses.StructuredLyric{
DisplayArtist: lyrics.DisplayArtist,
DisplayTitle: lyrics.DisplayTitle,
Kind: kind,
Lang: lyrics.Lang,
Line: lines,
TokenLine: tokenLines,
Offset: lyrics.Offset,
Synced: lyrics.Synced,
}

View File

@ -98,7 +98,9 @@ func (api *Router) GetLyrics(r *http.Request) (*responses.Subsonic, error) {
response := newResponse()
lyricsResponse := responses.Lyrics{}
response.Lyrics = &lyricsResponse
mediaFiles, err := api.ds.MediaFile(r.Context()).GetAll(filter.SongsByArtistTitleWithLyricsFirst(artist, title))
opts := filter.SongsByArtistTitleWithLyricsFirst(artist, title)
opts.Max = 0
mediaFiles, err := api.ds.MediaFile(r.Context()).GetAll(opts)
if err != nil {
return nil, err
@ -108,25 +110,26 @@ func (api *Router) GetLyrics(r *http.Request) (*responses.Subsonic, error) {
return response, nil
}
structuredLyrics, err := api.lyrics.GetLyrics(r.Context(), &mediaFiles[0])
if err != nil {
return nil, err
for i := range mediaFiles {
structuredLyrics, err := api.lyrics.GetLyrics(r.Context(), &mediaFiles[i])
if err != nil {
return nil, err
}
if len(structuredLyrics) == 0 {
continue
}
lyricsResponse.Artist = artist
lyricsResponse.Title = title
var lyricsText strings.Builder
for _, line := range structuredLyrics[0].Line {
lyricsText.WriteString(line.Value + "\n")
}
lyricsResponse.Value = lyricsText.String()
break
}
if len(structuredLyrics) == 0 {
return response, nil
}
lyricsResponse.Artist = artist
lyricsResponse.Title = title
var lyricsText strings.Builder
for _, line := range structuredLyrics[0].Line {
lyricsText.WriteString(line.Value + "\n")
}
lyricsResponse.Value = lyricsText.String()
return response, nil
}

View File

@ -186,6 +186,36 @@ var _ = Describe("MediaRetrievalController", func() {
Expect(response.Lyrics.Title).To(Equal("Never Gonna Give You Up"))
Expect(response.Lyrics.Value).To(Equal("We're no strangers to love\nYou know the rules and so do I\n"))
})
It("should continue searching candidates for sidecar lyrics", func() {
conf.Server.LyricsPriority = ".ttml,embedded"
r := newGetRequest("artist=Rick+Astley", "title=Never+Gonna+Give+You+Up")
baseTime := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC)
mockRepo.SetData(model.MediaFiles{
{
ID: "1",
Path: "tests/fixtures/01 Invisible (RED) Edit Version.mp3",
Artist: "Rick Astley",
Title: "Never Gonna Give You Up",
Lyrics: "[]",
UpdatedAt: baseTime.Add(2 * time.Hour), // Newer, but no TTML sidecar
},
{
ID: "2",
Path: "tests/fixtures/test.mp3",
Artist: "Rick Astley",
Title: "Never Gonna Give You Up",
Lyrics: "[]",
UpdatedAt: baseTime.Add(1 * time.Hour), // Older, but has TTML sidecar
},
})
response, err := router.GetLyrics(r)
Expect(err).ToNot(HaveOccurred())
Expect(response.Lyrics.Artist).To(Equal("Rick Astley"))
Expect(response.Lyrics.Title).To(Equal("Never Gonna Give You Up"))
Expect(response.Lyrics.Value).To(Equal("We're no strangers to love\nYou know the rules and so do I\n"))
})
})
Describe("GetLyricsBySongId", func() {
@ -202,6 +232,11 @@ var _ = Describe("MediaRetrievalController", func() {
Expect(realLyric.DisplayArtist).To(Equal(expectedLyric.DisplayArtist))
Expect(realLyric.DisplayTitle).To(Equal(expectedLyric.DisplayTitle))
expectedKind := expectedLyric.Kind
if expectedKind == "" {
expectedKind = "main"
}
Expect(realLyric.Kind).To(Equal(expectedKind))
Expect(realLyric.Lang).To(Equal(expectedLyric.Lang))
Expect(realLyric.Synced).To(Equal(expectedLyric.Synced))
@ -222,6 +257,40 @@ var _ = Describe("MediaRetrievalController", func() {
Expect(*realLine.Start).To(Equal(*expectedLine.Start))
}
}
Expect(realLyric.TokenLine).To(HaveLen(len(expectedLyric.TokenLine)))
for j, realTokenLine := range realLyric.TokenLine {
expectedTokenLine := expectedLyric.TokenLine[j]
Expect(realTokenLine.Index).To(Equal(expectedTokenLine.Index))
Expect(realTokenLine.Value).To(Equal(expectedTokenLine.Value))
if expectedTokenLine.Start == nil {
Expect(realTokenLine.Start).To(BeNil())
} else {
Expect(*realTokenLine.Start).To(Equal(*expectedTokenLine.Start))
}
if expectedTokenLine.End == nil {
Expect(realTokenLine.End).To(BeNil())
} else {
Expect(*realTokenLine.End).To(Equal(*expectedTokenLine.End))
}
Expect(realTokenLine.Token).To(HaveLen(len(expectedTokenLine.Token)))
for k, realToken := range realTokenLine.Token {
expectedToken := expectedTokenLine.Token[k]
Expect(realToken.Value).To(Equal(expectedToken.Value))
Expect(realToken.Role).To(Equal(expectedToken.Role))
if expectedToken.Start == nil {
Expect(realToken.Start).To(BeNil())
} else {
Expect(*realToken.Start).To(Equal(*expectedToken.Start))
}
if expectedToken.End == nil {
Expect(realToken.End).To(BeNil())
} else {
Expect(*realToken.End).To(Equal(*expectedToken.End))
}
}
}
}
}
@ -323,6 +392,238 @@ var _ = Describe("MediaRetrievalController", func() {
},
})
})
It("should return multilingual TTML sidecar lyrics", func() {
conf.Server.LyricsPriority = ".ttml,embedded"
r := newGetRequest("id=1")
mockRepo.SetData(model.MediaFiles{
{
ID: "1",
Path: "tests/fixtures/test.mp3",
Artist: "Rick Astley",
Title: "Never Gonna Give You Up",
Lyrics: "[]",
},
})
response, err := router.GetLyricsBySongId(r)
Expect(err).ToNot(HaveOccurred())
porTime := int64(18800)
ttmlTime := int64(22800)
compareResponses(response.LyricsList, responses.LyricsList{
StructuredLyrics: responses.StructuredLyrics{
{
DisplayArtist: "Rick Astley",
DisplayTitle: "Never Gonna Give You Up",
Lang: "eng",
Synced: true,
Line: []responses.Line{
{
Start: &times[0],
Value: "We're no strangers to love",
},
{
Start: &ttmlTime,
Value: "You know the rules and so do I",
},
},
},
{
DisplayArtist: "Rick Astley",
DisplayTitle: "Never Gonna Give You Up",
Lang: "por",
Synced: true,
Line: []responses.Line{
{
Start: &porTime,
Value: "Nao somos estranhos ao amor",
},
},
},
},
})
})
It("should return metadata-linked translation and pronunciation tracks from TTML", func() {
conf.Server.LyricsPriority = ".ttml,embedded"
r := newGetRequest("id=1")
mockRepo.SetData(model.MediaFiles{
{
ID: "1",
Path: "tests/fixtures/test-metadata.mp3",
Artist: "Rick Astley",
Title: "Never Gonna Give You Up",
Lyrics: "[]",
},
})
response, err := router.GetLyricsBySongId(r)
Expect(err).ToNot(HaveOccurred())
mainStartA := int64(1000)
mainStartB := int64(2000)
tokenStartA := int64(2000)
tokenEndA := int64(2300)
tokenStartB := int64(2300)
tokenEndB := int64(2600)
compareResponses(response.LyricsList, responses.LyricsList{
StructuredLyrics: responses.StructuredLyrics{
{
DisplayArtist: "Rick Astley",
DisplayTitle: "Never Gonna Give You Up",
Kind: "main",
Lang: "ja",
Synced: true,
Line: []responses.Line{
{
Start: &mainStartA,
Value: "こんにちは",
},
{
Start: &mainStartB,
Value: "こんばんは",
},
},
},
{
DisplayArtist: "Rick Astley",
DisplayTitle: "Never Gonna Give You Up",
Kind: "translation",
Lang: "es",
Synced: true,
Line: []responses.Line{
{
Start: &mainStartA,
Value: "Hola",
},
},
},
{
DisplayArtist: "Rick Astley",
DisplayTitle: "Never Gonna Give You Up",
Kind: "pronunciation",
Lang: "ja-latn",
Synced: true,
Line: []responses.Line{
{
Start: &mainStartB,
Value: "konni",
},
},
TokenLine: []responses.TokenLine{
{
Index: 0,
Start: &mainStartB,
End: &tokenEndB,
Value: "konni",
Token: []responses.LyricToken{
{
Start: &tokenStartA,
End: &tokenEndA,
Value: "ko",
},
{
Start: &tokenStartB,
End: &tokenEndB,
Value: "nni",
},
},
},
},
},
},
})
})
It("should return tokenized lines for songLyrics v2 clients", func() {
r := newGetRequest("id=1")
lineStart := int64(1000)
lineEnd := int64(3000)
tokenStartA := int64(1000)
tokenEndA := int64(1400)
tokenStartB := int64(2000)
tokenEndB := int64(2500)
lyricsJson, err := json.Marshal(model.LyricList{
{
Lang: "eng",
Synced: true,
Line: []model.Line{
{
Start: &lineStart,
End: &lineEnd,
Value: "Hello echo",
Token: []model.Token{
{
Start: &tokenStartA,
End: &tokenEndA,
Value: "Hello",
},
{
Start: &tokenStartB,
End: &tokenEndB,
Value: "echo",
Role: "x-bg",
},
},
},
},
},
})
Expect(err).ToNot(HaveOccurred())
mockRepo.SetData(model.MediaFiles{
{
ID: "1",
Artist: "Rick Astley",
Title: "Never Gonna Give You Up",
Lyrics: string(lyricsJson),
},
})
response, err := router.GetLyricsBySongId(r)
Expect(err).ToNot(HaveOccurred())
compareResponses(response.LyricsList, responses.LyricsList{
StructuredLyrics: responses.StructuredLyrics{
{
DisplayArtist: "Rick Astley",
DisplayTitle: "Never Gonna Give You Up",
Lang: "eng",
Synced: true,
Line: []responses.Line{
{
Start: &lineStart,
Value: "Hello echo",
},
},
TokenLine: []responses.TokenLine{
{
Index: 0,
Start: &lineStart,
End: &lineEnd,
Value: "Hello echo",
Token: []responses.LyricToken{
{
Start: &tokenStartA,
End: &tokenEndA,
Value: "Hello",
},
{
Start: &tokenStartB,
End: &tokenEndB,
Value: "echo",
Role: "x-bg",
},
},
},
},
},
},
})
})
})
})

View File

@ -11,7 +11,7 @@ func (api *Router) GetOpenSubsonicExtensions(_ *http.Request) (*responses.Subson
response.OpenSubsonicExtensions = &responses.OpenSubsonicExtensions{
{Name: "transcodeOffset", Versions: []int32{1}},
{Name: "formPost", Versions: []int32{1}},
{Name: "songLyrics", Versions: []int32{1}},
{Name: "songLyrics", Versions: []int32{1, 2}},
{Name: "indexBasedQueue", Versions: []int32{1}},
{Name: "transcoding", Versions: []int32{1}},
}

View File

@ -38,7 +38,7 @@ var _ = Describe("GetOpenSubsonicExtensions", func() {
HaveLen(5),
ContainElement(responses.OpenSubsonicExtension{Name: "transcodeOffset", Versions: []int32{1}}),
ContainElement(responses.OpenSubsonicExtension{Name: "formPost", Versions: []int32{1}}),
ContainElement(responses.OpenSubsonicExtension{Name: "songLyrics", Versions: []int32{1}}),
ContainElement(responses.OpenSubsonicExtension{Name: "songLyrics", Versions: []int32{1, 2}}),
ContainElement(responses.OpenSubsonicExtension{Name: "indexBasedQueue", Versions: []int32{1}}),
ContainElement(responses.OpenSubsonicExtension{Name: "transcoding", Versions: []int32{1}}),
))

View File

@ -537,13 +537,30 @@ type Line struct {
Value string `xml:",chardata" json:"value"`
}
// LyricToken is the API representation of one timed token of a lyric line.
// In XML every field is emitted as an attribute of the token element.
type LyricToken struct {
// Start and End are optional token times (milliseconds, per the change
// description); omitted when nil.
Start *int64 `xml:"start,attr,omitempty" json:"start,omitempty"`
End *int64 `xml:"end,attr,omitempty" json:"end,omitempty"`
// Value is the token text.
Value string `xml:"value,attr" json:"value"`
// Role is an optional role label for the token; omitted when empty.
Role string `xml:"role,attr,omitempty" json:"role,omitempty"`
}
// TokenLine is the API representation of a lyric line that carries
// token-level timing.
type TokenLine struct {
// Index is the position of the corresponding entry in the lyric's line list.
Index int32 `xml:"index,attr" json:"index"`
// Start and End are the optional line times; omitted when nil.
Start *int64 `xml:"start,attr,omitempty" json:"start,omitempty"`
End *int64 `xml:"end,attr,omitempty" json:"end,omitempty"`
// Value is the full text of the line; omitted when empty.
Value string `xml:"value,attr,omitempty" json:"value,omitempty"`
// Token lists the timed tokens of the line.
Token []LyricToken `xml:"token,omitempty" json:"token,omitempty"`
}
type StructuredLyric struct {
DisplayArtist string `xml:"displayArtist,attr,omitempty" json:"displayArtist,omitempty"`
DisplayTitle string `xml:"displayTitle,attr,omitempty" json:"displayTitle,omitempty"`
Lang string `xml:"lang,attr" json:"lang"`
Line []Line `xml:"line" json:"line"`
Offset *int64 `xml:"offset,attr,omitempty" json:"offset,omitempty"`
Synced bool `xml:"synced,attr" json:"synced"`
DisplayArtist string `xml:"displayArtist,attr,omitempty" json:"displayArtist,omitempty"`
DisplayTitle string `xml:"displayTitle,attr,omitempty" json:"displayTitle,omitempty"`
Kind string `xml:"kind,attr,omitempty" json:"kind,omitempty"`
Lang string `xml:"lang,attr" json:"lang"`
Line []Line `xml:"line" json:"line"`
TokenLine []TokenLine `xml:"tokenLine,omitempty" json:"tokenLine,omitempty"`
Offset *int64 `xml:"offset,attr,omitempty" json:"offset,omitempty"`
Synced bool `xml:"synced,attr" json:"synced"`
}
type StructuredLyrics []StructuredLyric

2
tests/fixtures/bom-test.ttml vendored Normal file
View File

@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8"?>
<tt xmlns="http://www.w3.org/ns/ttml"><body><div xml:lang="eng"><p begin="00:00:00.00">BOM test line</p></div></body></tt>

BIN
tests/fixtures/bom-utf16-test.ttml vendored Normal file

Binary file not shown.

25
tests/fixtures/test-metadata.ttml vendored Normal file
View File

@ -0,0 +1,25 @@
<?xml version="1.0" encoding="UTF-8"?>
<tt xmlns="http://www.w3.org/ns/ttml" xmlns:itunes="http://music.apple.com/lyric-ttml-internal">
<head>
<metadata>
<iTunesMetadata xmlns="http://music.apple.com/lyric-ttml-internal">
<translations>
<translation xml:lang="es">
<text for="L1">Hola</text>
</translation>
</translations>
<transliterations>
<transliteration xml:lang="ja-Latn">
<text for="L2"><span begin="00:02.000" end="00:02.300" xmlns="http://www.w3.org/ns/ttml">ko</span><span begin="00:02.300" end="00:02.600" xmlns="http://www.w3.org/ns/ttml">nni</span></text>
</transliteration>
</transliterations>
</iTunesMetadata>
</metadata>
</head>
<body xml:lang="ja">
<div>
<p begin="00:01.000" end="00:01.500" itunes:key="L1">こんにちは</p>
<p begin="00:02.000" end="00:02.700" itunes:key="L2">こんばんは</p>
</div>
</body>
</tt>

12
tests/fixtures/test.ttml vendored Normal file
View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttp="http://www.w3.org/ns/ttml#parameter" ttp:frameRate="30" ttp:subFrameRate="2" ttp:tickRate="10">
<body>
<div xml:lang="eng">
<p begin="00:00:18.80">We're no strangers to love</p>
<p begin="00:00:22:24">You know the rules and so do I</p>
</div>
<div xml:lang="por">
<p begin="188t">Nao somos estranhos ao amor</p>
</div>
</body>
</tt>

View File

@ -9,6 +9,7 @@ export const PLAYER_SET_VOLUME = 'PLAYER_SET_VOLUME'
export const PLAYER_SET_MODE = 'PLAYER_SET_MODE'
export const TRANSCODING_SET_PROFILE = 'TRANSCODING_SET_PROFILE'
export const PLAYER_REFRESH_QUEUE = 'PLAYER_REFRESH_QUEUE'
export const PLAYER_UPDATE_LYRIC = 'PLAYER_UPDATE_LYRIC'
export const setTrack = (data) => ({
type: PLAYER_SET_TRACK,
@ -114,3 +115,8 @@ export const refreshQueue = (resolvedUrls) => ({
type: PLAYER_REFRESH_QUEUE,
data: resolvedUrls,
})
// Action creator for PLAYER_UPDATE_LYRIC: carries the lyric text together
// with the id of the track it belongs to.
export const updateQueueLyric = (trackId, lyric) => {
  const data = { trackId, lyric }
  return { type: PLAYER_UPDATE_LYRIC, data }
}

File diff suppressed because it is too large Load Diff

View File

@ -22,6 +22,7 @@ import {
refreshQueue,
setPlayMode,
setTranscodingProfile,
updateQueueLyric,
setVolume,
syncQueue,
} from '../actions'
@ -33,6 +34,25 @@ import { keyMap } from '../hotkeys'
import keyHandlers from './keyHandlers'
import { calculateGain } from '../utils/calculateReplayGain'
import { detectBrowserProfile, decisionService } from '../transcode'
import {
getPreferredLyricLanguage,
hasStructuredLyricContent,
selectLyricLayers,
structuredLyricToLrc,
} from './lyrics'
import KaraokeLyricsOverlay from './KaraokeLyricsOverlay'
// Layer set with every slot (main, translation, pronunciation) set to null.
// Used as the initial and reset value for the selected lyric layers.
const emptyLyricLayers = {
main: null,
translation: null,
pronunciation: null,
}
// Returns a new layer object that always has the three keys main,
// translation and pronunciation. A missing `layers` argument or a falsy
// layer value is replaced by null.
const normalizeLyricLayers = (layers) => {
  const { main, translation, pronunciation } = layers ?? {}
  return {
    main: main || null,
    translation: translation || null,
    pronunciation: pronunciation || null,
  }
}
const Player = () => {
const theme = useCurrentTheme()
@ -120,6 +140,72 @@ const Player = () => {
const gainInfo = useSelector((state) => state.replayGain)
const [context, setContext] = useState(null)
const [gainNode, setGainNode] = useState(null)
const lyricCacheRef = useRef(new Map())
const lyricRequestIdRef = useRef(0)
const playerRef = useRef(null)
const [karaokeVisible, setKaraokeVisible] = useState(false)
const [selectedLyricLayers, setSelectedLyricLayers] =
useState(emptyLyricLayers)
const [showTranslation, setShowTranslation] = useState(false)
const [showPronunciation, setShowPronunciation] = useState(false)
const currentTrackId = playerState.current?.trackId
const currentTrackIsRadio = playerState.current?.isRadio
const selectedStructuredLyric = selectedLyricLayers.main
const hasKaraokeLyric = hasStructuredLyricContent(selectedStructuredLyric)
const hasTranslationLyric = hasStructuredLyricContent(
selectedLyricLayers.translation,
)
const hasPronunciationLyric = hasStructuredLyricContent(
selectedLyricLayers.pronunciation,
)
// Pushes a lyric string into the state of the player instance held in
// playerRef. Every audioLists entry whose trackId matches gets its `lyric`
// replaced, and the top-level `lyric` is set to the lyric of the entry whose
// musicSrc equals the current musicSrc. Does nothing when trackId is falsy or
// the ref has no setState function.
const applyLyricToRuntimePlayer = useCallback((trackId, lyric) => {
if (!trackId) {
return
}
const player = playerRef.current
if (!player || typeof player.setState !== 'function') {
return
}
player.setState((prevState) => {
// Tolerate a missing/non-array audioLists in the player state.
const prevLists = Array.isArray(prevState.audioLists)
? prevState.audioLists
: []
let changed = false
const audioLists = prevLists.map((item) => {
if (item.trackId !== trackId) {
return item
}
// Keep the same object when the lyric is already up to date.
if (item.lyric === lyric) {
return item
}
changed = true
return {
...item,
lyric,
}
})
// The entry currently loaded in the player is identified by musicSrc.
const currentItem = audioLists.find(
(item) => item.musicSrc === prevState.musicSrc,
)
// Fall back to the existing top-level lyric when the current entry has
// no string lyric.
const currentLyric =
typeof currentItem?.lyric === 'string'
? currentItem.lyric
: prevState.lyric
// Nothing changed: return null so no new state object is produced
// (presumably React then skips the update; confirm for this player).
if (!changed && currentLyric === prevState.lyric) {
return null
}
return {
audioLists,
lyric: currentLyric,
}
})
}, [])
useEffect(() => {
if (
@ -166,6 +252,107 @@ const Player = () => {
return () => window.removeEventListener('beforeunload', handleBeforeUnload)
}, [playerState, audioInstance])
// When the current track changes, reset the lyric UI state. With no track or
// a radio stream everything is cleared and the overlay is hidden; otherwise
// the layers are restored from the per-track cache when an entry exists.
useEffect(() => {
if (!currentTrackId || currentTrackIsRadio) {
setSelectedLyricLayers(emptyLyricLayers)
setShowTranslation(false)
setShowPronunciation(false)
setKaraokeVisible(false)
return
}
const cached = lyricCacheRef.current.get(currentTrackId)
let layers = emptyLyricLayers
// Cache entries may be a plain string (no layers) or an object holding
// either `layers` or a single `structuredLyric`.
if (cached && typeof cached !== 'string') {
if (cached.layers) {
layers = normalizeLyricLayers(cached.layers)
} else if (cached.structuredLyric) {
layers = normalizeLyricLayers({
main: cached.structuredLyric,
})
}
}
setSelectedLyricLayers(layers)
// Translation starts hidden; pronunciation starts visible when present.
setShowTranslation(false)
setShowPronunciation(hasStructuredLyricContent(layers.pronunciation))
}, [currentTrackId, currentTrackIsRadio])
// Loads lyrics for the current track. A cache hit is applied immediately;
// otherwise the structured lyrics are fetched, split into layers, cached and
// pushed to both the redux queue and the running player instance.
useEffect(() => {
// Bump the request counter on every run so that responses belonging to a
// previous track can be recognised and ignored below.
lyricRequestIdRef.current += 1
const requestId = lyricRequestIdRef.current
if (!currentTrackId || currentTrackIsRadio) {
return
}
const cached = lyricCacheRef.current.get(currentTrackId)
if (cached !== undefined) {
// A cache entry is either a plain LRC string or an object with `lrc`
// plus `layers` (or a single `structuredLyric`).
const cachedLyric =
typeof cached === 'string' ? cached : cached?.lrc || ''
const cachedLayers =
typeof cached === 'string'
? emptyLyricLayers
: cached?.layers
? normalizeLyricLayers(cached.layers)
: normalizeLyricLayers({ main: cached?.structuredLyric })
setSelectedLyricLayers(cachedLayers)
setShowTranslation(false)
setShowPronunciation(
hasStructuredLyricContent(cachedLayers.pronunciation),
)
if (cachedLyric) {
dispatch(updateQueueLyric(currentTrackId, cachedLyric))
applyLyricToRuntimePlayer(currentTrackId, cachedLyric)
}
return
}
subsonic
.getLyricsBySongId(currentTrackId)
.then((resp) => {
// Ignore the response if another request has been started since.
if (lyricRequestIdRef.current !== requestId) {
return
}
const structuredLyrics =
resp?.json?.['subsonic-response']?.lyricsList?.structuredLyrics || []
const layers = selectLyricLayers(
structuredLyrics,
getPreferredLyricLanguage(),
)
// An empty string is cached too when the track has no main layer.
const lyric = layers.main ? structuredLyricToLrc(layers.main) : ''
lyricCacheRef.current.set(currentTrackId, {
lrc: lyric,
layers,
})
setSelectedLyricLayers(layers)
setShowTranslation(false)
setShowPronunciation(hasStructuredLyricContent(layers.pronunciation))
if (lyric !== '') {
dispatch(updateQueueLyric(currentTrackId, lyric))
applyLyricToRuntimePlayer(currentTrackId, lyric)
}
})
.catch(() => {
if (lyricRequestIdRef.current !== requestId) {
return
}
setSelectedLyricLayers(emptyLyricLayers)
setShowTranslation(false)
setShowPronunciation(false)
// Do not cache network/request failures as empty lyrics, so we can retry.
lyricCacheRef.current.delete(currentTrackId)
})
}, [dispatch, currentTrackId, currentTrackIsRadio, applyLyricToRuntimePlayer])
// Hide the karaoke overlay as soon as the selected main lyric has no content.
useEffect(() => {
if (!hasKaraokeLyric && karaokeVisible) {
setKaraokeVisible(false)
}
}, [hasKaraokeLyric, karaokeVisible])
const defaultOptions = useMemo(
() => ({
theme: playerTheme,
@ -177,7 +364,7 @@ const Player = () => {
clearPriorAudioLists: false,
showDestroy: true,
showDownload: false,
showLyric: true,
showLyric: false,
showReload: false,
toggleMode: !isDesktop,
glassBg: false,
@ -214,12 +401,24 @@ const Player = () => {
(playerState.clear || playerState.playIndex === 0),
clearPriorAudioLists: playerState.clear,
extendsContent: (
<PlayerToolbar id={current.trackId} isRadio={current.isRadio} />
<PlayerToolbar
id={current.trackId}
isRadio={current.isRadio}
onToggleLyrics={() => setKaraokeVisible((visible) => !visible)}
lyricsActive={karaokeVisible}
lyricsDisabled={!hasKaraokeLyric}
/>
),
defaultVolume: isMobilePlayer ? 1 : playerState.volume,
showMediaSession: !current.isRadio,
}
}, [playerState, defaultOptions, isMobilePlayer])
}, [
playerState,
defaultOptions,
isMobilePlayer,
karaokeVisible,
hasKaraokeLyric,
])
const onAudioListsChange = useCallback(
(_, audioLists, audioInfo) => dispatch(syncQueue(audioInfo, audioLists)),
@ -391,6 +590,7 @@ const Player = () => {
return (
<ThemeProvider theme={createMuiTheme(theme)}>
<ReactJkMusicPlayer
ref={playerRef}
{...options}
className={classes.player}
onAudioListsChange={onAudioListsChange}
@ -406,6 +606,28 @@ const Player = () => {
onBeforeDestroy={onBeforeDestroy}
getAudioInstance={setAudioInstance}
/>
<KaraokeLyricsOverlay
visible={karaokeVisible}
mainLyric={selectedLyricLayers.main}
translationLyric={selectedLyricLayers.translation}
pronunciationLyric={selectedLyricLayers.pronunciation}
showTranslation={showTranslation}
showPronunciation={showPronunciation}
translationEnabled={hasTranslationLyric}
pronunciationEnabled={hasPronunciationLyric}
onToggleTranslation={() =>
setShowTranslation((previous) =>
hasTranslationLyric ? !previous : false,
)
}
onTogglePronunciation={() =>
setShowPronunciation((previous) =>
hasPronunciationLyric ? !previous : false,
)
}
audioInstance={audioInstance}
onClose={() => setKaraokeVisible(false)}
/>
<GlobalHotKeys handlers={handlers} keyMap={keyMap} allowChanges />
</ThemeProvider>
)

View File

@ -4,7 +4,9 @@ import { useGetOne } from 'react-admin'
import { GlobalHotKeys } from 'react-hotkeys'
import IconButton from '@material-ui/core/IconButton'
import { useMediaQuery } from '@material-ui/core'
import Tooltip from '@material-ui/core/Tooltip'
import { RiSaveLine } from 'react-icons/ri'
import { RiFileMusicLine } from 'react-icons/ri'
import { LoveButton, useToggleLove } from '../common'
import { openSaveQueueDialog } from '../actions'
import { keyMap } from '../hotkeys'
@ -55,7 +57,13 @@ const useStyles = makeStyles((theme) => ({
},
}))
const PlayerToolbar = ({ id, isRadio }) => {
const PlayerToolbar = ({
id,
isRadio,
onToggleLyrics,
lyricsActive = false,
lyricsDisabled = false,
}) => {
const dispatch = useDispatch()
const { data, loading } = useGetOne('song', id, { enabled: !!id && !isRadio })
const [toggleLove, toggling] = useToggleLove('song', data)
@ -99,6 +107,25 @@ const PlayerToolbar = ({ id, isRadio }) => {
/>
)
const toggleLyricsButton = (
<Tooltip title="Toggle synchronized lyrics">
<span>
<IconButton
size={isDesktop ? 'small' : undefined}
onClick={onToggleLyrics}
disabled={!onToggleLyrics || lyricsDisabled}
data-testid="toggle-lyrics-button"
className={buttonClass}
color={lyricsActive ? 'primary' : 'default'}
>
<RiFileMusicLine
className={!isDesktop ? classes.mobileIcon : undefined}
/>
</IconButton>
</span>
</Tooltip>
)
return (
<>
<GlobalHotKeys keyMap={keyMap} handlers={handlers} allowChanges />
@ -106,11 +133,13 @@ const PlayerToolbar = ({ id, isRadio }) => {
<li className={`${listItemClass} item`}>
{saveQueueButton}
{loveButton}
{toggleLyricsButton}
</li>
) : (
<>
<li className={`${listItemClass} item`}>{saveQueueButton}</li>
<li className={`${listItemClass} item`}>{loveButton}</li>
<li className={`${listItemClass} item`}>{toggleLyricsButton}</li>
</>
)}
</>

View File

@ -71,6 +71,7 @@ describe('<PlayerToolbar />', () => {
// Verify both buttons are rendered
expect(screen.getByTestId('save-queue-button')).toBeInTheDocument()
expect(screen.getByTestId('love-button')).toBeInTheDocument()
expect(screen.getByTestId('toggle-lyrics-button')).toBeInTheDocument()
// Verify desktop classes are applied
expect(listItems[0].className).toContain('toolbar')
@ -102,6 +103,14 @@ describe('<PlayerToolbar />', () => {
type: 'OPEN_SAVE_QUEUE_DIALOG',
})
})
it('triggers lyric toggle callback when lyrics button is clicked', () => {
const onToggleLyrics = vi.fn()
render(<PlayerToolbar id="song-1" onToggleLyrics={onToggleLyrics} />)
fireEvent.click(screen.getByTestId('toggle-lyrics-button'))
expect(onToggleLyrics).toHaveBeenCalledTimes(1)
})
})
describe('Mobile layout', () => {
@ -114,11 +123,12 @@ describe('<PlayerToolbar />', () => {
// Each button should be in its own list item
const listItems = screen.getAllByRole('listitem')
expect(listItems).toHaveLength(2)
expect(listItems).toHaveLength(3)
// Verify both buttons are rendered
expect(screen.getByTestId('save-queue-button')).toBeInTheDocument()
expect(screen.getByTestId('love-button')).toBeInTheDocument()
expect(screen.getByTestId('toggle-lyrics-button')).toBeInTheDocument()
// Verify mobile classes are applied
expect(listItems[0].className).toContain('mobileListItem')
@ -140,6 +150,13 @@ describe('<PlayerToolbar />', () => {
const loveButton = screen.getByTestId('love-button')
expect(loveButton).toBeDisabled()
})
it('disables lyrics button when lyrics are unavailable', () => {
render(<PlayerToolbar id="song-1" lyricsDisabled={true} />)
const lyricsButton = screen.getByTestId('toggle-lyrics-button')
expect(lyricsButton).toBeDisabled()
})
})
describe('Common behavior', () => {

View File

@ -0,0 +1,617 @@
// Normalizes a language tag for comparison: lowercases it and turns every
// underscore separator into a hyphen (e.g. "zh_Hant_TW" -> "zh-hant-tw").
// A missing or empty tag normalizes to the empty string.
const normalizeLanguageTag = (language) =>
  // A string pattern would only replace the first underscore, so use a
  // global regular expression.
  (language || '').toLowerCase().replace(/_/g, '-')
// Tolerance, in milliseconds, added when getActiveKaraokeState compares the
// playback position against line and token boundaries.
const KARAOKE_SWITCH_EPSILON_MS = 18
// Values of a structured lyric's `kind` field recognised by
// normalizeLyricKind; anything else is treated as the main track.
const LYRIC_KIND_MAIN = 'main'
const LYRIC_KIND_TRANSLATION = 'translation'
const LYRIC_KIND_PRONUNCIATION = 'pronunciation'
// Renders a value as text and prefixes it with "0" when the text is exactly
// one character long; longer (or empty) text is returned as is.
const padTime = (value) => {
  const text = value.toString()
  if (text.length !== 1) {
    return text
  }
  return '0' + text
}
// Converts a raw timing value into a finite number, or null when the value
// carries no usable timing.
//
// Only numbers and non-blank strings are accepted. `Number()` coerces null,
// '' and false to 0, which would turn a missing timestamp into "0 ms" and
// defeat every `toTime(a) ?? fallback` chain in this module, so those inputs
// are rejected up front.
const toTime = (value) => {
  if (typeof value === 'string') {
    if (value.trim() === '') {
      return null
    }
  } else if (typeof value !== 'number') {
    return null
  }
  const numeric = Number(value)
  return Number.isFinite(numeric) ? numeric : null
}
// Sort comparator for timestamps that may be null/undefined: missing values
// order after present ones, two missing values compare equal, and two
// present values compare by numeric difference.
const compareNullableTime = (a, b) => {
  const aMissing = a == null
  const bMissing = b == null
  if (aMissing && bMissing) {
    return 0
  }
  if (aMissing || bMissing) {
    return aMissing ? 1 : -1
  }
  return a - b
}
// Returns a new array of shallow token copies ordered by start time, then by
// end time, then by original position. Missing times sort last (see
// compareNullableTime). The temporary `order` field used for the final
// tie-break is stripped from the returned tokens.
const sortTokensByStart = (tokens) => {
  const decorated = tokens.map((token, order) => ({ ...token, order }))
  decorated.sort((left, right) => {
    for (const key of ['start', 'end']) {
      const delta = compareNullableTime(left[key], right[key])
      if (delta !== 0) {
        return delta
      }
    }
    return left.order - right.order
  })
  return decorated.map(({ order, ...token }) => token)
}
// Reports whether two already-normalized language tags refer to the same
// language: they are equal, or one is the other followed by a "-subtag".
// An empty or missing tag never matches.
const languageMatch = (candidate, preferred) => {
  if (!candidate || !preferred) {
    return false
  }
  if (candidate === preferred) {
    return true
  }
  const candidateExtendsPreferred = candidate.startsWith(`${preferred}-`)
  return candidateExtendsPreferred || preferred.startsWith(`${candidate}-`)
}
// Truthy when the lyric is flagged as synced and at least one of its lines
// has a start value that converts to a finite number. A falsy `lyric` or
// `lyric.synced` is returned unchanged, so callers should only rely on the
// truthiness of the result.
const hasTimedLines = (lyric) => {
  if (!lyric) {
    return lyric
  }
  if (!lyric.synced) {
    return lyric.synced
  }
  if (!Array.isArray(lyric.line)) {
    return false
  }
  return lyric.line.some((line) => Number.isFinite(Number(line.start)))
}
// Normalizes one raw token into { start, end, value, role }.
// Returns null for a missing token or one whose text is not a string or is
// blank. Timings go through toTime; a non-string role becomes ''.
const normalizeToken = (token) => {
  const text = token && typeof token.value === 'string' ? token.value : ''
  if (text.trim() === '') {
    return null
  }
  const role = typeof token.role === 'string' ? token.role : ''
  return {
    start: toTime(token.start),
    end: toTime(token.end),
    value: text,
    role,
  }
}
// Normalizes one raw tokenLine entry into { index, start, end, value, tokens }.
// `index` falls back to `fallbackIndex` when the entry's own index does not
// convert to a finite number; `tokens` are normalized, emptied of unusable
// entries and sorted by sortTokensByStart.
const normalizeTokenLine = (tokenLine, fallbackIndex) => {
  const parsedIndex = Number(tokenLine?.index)
  const rawTokens = Array.isArray(tokenLine?.token) ? tokenLine.token : []
  const usableTokens = rawTokens.map(normalizeToken).filter(Boolean)
  const text = typeof tokenLine?.value === 'string' ? tokenLine.value : ''
  return {
    index: Number.isFinite(parsedIndex) ? parsedIndex : fallbackIndex,
    start: toTime(tokenLine?.start),
    end: toTime(tokenLine?.end),
    value: text,
    tokens: sortTokensByStart(usableTokens),
  }
}
// Maps a raw `kind` value onto one of the LYRIC_KIND_* constants.
// Matching ignores case and surrounding whitespace; anything that is not a
// translation or pronunciation (including a missing kind) counts as main.
const normalizeLyricKind = (kind) => {
  const normalized = (kind || '').toLowerCase().trim()
  const isSecondaryLayer =
    normalized === LYRIC_KIND_TRANSLATION ||
    normalized === LYRIC_KIND_PRONUNCIATION
  return isSecondaryLayer ? normalized : LYRIC_KIND_MAIN
}
// Picks the lyric track that best fits the preferred language.
//
// Candidates are tried in this order, returning the first hit:
//   1. the full preferred tag (e.g. "pt-br"),
//   2. its primary subtag (e.g. "pt"),
//   3. English, as either "en" or the three-letter "eng",
//   4. the first track in the list.
// Returns null when `lyrics` is not a non-empty array.
//
// NOTE(review): a two-letter preference is still not mapped onto its
// three-letter equivalent (e.g. "pt" vs "por"); only the English fallback
// handles both spellings.
const pickLyricByLanguage = (lyrics, preferredLanguage) => {
  if (!Array.isArray(lyrics) || lyrics.length === 0) {
    return null
  }
  const preferred = normalizeLanguageTag(preferredLanguage)
  const preferredBase = preferred.split('-')[0]
  const findByLanguage = (wanted) =>
    lyrics.find((lyric) =>
      languageMatch(normalizeLanguageTag(lyric?.lang), wanted),
    )
  return (
    findByLanguage(preferred) ||
    findByLanguage(preferredBase) ||
    findByLanguage('en') ||
    // languageMatch('eng', 'en') is false, so three-letter tags need their
    // own pass; otherwise English is only chosen when it happens to be first.
    findByLanguage('eng') ||
    lyrics[0]
  )
}
// Returns the { start, end } window of lines[index]. When the line has no
// usable end, the start of the following line is used instead. A missing
// line yields { start: null, end: null }.
const lineTimeWindow = (lines, index) => {
  const current = lines[index]
  if (!current) {
    return { start: null, end: null }
  }
  const ownEnd = toTime(current.end)
  return {
    start: toTime(current.start),
    end: ownEnd ?? toTime(lines[index + 1]?.start),
  }
}
// Splits a line that carries a single whole-line token into evenly timed
// per-word tokens.
//
// Returns null (meaning "keep the original token") when:
//   - the line text is blank or has fewer than two whitespace-separated words,
//   - the token text is blank,
//   - the token text, ignoring case and whitespace, is neither equal to the
//     line text nor at least 80% of its length,
//   - no positive-length time span can be derived from the token or the line.
// Otherwise each word (with its trailing whitespace) receives an equal share
// of the span and inherits the token's role.
const buildSyntheticWordTokens = (line, token) => {
  const lineText = typeof line?.value === 'string' ? line.value : ''
  if (!lineText.trim()) {
    return null
  }
  const pieces = lineText.match(/\S+\s*/g) || []
  if (pieces.length < 2) {
    return null
  }

  const squash = (text) => text.replace(/\s+/g, ' ').trim().toLowerCase()
  const stripSpaces = (text) => text.replace(/\s+/g, '')

  const lineNorm = squash(lineText)
  const tokenNorm = squash(token?.value || '')
  if (!tokenNorm || !lineNorm) {
    return null
  }

  const lineKey = stripSpaces(lineNorm)
  const tokenKey = stripSpaces(tokenNorm)
  const coversLine =
    tokenKey === lineKey ||
    tokenKey.length >= Math.floor(lineKey.length * 0.8)
  if (!coversLine) {
    return null
  }

  // Token timing wins; the line's own timing is the fallback.
  const spanStart = toTime(token?.start) ?? toTime(line?.start)
  const spanEnd = toTime(token?.end) ?? toTime(line?.end)
  const spanKnown = [spanStart, spanEnd].every(
    (time) => time != null && Number.isFinite(time),
  )
  if (!spanKnown || spanEnd <= spanStart) {
    return null
  }

  const duration = spanEnd - spanStart
  const role = typeof token?.role === 'string' ? token.role : ''
  return pieces.map((piece, position) => ({
    start: spanStart + (duration * position) / pieces.length,
    end: spanStart + (duration * (position + 1)) / pieces.length,
    value: piece,
    role,
  }))
}
// True when the structured lyric has at least one tokenLine containing a
// token whose start converts to a finite number.
export const hasTokenTiming = (structuredLyric) => {
  if (!structuredLyric || !Array.isArray(structuredLyric.tokenLine)) {
    return false
  }
  const isTimed = (token) => Number.isFinite(Number(token?.start))
  return structuredLyric.tokenLine.some((tokenLine) => {
    const tokens = tokenLine?.token
    return Array.isArray(tokens) && tokens.some((token) => isTimed(token))
  })
}
// True when the structured lyric has something to display: at least one line
// with non-blank text, or token-level timing (see hasTokenTiming).
export const hasStructuredLyricContent = (structuredLyric) => {
  if (!structuredLyric) {
    return false
  }
  const lines = structuredLyric.line
  const hasText =
    Array.isArray(lines) &&
    lines.some(
      (line) => typeof line?.value === 'string' && line.value.trim() !== '',
    )
  return hasText || hasTokenTiming(structuredLyric)
}
// Resolves the language used to choose between lyric tracks.
//
// Order of precedence: the `locale` entry in localStorage, then
// `navigator.language`, then 'en'.
export const getPreferredLyricLanguage = () => {
  try {
    if (typeof window !== 'undefined' && window.localStorage) {
      const stored = window.localStorage.getItem('locale')
      if (stored) {
        return stored
      }
    }
  } catch {
    // Both the `localStorage` getter and `getItem` can throw when storage is
    // blocked (e.g. a SecurityError); treat that as "no stored locale" and
    // fall through instead of failing the lyric load.
  }
  if (typeof navigator !== 'undefined' && navigator.language) {
    return navigator.language
  }
  return 'en'
}
// Chooses the { main, translation, pronunciation } tracks to display.
//
// Only tracks accepted by hasTimedLines are considered. Tracks are grouped by
// normalized kind and one track per group is picked by language. When no
// track is of kind "main", the main layer is picked from every synced track.
// All three layers are null when the input is not an array or holds no
// synced track.
export const selectLyricLayers = (structuredLyrics, preferredLanguage) => {
  const noLayers = () => ({
    main: null,
    translation: null,
    pronunciation: null,
  })
  if (!Array.isArray(structuredLyrics)) {
    return noLayers()
  }
  const synced = structuredLyrics.filter(hasTimedLines)
  if (synced.length === 0) {
    return noLayers()
  }

  const byKind = synced.reduce(
    (groups, lyric) => {
      groups[normalizeLyricKind(lyric?.kind)].push(lyric)
      return groups
    },
    {
      [LYRIC_KIND_MAIN]: [],
      [LYRIC_KIND_TRANSLATION]: [],
      [LYRIC_KIND_PRONUNCIATION]: [],
    },
  )

  const explicitMain = byKind[LYRIC_KIND_MAIN]
  const pick = (candidates) => pickLyricByLanguage(candidates, preferredLanguage)
  return {
    main: pick(explicitMain.length ? explicitMain : synced),
    translation: pick(byKind[LYRIC_KIND_TRANSLATION]),
    pronunciation: pick(byKind[LYRIC_KIND_PRONUNCIATION]),
  }
}
// Returns only the main layer chosen by selectLyricLayers (null when there is
// no synced track).
export const pickStructuredLyric = (structuredLyrics, preferredLanguage) => {
  const { main } = selectLyricLayers(structuredLyrics, preferredLanguage)
  return main
}
// Serializes one structured lyric track as LRC text, one
// "[mm:ss.xx] text\n" row per timed line (start is in milliseconds and is
// truncated to hundredths of a second).
//
// Lines without a usable start (missing, null, non-numeric or negative) are
// skipped. Returns '' when the track is missing or has no line array.
export const structuredLyricToLrc = (structuredLyric) => {
  if (!structuredLyric || !Array.isArray(structuredLyric.line)) {
    return ''
  }
  const twoDigits = (amount) => String(amount).padStart(2, '0')
  const rows = []
  for (const line of structuredLyric.line) {
    // Number(null) is 0, which would place an untimed line at 00:00.00.
    if (line?.start == null) {
      continue
    }
    const start = Number(line.start)
    if (!Number.isFinite(start) || start < 0) {
      continue
    }
    const totalHundredths = Math.floor(start / 10)
    const hundredths = totalHundredths % 100
    const totalSeconds = Math.floor(totalHundredths / 100)
    const seconds = totalSeconds % 60
    // Minutes are deliberately not wrapped at 60: LRC has no hour field, so a
    // line at 1h01m must be written as minute 61, not minute 01.
    const minutes = Math.floor(totalSeconds / 60)
    rows.push(
      `[${twoDigits(minutes)}:${twoDigits(seconds)}.${twoDigits(hundredths)}] ${line.value || ''}\n`,
    )
  }
  return rows.join('')
}
// Picks the main synced track for the preferred language and serializes it
// as LRC text; returns '' when no track qualifies.
export const structuredLyricsToLrc = (structuredLyrics, preferredLanguage) => {
  const chosen = pickStructuredLyric(structuredLyrics, preferredLanguage)
  return chosen ? structuredLyricToLrc(chosen) : ''
}
// Converts one structured lyric track into the list of lines rendered by the
// karaoke overlay. Each returned line is { index, start, end, value, tokens }.
//
// When the track has tokenLine entries they drive the result; otherwise the
// plain `line` entries are used and every line gets an empty token list.
// Returns [] for a missing track.
export const buildKaraokeLines = (structuredLyric) => {
if (!structuredLyric) {
return []
}
const baseLines = Array.isArray(structuredLyric.line)
? structuredLyric.line
: []
const rawTokenLines = Array.isArray(structuredLyric.tokenLine)
? structuredLyric.tokenLine
: []
const lines =
rawTokenLines.length > 0
? rawTokenLines.map((tokenLine, fallbackIndex) => {
const normalized = normalizeTokenLine(tokenLine, fallbackIndex)
// The plain line this token line points at, used to fill in gaps.
const baseLine = baseLines[normalized.index] || {}
const tokens = normalized.tokens
// Last-resort timing: first token with a start, last token with an end.
const fallbackStart =
tokens.find((token) => token.start != null)?.start ?? null
const fallbackEnd =
[...tokens].reverse().find((token) => token.end != null)?.end ??
null
// Text preference: token line text, then plain line text, then the
// concatenated token values.
const value =
normalized.value ||
(typeof baseLine.value === 'string' ? baseLine.value : '') ||
tokens.map((token) => token.value).join('')
return {
index: normalized.index,
start: normalized.start ?? toTime(baseLine.start) ?? fallbackStart,
end: normalized.end ?? toTime(baseLine.end) ?? fallbackEnd,
value,
tokens,
}
})
: baseLines.map((line, index) => ({
index,
start: toTime(line.start),
end: toTime(line.end),
value: typeof line.value === 'string' ? line.value : '',
tokens: [],
}))
const normalized = lines
// Drop lines that have neither text nor tokens.
.filter((line) => line.value || line.tokens.length > 0)
// Order by start time; lines without a start go last; ties keep index order.
.sort((a, b) => {
if (a.start == null && b.start == null) {
return a.index - b.index
}
if (a.start == null) {
return 1
}
if (b.start == null) {
return -1
}
if (a.start !== b.start) {
return a.start - b.start
}
return a.index - b.index
})
.map((line) => {
const nextLine = { ...line }
// A lone token may really be the whole line; try to split it into
// evenly timed word tokens (kept as is when the split is rejected).
if (nextLine.tokens.length === 1) {
const syntheticTokens = buildSyntheticWordTokens(
nextLine,
nextLine.tokens[0],
)
if (syntheticTokens) {
nextLine.tokens = syntheticTokens
}
}
return nextLine
})
// A line still missing its end borrows the start of the line that follows.
for (let i = 0; i < normalized.length; i += 1) {
if (normalized[i].end == null) {
const nextStart = normalized[i + 1]?.start
if (nextStart != null) {
normalized[i].end = nextStart
}
}
}
return normalized
}
// Resolves the { start, end } time window of the token at `tokenIndex` within
// a karaoke line, filling in missing or unusable token timing from
// neighbouring tokens and from the line's own window.
//
// `lineEndFallback` is used as the line end when the line itself has none.
// Returns { start: null, end: null } when the token does not exist.
export const resolveKaraokeTokenWindow = (
line,
tokenIndex,
lineEndFallback = null,
) => {
const tokens = Array.isArray(line?.tokens) ? line.tokens : []
const token = tokens[tokenIndex]
if (!token) {
return { start: null, end: null }
}
const prevToken = tokenIndex > 0 ? tokens[tokenIndex - 1] : null
const nextToken =
tokenIndex + 1 < tokens.length ? tokens[tokenIndex + 1] : null
const lineStart = toTime(line?.start)
const lineEnd = toTime(line?.end) ?? toTime(lineEndFallback)
const tokenCount = tokens.length
// The line window is usable only when both ends are known and it has a
// positive length.
const hasLineWindow =
lineStart != null &&
lineEnd != null &&
Number.isFinite(lineStart) &&
Number.isFinite(lineEnd) &&
lineEnd > lineStart
// Estimated slot: the line window divided evenly between its tokens.
const estimatedStart =
hasLineWindow && tokenCount > 0
? lineStart + ((lineEnd - lineStart) * tokenIndex) / tokenCount
: null
const estimatedEnd =
hasLineWindow && tokenCount > 0
? lineStart + ((lineEnd - lineStart) * (tokenIndex + 1)) / tokenCount
: null
// Count how many tokens carry explicit timing and how many distinct values
// those timings take.
let explicitStartCount = 0
let explicitEndCount = 0
const uniqueStarts = new Set()
const uniqueEnds = new Set()
for (let i = 0; i < tokenCount; i += 1) {
const explicitStart = toTime(tokens[i]?.start)
if (explicitStart != null) {
explicitStartCount += 1
uniqueStarts.add(explicitStart)
}
const explicitEnd = toTime(tokens[i]?.end)
if (explicitEnd != null) {
explicitEndCount += 1
uniqueEnds.add(explicitEnd)
}
}
// Timing is "collapsed" when several tokens are timed but share very few
// distinct values (at most a quarter of the token count, minimum one).
const collapsedStarts =
explicitStartCount > 1 && uniqueStarts.size <= Math.max(1, tokenCount / 4)
const collapsedEnds =
explicitEndCount > 1 && uniqueEnds.size <= Math.max(1, tokenCount / 4)
// Collapsed timing is ignored in favour of the evenly divided estimate.
const shouldForceEstimated =
hasLineWindow && tokenCount > 1 && (collapsedStarts || collapsedEnds)
if (shouldForceEstimated) {
return {
start: estimatedStart,
end: estimatedEnd,
}
}
// Missing start: previous token's end (or start), then the estimate, then
// the line start.
const prevEnd = toTime(prevToken?.end) ?? toTime(prevToken?.start)
let start = toTime(token.start)
if (start == null) {
start = prevEnd ?? estimatedStart ?? lineStart
}
// Missing end: next token's start, then the estimate, then the line end.
let end = toTime(token.end)
if (end == null) {
const nextDirectStart = toTime(nextToken?.start)
const nextEstimatedStart =
hasLineWindow && tokenIndex + 1 < tokenCount
? lineStart + ((lineEnd - lineStart) * (tokenIndex + 1)) / tokenCount
: null
end = nextDirectStart ?? nextEstimatedStart ?? estimatedEnd ?? lineEnd
}
// A lone token with a missing or near-zero window spans the whole line.
if (
tokenCount === 1 &&
hasLineWindow &&
(start == null || end == null || end <= start + 1)
) {
start = lineStart
end = lineEnd
}
// Never report a window that ends before it starts.
if (start != null && end != null && end < start) {
end = start
}
return { start, end }
}
// Finds the line and token that are active at `currentTimeMs`.
//
// Returns { lineIndex, tokenIndex }. Both are -1 when `lines` is not a
// non-empty array; tokenIndex is -1 when the active line has no token that
// has started yet. A playback time that is not a finite number is treated as
// 0. All comparisons allow KARAOKE_SWITCH_EPSILON_MS of slack.
export const getActiveKaraokeState = (lines, currentTimeMs) => {
if (!Array.isArray(lines) || lines.length === 0) {
return { lineIndex: -1, tokenIndex: -1 }
}
const current = Number.isFinite(Number(currentTimeMs))
? Number(currentTimeMs)
: 0
// Pass 1: advance to the last line that has already started (lines without a
// start are passed over), stopping at the first line that starts later.
let lineIndex = 0
for (let i = 0; i < lines.length; i += 1) {
const lineStart = toTime(lines[i]?.start)
if (lineStart == null || lineStart <= current + KARAOKE_SWITCH_EPSILON_MS) {
lineIndex = i
continue
}
break
}
// Pass 2: walk backwards from that line and settle on the nearest line whose
// window still covers the current time. When none does, the result of pass 1
// is kept.
for (let i = lineIndex; i >= 0; i -= 1) {
const lineStart = toTime(lines[i]?.start)
const lineEnd = toTime(lines[i]?.end) ?? toTime(lines[i + 1]?.start)
if (lineStart != null && current + KARAOKE_SWITCH_EPSILON_MS < lineStart) {
continue
}
if (lineEnd == null || current <= lineEnd + KARAOKE_SWITCH_EPSILON_MS) {
lineIndex = i
break
}
}
const activeLine = lines[lineIndex] || null
const tokens = Array.isArray(activeLine?.tokens) ? activeLine.tokens : []
// Token scan: keep the latest token that has started, and stop as soon as
// one is found whose window still covers the current time.
let tokenIndex = -1
for (let i = 0; i < tokens.length; i += 1) {
const { start: tokenStart, end: tokenEnd } = resolveKaraokeTokenWindow(
activeLine,
i,
lines[lineIndex + 1]?.start,
)
if (
tokenStart == null ||
tokenStart <= current + KARAOKE_SWITCH_EPSILON_MS
) {
tokenIndex = i
if (tokenEnd != null && current <= tokenEnd + KARAOKE_SWITCH_EPSILON_MS) {
break
}
continue
}
break
}
return { lineIndex, tokenIndex }
}
// Finds the index of the layer line (translation or pronunciation) that
// belongs to mainLines[mainIndex], matching by timing rather than position.
//
// Every layer line with a known start is scored by its start-time distance
// to the main line plus a penalty per position of index distance: 30 when
// the two windows overlap, 45 otherwise. Non-overlapping lines further away
// than `maxDelta` are ignored. The lowest score wins, the earliest line
// winning ties. Returns -1 when the inputs are unusable, the main line has
// no start, or nothing qualifies.
export const findLayerLineIndexForMain = (mainLines, layerLines, mainIndex) => {
  const listsUsable =
    Array.isArray(mainLines) &&
    Array.isArray(layerLines) &&
    mainLines.length !== 0 &&
    layerLines.length !== 0
  if (!listsUsable || mainIndex < 0 || mainIndex >= mainLines.length) {
    return -1
  }

  const mainWindow = lineTimeWindow(mainLines, mainIndex)
  const mainStart = mainWindow.start
  const mainEnd = mainWindow.end
  if (mainStart == null) {
    return -1
  }

  // Allowed start-time distance for lines that do not overlap the main line:
  // the main line's duration plus 420 ms, clamped to [550, 1400] ms.
  const mainDuration = Math.max(0, (mainEnd ?? mainStart) - mainStart)
  const maxDelta = Math.max(550, Math.min(1400, mainDuration + 420))

  let bestIdx = -1
  let bestScore = Number.POSITIVE_INFINITY
  const consider = (candidateIdx, score) => {
    if (score < bestScore) {
      bestScore = score
      bestIdx = candidateIdx
    }
  }

  layerLines.forEach((_, i) => {
    const { start, end } = lineTimeWindow(layerLines, i)
    if (start == null) {
      return
    }
    const startDistance = Math.abs(start - mainStart)
    const indexDistance = Math.abs(i - mainIndex)
    if (end != null) {
      const overlap = Math.min(end, mainEnd ?? end) - Math.max(start, mainStart)
      if (overlap >= 0) {
        consider(i, startDistance + indexDistance * 30)
        return
      }
    }
    if (startDistance > maxDelta) {
      return
    }
    consider(i, startDistance + indexDistance * 45)
  })

  return bestIdx
}
// Like findLayerLineIndexForMain, but returns both the matched index and the
// matched layer line; `line` is null when there is no match (index -1).
export const resolveLayerLineForMain = (mainLines, layerLines, mainIndex) => {
  const index = findLayerLineIndexForMain(mainLines, layerLines, mainIndex)
  if (index < 0) {
    return { index, line: null }
  }
  return { index, line: layerLines[index] }
}

View File

@ -0,0 +1,416 @@
import {
buildKaraokeLines,
findLayerLineIndexForMain,
getPreferredLyricLanguage,
getActiveKaraokeState,
hasStructuredLyricContent,
pickStructuredLyric,
resolveKaraokeTokenWindow,
resolveLayerLineForMain,
selectLyricLayers,
structuredLyricToLrc,
structuredLyricsToLrc,
} from './lyrics'
describe('lyrics helpers', () => {
beforeEach(() => {
localStorage.clear()
})
it('prefers a lyric track that matches the locale', () => {
const selected = pickStructuredLyric(
[
{
lang: 'eng',
synced: true,
line: [{ start: 1000, value: 'English line' }],
},
{
lang: 'pt-BR',
synced: true,
line: [{ start: 1000, value: 'Linha em portugues' }],
},
],
'pt-BR',
)
expect(selected.lang).toBe('pt-BR')
})
it('falls back to english when preferred locale is not available', () => {
const selected = pickStructuredLyric(
[
{
lang: 'eng',
synced: true,
line: [{ start: 1000, value: 'English line' }],
},
{
lang: 'deu',
synced: true,
line: [{ start: 1000, value: 'Deutsche Zeile' }],
},
],
'pt-BR',
)
expect(selected.lang).toBe('eng')
})
it('falls back to first synced track when english is missing', () => {
const selected = pickStructuredLyric(
[
{
lang: 'jpn',
synced: true,
line: [{ start: 1000, value: 'Nihongo' }],
},
{
lang: 'deu',
synced: true,
line: [{ start: 1000, value: 'Deutsch' }],
},
],
'pt-BR',
)
expect(selected.lang).toBe('jpn')
})
it('selects translation and pronunciation layers by kind', () => {
const layers = selectLyricLayers(
[
{
kind: 'main',
lang: 'ja',
synced: true,
line: [{ start: 1000, value: 'こんにちは' }],
},
{
kind: 'translation',
lang: 'es',
synced: true,
line: [{ start: 1000, value: 'Hola' }],
},
{
kind: 'pronunciation',
lang: 'ja-Latn',
synced: true,
line: [{ start: 1000, value: 'konnichiwa' }],
},
],
'es-MX',
)
expect(layers.main.lang).toBe('ja')
expect(layers.translation.lang).toBe('es')
expect(layers.pronunciation.lang).toBe('ja-Latn')
})
it('treats missing kind as main for backward compatibility', () => {
const layers = selectLyricLayers(
[
{
lang: 'eng',
synced: true,
line: [{ start: 1000, value: 'Main' }],
},
],
'eng',
)
expect(layers.main.lang).toBe('eng')
expect(layers.translation).toBeNull()
expect(layers.pronunciation).toBeNull()
})
it('matches layer line by timing for the active main line', () => {
const mainLines = [
{ index: 0, start: 1000, end: 1800, value: 'Line A', tokens: [] },
{ index: 1, start: 2000, end: 2800, value: 'Line B', tokens: [] },
]
const layerLines = [
{ index: 0, start: 900, end: 1750, value: 'A2', tokens: [] },
{ index: 1, start: 2050, end: 2900, value: 'B2', tokens: [] },
]
expect(findLayerLineIndexForMain(mainLines, layerLines, 1)).toBe(1)
expect(resolveLayerLineForMain(mainLines, layerLines, 0).line.value).toBe(
'A2',
)
})
it('matches metadata layers by nearest timing even when indexes differ', () => {
const mainLines = [
{ index: 0, start: 1000, end: 1800, value: 'Line A', tokens: [] },
{ index: 1, start: 2000, end: 2800, value: 'Line B', tokens: [] },
{ index: 2, start: 3000, end: 3800, value: 'Line C', tokens: [] },
]
const layerLines = [
{ index: 2, start: 3020, end: 3820, value: 'C2', tokens: [] },
{ index: 0, start: 980, end: 1760, value: 'A2', tokens: [] },
{ index: 1, start: 2010, end: 2810, value: 'B2', tokens: [] },
]
expect(findLayerLineIndexForMain(mainLines, layerLines, 1)).toBe(2)
expect(resolveLayerLineForMain(mainLines, layerLines, 2).line.value).toBe(
'C2',
)
})
it('returns no layer match when the nearest line is too far in time', () => {
const mainLines = [
{ index: 0, start: 1000, end: 1800, value: 'Line A', tokens: [] },
{ index: 1, start: 2000, end: 2800, value: 'Line B', tokens: [] },
]
const layerLines = [
{ index: 0, start: 60000, end: 60800, value: 'Far line', tokens: [] },
]
expect(findLayerLineIndexForMain(mainLines, layerLines, 1)).toBe(-1)
expect(resolveLayerLineForMain(mainLines, layerLines, 1).line).toBeNull()
})
it('converts a structured lyric track to LRC', () => {
const lrc = structuredLyricToLrc({
lang: 'eng',
synced: true,
line: [
{ start: 18800, value: "We're no strangers to love" },
{ start: 22801, value: 'You know the rules and so do I' },
],
})
expect(lrc).toBe(
"[00:18.80] We're no strangers to love\n[00:22.80] You know the rules and so do I\n",
)
})
it('returns empty text when no synced lyrics are available', () => {
const lrc = structuredLyricsToLrc(
[{ lang: 'eng', synced: false, line: [{ value: 'Unsynced line' }] }],
'eng',
)
expect(lrc).toBe('')
})
it('reads preferred language from localStorage first', () => {
localStorage.setItem('locale', 'pt-BR')
expect(getPreferredLyricLanguage()).toBe('pt-BR')
})
it('builds karaoke lines from tokenLine payload', () => {
const lines = buildKaraokeLines({
lang: 'eng',
synced: true,
line: [{ start: 1000, end: 3000, value: 'Hello world' }],
tokenLine: [
{
index: 0,
start: 1000,
end: 3000,
value: 'Hello world',
token: [
{ start: 1000, end: 1500, value: 'Hello' },
{ start: 2000, end: 2500, value: 'world', role: 'x-bg' },
],
},
],
})
expect(lines).toEqual([
{
index: 0,
start: 1000,
end: 3000,
value: 'Hello world',
tokens: [
{ start: 1000, end: 1500, value: 'Hello', role: '' },
{ start: 2000, end: 2500, value: 'world', role: 'x-bg' },
],
},
])
})
it('sorts token timing by start to keep playback stable', () => {
const lines = buildKaraokeLines({
lang: 'eng',
synced: true,
line: [{ start: 1000, end: 3000, value: 'Hello world' }],
tokenLine: [
{
index: 0,
start: 1000,
end: 3000,
value: 'Hello world',
token: [
{ start: 2000, end: 2500, value: 'world', role: '' },
{ start: 1000, end: 1500, value: 'Hello', role: '' },
],
},
],
})
expect(lines[0].tokens.map((token) => token.value)).toEqual([
'Hello',
'world',
])
})
it('splits a single full-line token into synthetic word tokens', () => {
const lines = buildKaraokeLines({
lang: 'ko-Latn',
synced: true,
line: [{ start: 1000, end: 2000, value: 'Da-la-lun, dun' }],
tokenLine: [
{
index: 0,
start: 1000,
end: 2000,
value: 'Da-la-lun, dun',
token: [{ start: 1000, end: 2000, value: 'Da-la-lun, dun' }],
},
],
})
expect(lines).toHaveLength(1)
expect(lines[0].tokens).toHaveLength(2)
expect(lines[0].tokens[0].value).toBe('Da-la-lun, ')
expect(lines[0].tokens[1].value).toBe('dun')
const firstWindow = resolveKaraokeTokenWindow(lines[0], 0)
const secondWindow = resolveKaraokeTokenWindow(lines[0], 1)
expect(firstWindow.start).toBeCloseTo(1000)
expect(firstWindow.end).toBeCloseTo(1500)
expect(secondWindow.start).toBeCloseTo(1500)
expect(secondWindow.end).toBeCloseTo(2000)
})
it('detects active line and token for karaoke timing', () => {
const state = getActiveKaraokeState(
[
{
index: 0,
start: 1000,
end: 3000,
value: 'Hello world',
tokens: [
{ start: 1000, end: 1500, value: 'Hello', role: '' },
{ start: 2000, end: 2500, value: 'world', role: '' },
],
},
{
index: 1,
start: 3500,
end: 5000,
value: 'Second line',
tokens: [],
},
],
2200,
)
expect(state).toEqual({ lineIndex: 0, tokenIndex: 1 })
})
it('resolves token window fallback boundaries from neighboring tokens', () => {
const line = {
start: 1000,
end: 3000,
value: 'Hello world',
tokens: [
{ start: 1200, value: 'Hello', role: '' },
{ start: 1800, value: 'world', role: '' },
],
}
expect(resolveKaraokeTokenWindow(line, 0)).toEqual({
start: 1200,
end: 1800,
})
expect(resolveKaraokeTokenWindow(line, 1)).toEqual({
start: 1800,
end: 3000,
})
})
it('infers sequential token windows when token timings are missing', () => {
const line = {
start: 1000,
end: 2000,
value: 'A B C',
tokens: [
{ value: 'A', role: '' },
{ value: 'B', role: '' },
{ value: 'C', role: '' },
],
}
const first = resolveKaraokeTokenWindow(line, 0)
const second = resolveKaraokeTokenWindow(line, 1)
const third = resolveKaraokeTokenWindow(line, 2)
expect(first.start).toBeCloseTo(1000)
expect(first.end).toBeCloseTo(1333.3333333333333)
expect(second.start).toBeCloseTo(1333.3333333333333)
expect(second.end).toBeCloseTo(1666.6666666666667)
expect(third.start).toBeCloseTo(1666.6666666666667)
expect(third.end).toBeCloseTo(2000)
})
it('falls back to sequential windows when token timings are collapsed', () => {
const line = {
start: 1000,
end: 2000,
value: 'A B C',
tokens: [
{ start: 1000, end: 2000, value: 'A', role: '' },
{ start: 1000, end: 2000, value: 'B', role: '' },
{ start: 1000, end: 2000, value: 'C', role: '' },
],
}
const first = resolveKaraokeTokenWindow(line, 0)
const second = resolveKaraokeTokenWindow(line, 1)
const third = resolveKaraokeTokenWindow(line, 2)
expect(first.start).toBeCloseTo(1000)
expect(first.end).toBeCloseTo(1333.3333333333333)
expect(second.start).toBeCloseTo(1333.3333333333333)
expect(second.end).toBeCloseTo(1666.6666666666667)
expect(third.start).toBeCloseTo(1666.6666666666667)
expect(third.end).toBeCloseTo(2000)
})
it('keeps token selection stable near tight token boundaries', () => {
const state = getActiveKaraokeState(
[
{
index: 0,
start: 1000,
end: 2000,
value: 'A B',
tokens: [
{ start: 1000, end: 1100, value: 'A', role: '' },
{ start: 1110, end: 1300, value: 'B', role: '' },
],
},
],
1108,
)
expect(state).toEqual({ lineIndex: 0, tokenIndex: 0 })
})
it('reports structured lyric content when token timing exists', () => {
expect(
hasStructuredLyricContent({
tokenLine: [{ token: [{ start: 100, value: 'a' }] }],
}),
).toBe(true)
})
})

View File

@ -7,6 +7,7 @@ import {
PLAYER_CURRENT,
PLAYER_PLAY_NEXT,
PLAYER_PLAY_TRACKS,
PLAYER_UPDATE_LYRIC,
PLAYER_SET_TRACK,
PLAYER_SET_VOLUME,
PLAYER_SYNC_QUEUE,
@ -60,21 +61,25 @@ const mapToAudioLists = (item) => {
let lyricText = ''
if (lyrics) {
const structured = JSON.parse(lyrics)
for (const structuredLyric of structured) {
if (structuredLyric.synced) {
for (const line of structuredLyric.line) {
let time = Math.floor(line.start / 10)
const ms = time % 100
time = Math.floor(time / 100)
const sec = time % 60
time = Math.floor(time / 60)
const min = time % 60
try {
const structured = JSON.parse(lyrics)
for (const structuredLyric of structured) {
if (structuredLyric.synced) {
for (const line of structuredLyric.line) {
let time = Math.floor(line.start / 10)
const ms = time % 100
time = Math.floor(time / 100)
const sec = time % 60
time = Math.floor(time / 60)
const min = time % 60
ms.toString()
lyricText += `[${pad(min)}:${pad(sec)}.${pad(ms)}] ${line.value}\n`
ms.toString()
lyricText += `[${pad(min)}:${pad(sec)}.${pad(ms)}] ${line.value}\n`
}
}
}
} catch {
lyricText = ''
}
}
@ -206,6 +211,45 @@ const reduceMode = (state, { data: { mode } }) => {
}
}
// Reducer step for PLAYER_UPDATE_LYRIC: stores `lyric` on every queue item
// whose trackId matches and, when the current track matches too, on
// `state.current`.
//
// The original state object is returned untouched when trackId is empty or
// when no queue item needs a different lyric.
const reduceUpdateLyric = (state, { data: { trackId, lyric } }) => {
  if (!trackId) {
    return state
  }
  const needsUpdate = (item) =>
    item.trackId === trackId && item.lyric !== lyric
  if (!state.queue.some((item) => needsUpdate(item))) {
    return state
  }

  const queue = state.queue.map((item) =>
    needsUpdate(item) ? { ...item, lyric } : item,
  )
  const isCurrentTrack = state.current?.trackId === trackId
  const current = isCurrentTrack ? { ...state.current, lyric } : state.current

  return { ...state, queue, current }
}
export const playerReducer = (previousState = initialState, payload) => {
const { type } = payload
switch (type) {
@ -243,6 +287,8 @@ export const playerReducer = (previousState = initialState, payload) => {
previousState.savedPlayIndex >= 0 ? previousState.savedPlayIndex : 0,
}
}
case PLAYER_UPDATE_LYRIC:
return reduceUpdateLyric(previousState, payload)
default:
return previousState
}

View File

@ -1,11 +1,24 @@
import { describe, it, expect } from 'vitest'
import { describe, expect, it, vi } from 'vitest'
import { playerReducer } from './playerReducer'
import {
PLAYER_SYNC_QUEUE,
PLAYER_CURRENT,
PLAYER_REFRESH_QUEUE,
PLAYER_SET_TRACK,
PLAYER_SYNC_QUEUE,
PLAYER_UPDATE_LYRIC,
} from '../actions'
vi.mock('uuid', () => ({
v4: () => 'test-uuid',
}))
vi.mock('../subsonic', () => ({
default: {
streamUrl: vi.fn((id) => `/rest/stream?id=${id}`),
getCoverArtUrl: vi.fn(() => '/rest/getCoverArt?id=test'),
},
}))
describe('playerReducer', () => {
describe('pending track selection survives SYNC_QUEUE and premature CURRENT', () => {
// Simulates the real sequence when clicking a new song while one is playing:
@ -54,8 +67,6 @@ describe('playerReducer', () => {
})
it('CURRENT for old track preserves pending playIndex', () => {
// After SYNC_QUEUE, queue has new UUIDs. The old track's UUID (zzz)
// is at index 2, but playIndex is 0. This is a premature callback.
const stateAfterSync = {
...stateAfterPlayTracks,
queue: [
@ -71,7 +82,7 @@ describe('playerReducer', () => {
const result = playerReducer(stateAfterSync, action)
expect(result.playIndex).toBe(0)
expect(result.clear).toBe(true)
expect(result.savedPlayIndex).toBe(2) // preserved from before
expect(result.savedPlayIndex).toBe(2)
})
it('CURRENT for correct track consumes pending playIndex', () => {
@ -83,7 +94,6 @@ describe('playerReducer', () => {
{ trackId: 's3', uuid: 'zzz', name: 'Song 3' },
],
}
// Player switched to Song 1 (uuid 'xxx', index 0 == playIndex)
const action = {
type: PLAYER_CURRENT,
data: { uuid: 'xxx', name: 'Song 1', volume: 1 },
@ -142,4 +152,80 @@ describe('playerReducer', () => {
expect(result.playIndex).toBe(0)
})
})
// The track payload embeds two lyric entries (one synced, one unsynced) as a
// JSON string. After PLAYER_SET_TRACK the single queued item is expected to
// expose the synced entry rendered as LRC text.
it('maps embedded synced lyrics to LRC text', () => {
  const syncedEntry = {
    lang: 'eng',
    synced: true,
    line: [{ start: 1000, value: 'Line one' }],
  }
  const unsyncedEntry = {
    lang: 'eng',
    synced: false,
    line: [{ value: 'Unsynced line' }],
  }
  const setTrack = {
    type: PLAYER_SET_TRACK,
    data: {
      id: 'song-1',
      title: 'Test Song',
      artist: 'Test Artist',
      album: 'Test Album',
      duration: 60,
      lyrics: JSON.stringify([syncedEntry, unsyncedEntry]),
    },
  }

  // Starting from `undefined` makes the reducer fall back to its initial state.
  const nextState = playerReducer(undefined, setTrack)

  expect(nextState.queue).toHaveLength(1)
  const queued = nextState.queue[0]
  expect(queued.lyric).toBe('[00:01.00] Line one\n')
})
// Seeds the queue with one track through PLAYER_SET_TRACK, then dispatches
// PLAYER_UPDATE_LYRIC addressed to that track's id and checks the queued
// item's lyric now holds the new text.
it('updates queue lyric by track id', () => {
  const track = {
    id: 'song-1',
    title: 'Test Song',
    artist: 'Test Artist',
    album: 'Test Album',
    duration: 60,
  }
  const replacementLyric = '[00:01.00] Updated lyric\n'

  const seeded = playerReducer(undefined, {
    type: PLAYER_SET_TRACK,
    data: track,
  })
  const afterUpdate = playerReducer(seeded, {
    type: PLAYER_UPDATE_LYRIC,
    data: { trackId: 'song-1', lyric: replacementLyric },
  })

  expect(afterUpdate.queue[0].lyric).toBe(replacementLyric)
})
// A PLAYER_UPDATE_LYRIC whose trackId is not present in the queue must hand
// back the very same state object (checked by identity via toBe).
it('returns same state when lyric update does not match any track', () => {
  const track = {
    id: 'song-1',
    title: 'Test Song',
    artist: 'Test Artist',
    album: 'Test Album',
    duration: 60,
  }
  const seeded = playerReducer(undefined, {
    type: PLAYER_SET_TRACK,
    data: track,
  })

  const unmatchedUpdate = {
    type: PLAYER_UPDATE_LYRIC,
    data: {
      trackId: 'missing-track',
      lyric: '[00:01.00] Updated lyric\n',
    },
  }
  const result = playerReducer(seeded, unmatchedUpdate)

  expect(result).toBe(seeded)
})
})

View File

@ -120,6 +120,10 @@ const getTopSongs = (artist, count = 50) => {
return httpClient(url('getTopSongs', null, { artist, count }))
}
// Calls the `getLyricsBySongId` endpoint for the given song id and returns
// whatever httpClient returns for that request.
const getLyricsBySongId = (id) => {
  const endpoint = url('getLyricsBySongId', id)
  return httpClient(endpoint)
}
const streamUrl = (id, options) => {
return baseUrl(
url('stream', id, {
@ -149,4 +153,5 @@ export default {
getArtistInfo,
getTopSongs,
getSimilarSongs2,
getLyricsBySongId,
}

View File

@ -1,7 +1,12 @@
import { vi } from 'vitest'
import { COVER_ART_SIZE } from '../consts'
import { httpClient } from '../dataProvider'
import subsonic from './index'
// Replace dataProvider's httpClient with a mock that resolves to an empty
// object, so tests can inspect the arguments it was called with.
vi.mock('../dataProvider', () => ({
httpClient: vi.fn(() => Promise.resolve({})),
}))
describe('getCoverArtUrl', () => {
beforeEach(() => {
// Mock window.location
@ -178,3 +183,29 @@ describe('getAvatarUrl', () => {
expect(url).toContain('username=john')
})
})
// Verifies that subsonic.getLyricsBySongId issues exactly one httpClient call
// whose URL targets the getLyricsBySongId REST endpoint with the song id.
describe('getLyricsBySongId', () => {
  beforeEach(() => {
    // Reset call history so the call-count assertion below starts from zero.
    vi.clearAllMocks()

    // Fake credentials served by the stubbed localStorage; presumably these
    // are the keys the subsonic URL builder reads (not visible from here).
    const lookup = (key) => {
      const values = {
        username: 'testuser',
        'subsonic-token': 'testtoken',
        'subsonic-salt': 'testsalt',
      }
      return values[key] || null
    }
    const localStorageMock = { getItem: vi.fn(lookup) }

    Object.defineProperty(window, 'localStorage', { value: localStorageMock })
  })

  it('calls the getLyricsBySongId endpoint', async () => {
    await subsonic.getLyricsBySongId('song-1')

    expect(httpClient).toHaveBeenCalledTimes(1)

    // First positional argument of the first recorded call is the request URL.
    const firstCallArgs = httpClient.mock.calls[0]
    const calledUrl = firstCallArgs[0]
    expect(calledUrl).toContain('/rest/getLyricsBySongId?')
    expect(calledUrl).toContain('id=song-1')
  })
})