diff --git a/conf/configuration.go b/conf/configuration.go
index 1d4c3a348..530ebe785 100644
--- a/conf/configuration.go
+++ b/conf/configuration.go
@@ -677,7 +677,7 @@ func setViperDefaults() {
viper.SetDefault("coverartquality", 75)
viper.SetDefault("artistartpriority", "artist.*, album/artist.*, external")
viper.SetDefault("discartpriority", "disc*.*, cd*.*, cover.*, folder.*, front.*, discsubtitle, embedded")
- viper.SetDefault("lyricspriority", ".lrc,.ttml,.txt,embedded")
+ viper.SetDefault("lyricspriority", ".ttml,.elrc,.lrc,.srt,.txt,embedded")
viper.SetDefault("enablegravatar", false)
viper.SetDefault("enablefavourites", true)
viper.SetDefault("enablestarrating", true)
diff --git a/core/lyrics/lyrics_test.go b/core/lyrics/lyrics_test.go
index d5f79a4d0..58e8ba82b 100644
--- a/core/lyrics/lyrics_test.go
+++ b/core/lyrics/lyrics_test.go
@@ -44,6 +44,36 @@ var _ = Describe("sources", func() {
},
}
+ elrcLyrics := model.LyricList{
+ model.Lyrics{
+ DisplayArtist: "ELRC Artist",
+ DisplayTitle: "ELRC Song",
+ Lang: "eng",
+ Line: []model.Line{
+ {
+ Start: gg.P(int64(1000)),
+ End: gg.P(int64(1500)),
+ Value: "Lead words",
+ Cue: []model.Cue{
+ {
+ Start: gg.P(int64(1000)),
+ Value: "Lead ",
+ },
+ {
+ Start: gg.P(int64(1500)),
+ Value: "words",
+ },
+ },
+ },
+ {
+ Start: gg.P(int64(3000)),
+ Value: "Fallback line",
+ },
+ },
+ Synced: true,
+ },
+ }
+
ttmlLyrics := model.LyricList{
model.Lyrics{
Kind: "main",
@@ -88,6 +118,25 @@ var _ = Describe("sources", func() {
},
}
+ srtLyrics := model.LyricList{
+ model.Lyrics{
+ Lang: "xxx",
+ Line: []model.Line{
+ {
+ Start: gg.P(int64(18800)),
+ End: gg.P(int64(22800)),
+ Value: "We're from subtitles",
+ },
+ {
+ Start: gg.P(int64(22801)),
+ End: gg.P(int64(26000)),
+ Value: "Another subtitle line",
+ },
+ },
+ Synced: true,
+ },
+ }
+
BeforeEach(func() {
DeferCleanup(configtest.SetupConfig())
@@ -109,8 +158,10 @@ var _ = Describe("sources", func() {
},
Entry("embedded > lrc > txt", "embedded,.lrc,.txt", embeddedLyrics),
Entry("lrc > embedded > txt", ".lrc,embedded,.txt", syncedLyrics),
+ Entry("elrc > lrc > embedded", ".elrc,.lrc,embedded", elrcLyrics),
+ Entry("srt > txt > embedded", ".srt,.txt,embedded", srtLyrics),
Entry("txt > lrc > embedded", ".txt,.lrc,embedded", unsyncedLyrics),
- Entry("ttml > lrc > embedded", ".ttml,.lrc,embedded", ttmlLyrics))
+ Entry("ttml > elrc > lrc > srt > embedded", ".ttml,.elrc,.lrc,.srt,embedded", ttmlLyrics))
Context("Errors", func() {
var RegularUserContext = XContext
diff --git a/core/lyrics/sources.go b/core/lyrics/sources.go
index 38a71cb8a..7586c944f 100644
--- a/core/lyrics/sources.go
+++ b/core/lyrics/sources.go
@@ -38,13 +38,20 @@ func fromExternalFile(ctx context.Context, mf *model.MediaFile, suffix string) (
}
var list model.LyricList
- if strings.EqualFold(suffix, ".ttml") {
+ switch {
+ case strings.EqualFold(suffix, ".ttml"):
list, err = parseTTML(contents)
if err != nil {
log.Error(ctx, "error parsing ttml external file", "path", externalLyric, err)
return nil, err
}
- } else {
+ case strings.EqualFold(suffix, ".srt"):
+ list, err = parseSRT(contents)
+ if err != nil {
+ log.Error(ctx, "error parsing srt external file", "path", externalLyric, err)
+ return nil, err
+ }
+ default:
lyrics, err := model.ToLyrics("xxx", string(contents))
if err != nil {
log.Error(ctx, "error parsing lyric external file", "path", externalLyric, err)
diff --git a/core/lyrics/sources_test.go b/core/lyrics/sources_test.go
index 3dd2825e6..a110390d8 100644
--- a/core/lyrics/sources_test.go
+++ b/core/lyrics/sources_test.go
@@ -106,10 +106,10 @@ var _ = Describe("sources", func() {
Expect(lyrics[0].Line[0].Cue).To(HaveLen(3))
Expect(*lyrics[0].Line[0].Cue[0].Start).To(Equal(int64(1000)))
Expect(lyrics[0].Line[0].Cue[0].Value).To(Equal("Some "))
- Expect(*lyrics[0].Line[0].Cue[0].End).To(Equal(int64(1500)))
+ Expect(lyrics[0].Line[0].Cue[0].End).To(BeNil())
Expect(*lyrics[0].Line[0].Cue[1].Start).To(Equal(int64(1500)))
Expect(lyrics[0].Line[0].Cue[1].Value).To(Equal("lyrics "))
- Expect(*lyrics[0].Line[0].Cue[1].End).To(Equal(int64(2000)))
+ Expect(lyrics[0].Line[0].Cue[1].End).To(BeNil())
Expect(*lyrics[0].Line[0].Cue[2].Start).To(Equal(int64(2000)))
Expect(lyrics[0].Line[0].Cue[2].Value).To(Equal("here"))
Expect(lyrics[0].Line[0].Cue[2].End).To(BeNil())
@@ -125,6 +125,33 @@ var _ = Describe("sources", func() {
Expect(lyrics[0].Line[2].Cue).To(BeNil())
})
+ It("should return Enhanced LRC lyrics from an ELRC file", func() {
+ mf := model.MediaFile{Path: "tests/fixtures/test.mp3"}
+ lyrics, err := fromExternalFile(ctx, &mf, ".elrc")
+
+ Expect(err).To(BeNil())
+ Expect(lyrics).To(HaveLen(1))
+ Expect(lyrics[0].DisplayArtist).To(Equal("ELRC Artist"))
+ Expect(lyrics[0].DisplayTitle).To(Equal("ELRC Song"))
+ Expect(lyrics[0].Lang).To(Equal("eng"))
+ Expect(lyrics[0].Synced).To(BeTrue())
+ Expect(lyrics[0].Line).To(HaveLen(2))
+
+ Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(1000))))
+ Expect(lyrics[0].Line[0].Value).To(Equal("Lead words"))
+ Expect(lyrics[0].Line[0].Cue).To(HaveLen(2))
+ Expect(*lyrics[0].Line[0].Cue[0].Start).To(Equal(int64(1000)))
+ Expect(lyrics[0].Line[0].Cue[0].Value).To(Equal("Lead "))
+ Expect(lyrics[0].Line[0].Cue[0].End).To(BeNil())
+ Expect(*lyrics[0].Line[0].Cue[1].Start).To(Equal(int64(1500)))
+ Expect(lyrics[0].Line[0].Cue[1].Value).To(Equal("words"))
+ Expect(lyrics[0].Line[0].Cue[1].End).To(BeNil())
+
+ Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(3000))))
+ Expect(lyrics[0].Line[1].Value).To(Equal("Fallback line"))
+ Expect(lyrics[0].Line[1].Cue).To(BeNil())
+ })
+
It("should return unsynchronized lyrics from a file", func() {
mf := model.MediaFile{Path: "tests/fixtures/test.mp3"}
lyrics, err := fromExternalFile(ctx, &mf, ".txt")
@@ -146,6 +173,31 @@ var _ = Describe("sources", func() {
}))
})
+ It("should return synchronized lyrics from an SRT file", func() {
+ mf := model.MediaFile{Path: "tests/fixtures/test.mp3"}
+ lyrics, err := fromExternalFile(ctx, &mf, ".srt")
+
+ Expect(err).To(BeNil())
+ Expect(lyrics).To(Equal(model.LyricList{
+ model.Lyrics{
+ Lang: "xxx",
+ Line: []model.Line{
+ {
+ Start: gg.P(int64(18800)),
+ End: gg.P(int64(22800)),
+ Value: "We're from subtitles",
+ },
+ {
+ Start: gg.P(int64(22801)),
+ End: gg.P(int64(26000)),
+ Value: "Another subtitle line",
+ },
+ },
+ Synced: true,
+ },
+ }))
+ })
+
It("should return synchronized multilingual lyrics from a TTML file", func() {
mf := model.MediaFile{Path: "tests/fixtures/test.mp3"}
lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
diff --git a/core/lyrics/srt.go b/core/lyrics/srt.go
new file mode 100644
index 000000000..8fd77abb4
--- /dev/null
+++ b/core/lyrics/srt.go
@@ -0,0 +1,161 @@
+package lyrics
+
+import (
+ "bytes"
+ "regexp"
+ "strconv"
+ "strings"
+
+ "github.com/navidrome/navidrome/model"
+ "github.com/navidrome/navidrome/utils/str"
+)
+
+// srtTimeRegex matches one SRT timestamp, optionally surrounded by whitespace:
+// 1-2 digit hours, 2 digit minutes, 2 digit seconds, a ',' or '.' separator,
+// and a 1-3 digit fractional part. The four numeric fields are captured in
+// that order.
+var srtTimeRegex = regexp.MustCompile(`^\s*(\d{1,2}):(\d{2}):(\d{2})[,.](\d{1,3})\s*$`)
+
+// parseSRT converts the contents of a SubRip (.srt) file into a lyric list
+// holding a single synced entry with the placeholder language "xxx".
+//
+// Blocks that parseSRTBlock reports as unusable are dropped. The first block
+// that yields an error aborts the whole parse and that error is returned.
+// When no usable block remains the result is (nil, nil).
+func parseSRT(contents []byte) (model.LyricList, error) {
+	// Fold CRLF and bare CR line endings into LF before splitting into blocks.
+	normalized := strings.ReplaceAll(
+		strings.ReplaceAll(string(contents), "\r\n", "\n"),
+		"\r", "\n",
+	)
+
+	blocks := splitSRTBlocks(normalized)
+	parsed := make([]model.Line, 0, len(blocks))
+	for i := range blocks {
+		entry, keep, err := parseSRTBlock(blocks[i])
+		switch {
+		case err != nil:
+			return nil, err
+		case keep:
+			parsed = append(parsed, entry)
+		}
+	}
+
+	if len(parsed) == 0 {
+		return nil, nil
+	}
+
+	return model.LyricList{
+		model.NormalizeLyrics(model.Lyrics{
+			Lang:   "xxx",
+			Line:   parsed,
+			Synced: true,
+		}),
+	}, nil
+}
+
+// splitSRTBlocks splits SRT text (with line endings already normalized to LF)
+// into cue blocks.
+//
+// Blocks are separated by one or more blank lines. A line that contains only
+// whitespace counts as blank, so a separator line carrying stray spaces or
+// tabs still ends the current block instead of gluing the next cue's index
+// and timing onto the previous cue's text. Every returned block is trimmed of
+// surrounding whitespace and is non-empty; nil is returned when raw has no
+// content.
+func splitSRTBlocks(raw string) []string {
+	var (
+		blocks  []string
+		current []string
+	)
+
+	// flush closes the block being accumulated, if any.
+	flush := func() {
+		if len(current) == 0 {
+			return
+		}
+		blocks = append(blocks, strings.TrimSpace(strings.Join(current, "\n")))
+		current = current[:0]
+	}
+
+	for _, line := range strings.Split(raw, "\n") {
+		if strings.TrimSpace(line) == "" {
+			flush()
+			continue
+		}
+		current = append(current, line)
+	}
+	flush()
+
+	return blocks
+}
+
+// parseSRTBlock parses one SRT cue block into a timed lyric line.
+//
+// The block may start with a purely numeric counter line, which is ignored.
+// The next line must hold exactly one "-->" separating the start and end
+// timestamps. The remaining non-empty lines are joined with "\n" and passed
+// through str.SanitizeText to form the line text.
+//
+// The boolean result is false (with a nil error) when the block has no timing
+// line or no text. An unparsable timestamp is reported as an error.
+func parseSRTBlock(block string) (model.Line, bool, error) {
+	var skipped model.Line
+
+	rawRows := bytes.Split([]byte(block), []byte("\n"))
+	rows := make([]string, len(rawRows))
+	for i := range rawRows {
+		rows[i] = strings.TrimSpace(string(rawRows[i]))
+	}
+
+	// bytes.Split always yields at least one element, so rows[0] exists.
+	if digitsOnly(rows[0]) {
+		rows = rows[1:]
+	}
+	if len(rows) == 0 {
+		return skipped, false, nil
+	}
+
+	bounds := strings.Split(rows[0], "-->")
+	if len(bounds) != 2 {
+		return skipped, false, nil
+	}
+
+	startMs, err := parseSRTTime(bounds[0])
+	if err != nil {
+		return skipped, false, err
+	}
+	endMs, err := parseSRTTime(bounds[1])
+	if err != nil {
+		return skipped, false, err
+	}
+
+	text := make([]string, 0, len(rows)-1)
+	for _, row := range rows[1:] {
+		if row != "" {
+			text = append(text, row)
+		}
+	}
+
+	value := str.SanitizeText(strings.Join(text, "\n"))
+	if value == "" {
+		return skipped, false, nil
+	}
+
+	return model.Line{Start: &startMs, End: &endMs, Value: value}, true, nil
+}
+
+// parseSRTTime converts an SRT timestamp such as "00:01:02,345" into
+// milliseconds. A fractional part with fewer than three digits is scaled up,
+// so ",5" means 500 ms and ",25" means 250 ms. Input that does not match
+// srtTimeRegex yields strconv.ErrSyntax.
+func parseSRTTime(value string) (int64, error) {
+	groups := srtTimeRegex.FindStringSubmatch(strings.TrimSpace(value))
+	if groups == nil {
+		return 0, strconv.ErrSyntax
+	}
+
+	// fields holds hours, minutes, seconds and the raw fractional digits.
+	var fields [4]int64
+	for i := range fields {
+		n, err := strconv.ParseInt(groups[i+1], 10, 64)
+		if err != nil {
+			return 0, err
+		}
+		fields[i] = n
+	}
+
+	// Pad the fraction to millisecond precision.
+	millis := fields[3]
+	for digits := len(groups[4]); digits < 3; digits++ {
+		millis *= 10
+	}
+
+	totalSeconds := fields[0]*3600 + fields[1]*60 + fields[2]
+	return totalSeconds*1000 + millis, nil
+}
+
+// digitsOnly reports whether value is non-empty and made up solely of the
+// ASCII digits '0' through '9'.
+func digitsOnly(value string) bool {
+	// Any byte outside '0'..'9' (including every byte of a multi-byte rune)
+	// disqualifies the string, so a byte scan is sufficient.
+	for i := 0; i < len(value); i++ {
+		if c := value[i]; c < '0' || c > '9' {
+			return false
+		}
+	}
+	return value != ""
+}
diff --git a/core/lyrics/ttml.go b/core/lyrics/ttml.go
index a0bdcac5a..e79dfe846 100644
--- a/core/lyrics/ttml.go
+++ b/core/lyrics/ttml.go
@@ -46,6 +46,7 @@ type ttmlTimingParams struct {
type ttmlTimingContext struct {
lang string
role string
+ agentID string
begin int64
hasBegin bool
end int64
@@ -70,6 +71,12 @@ type ttmlResolvedMetadataLine struct {
line model.Line
}
+// ttmlDefinedAgent is an agent declared by an "agent" element of the TTML
+// document. parseAgentDefinition fills it in and stores it in
+// ttmlParser.definedAgents keyed by ID.
+type ttmlDefinedAgent struct {
+ // ID is the trimmed, non-empty "id" attribute of the agent element.
+ ID string
+ // Type is the trimmed, lower-cased "type" attribute.
+ Type string
+ // Name is the first non-empty sanitized text of a child "name" element.
+ Name string
+}
+
type ttmlParser struct {
decoder *xml.Decoder
params ttmlTimingParams
@@ -86,6 +93,8 @@ type ttmlParser struct {
pronunciationLangOrder []string
pronunciationEntriesByLg map[string][]ttmlMetadataEntry
+ definedAgents map[string]ttmlDefinedAgent
+
metadataSeq int
}
@@ -103,6 +112,7 @@ func parseTTML(contents []byte) (model.LyricList, error) {
mainLineRefsByKey: make(map[string]ttmlLineRef),
translationEntriesByLg: make(map[string][]ttmlMetadataEntry),
pronunciationEntriesByLg: make(map[string][]ttmlMetadataEntry),
+ definedAgents: make(map[string]ttmlDefinedAgent),
}
root := ttmlTimingContext{lang: "xxx"}
@@ -140,6 +150,8 @@ func (p *ttmlParser) parseElement(start xml.StartElement, parent ttmlTimingConte
return p.parseMetadataTrack(start, parent, ttmlLyricKindTranslation)
case "transliteration":
return p.parseMetadataTrack(start, parent, ttmlLyricKindPronunciation)
+ case "agent":
+ return p.parseAgentDefinition(start)
}
ctx := p.childContext(start.Attr, parent)
@@ -234,6 +246,49 @@ func (p *ttmlParser) parseMetadataTrack(start xml.StartElement, parent ttmlTimin
}
}
+// parseAgentDefinition consumes an "agent" element and records it in
+// p.definedAgents under its "id" attribute.
+//
+// Elements without a usable id are skipped entirely. The agent type is the
+// trimmed, lower-cased "type" attribute and the name is the first non-empty
+// sanitized text of a child "name" element; every other child element is
+// skipped. A decoder error (including input ending before the closing tag) is
+// returned as-is and nothing is recorded in that case.
+func (p *ttmlParser) parseAgentDefinition(start xml.StartElement) error {
+ id, ok := attrValue(start.Attr, "id")
+ id = strings.TrimSpace(id)
+ if !ok || id == "" {
+ // An agent without an id can never be referenced; drop the whole element.
+ return p.skipElement(start)
+ }
+
+ agent := ttmlDefinedAgent{
+ ID: id,
+ Type: strings.ToLower(strings.TrimSpace(attrOrEmpty(start.Attr, "type"))),
+ }
+
+ for {
+ token, err := p.decoder.Token()
+ if err != nil {
+ return err
+ }
+
+ switch t := token.(type) {
+ case xml.StartElement:
+ if strings.EqualFold(t.Name.Local, "name") {
+ name, err := p.collectElementText(t)
+ if err != nil {
+ return err
+ }
+ name = sanitizeTTMLText(name)
+ // Only the first non-empty name is kept; later ones are ignored.
+ if name != "" && agent.Name == "" {
+ agent.Name = name
+ }
+ continue
+ }
+ // Any other child is irrelevant to the agent definition.
+ if err := p.skipElement(t); err != nil {
+ return err
+ }
+ case xml.EndElement:
+ // Child elements are fully consumed above, so an end tag seen here with
+ // the agent's own name closes the definition.
+ if strings.EqualFold(t.Name.Local, start.Name.Local) {
+ p.definedAgents[agent.ID] = agent
+ return nil
+ }
+ }
+ }
+}
+
func (p *ttmlParser) parseMetadataText(start xml.StartElement, parent ttmlTimingContext) (ttmlMetadataEntry, bool, error) {
forKey, hasFor := attrValue(start.Attr, "for")
forKey = strings.TrimSpace(forKey)
@@ -338,8 +393,8 @@ func (p *ttmlParser) parseInlineElement(start xml.StartElement, parent ttmlTimin
tokenText := sanitizeTTMLText(value)
if local == "span" && hasOwnTiming && !ctx.invalid && tokenText != "" && len(tokens) == 0 {
parsedToken := model.Cue{
- Value: tokenText,
- Role: ctx.role,
+ Value: tokenText,
+ AgentID: p.resolveCueAgentID(ctx),
}
if ctx.hasBegin {
startMs := ctx.begin
@@ -366,12 +421,12 @@ func (p *ttmlParser) toLyricList() model.LyricList {
if len(lines) == 0 {
continue
}
- res = append(res, model.Lyrics{
+ res = append(res, p.finalizeLyrics(model.Lyrics{
Kind: ttmlLyricKindMain,
Lang: lang,
Line: lines,
Synced: linesAreSynced(lines),
- })
+ }))
}
res = append(res, p.buildMetadataLyrics(ttmlLyricKindTranslation, p.translationLangOrder, p.translationEntriesByLg)...)
@@ -440,17 +495,168 @@ func (p *ttmlParser) buildMetadataLyrics(kind string, langOrder []string, entrie
lines[i] = resolved[i].line
}
- res = append(res, model.Lyrics{
+ res = append(res, p.finalizeLyrics(model.Lyrics{
Kind: kind,
Lang: lang,
Line: lines,
Synced: linesAreSynced(lines),
- })
+ }))
}
return res
}
+// finalizeLyrics prepares a parsed lyrics entry for output: cue timing is
+// normalized, agents are resolved and attached, and the result is passed
+// through model.NormalizeLyrics.
+func (p *ttmlParser) finalizeLyrics(lyrics model.Lyrics) model.Lyrics {
+	lines, agents := p.resolveAgents(model.NormalizeCueLines(lyrics.Line))
+	lyrics.Line = lines
+	lyrics.Agents = agents
+	return model.NormalizeLyrics(lyrics)
+}
+
+// resolveAgents builds the agent list for a set of lines and makes sure every
+// cue of the returned lines carries an agent id.
+//
+// It returns the lines after model.NormalizeCueLines and, when at least one
+// cue names an agent, one model.Agent per distinct id. Exactly one agent gets
+// the role "main"; cues without an agent id are assigned to it. When no cue
+// names an agent, the normalized lines are returned with a nil agent list and
+// cue agent ids are left untouched.
+func (p *ttmlParser) resolveAgents(lines []model.Line) ([]model.Line, []model.Agent) {
+ if len(lines) == 0 {
+ return lines, nil
+ }
+
+ normalized := model.NormalizeCueLines(lines)
+ // usedOrder keeps agent ids in order of first appearance; usedSet dedupes.
+ usedOrder := make([]string, 0, 4)
+ usedSet := make(map[string]struct{}, 4)
+ sawEmptyCue := false
+
+ for i := range normalized {
+ for j := range normalized[i].Cue {
+ agentID := strings.TrimSpace(normalized[i].Cue[j].AgentID)
+ if agentID == "" {
+ sawEmptyCue = true
+ continue
+ }
+ if _, exists := usedSet[agentID]; !exists {
+ usedSet[agentID] = struct{}{}
+ usedOrder = append(usedOrder, agentID)
+ }
+ }
+ }
+
+ if len(usedOrder) == 0 {
+ return normalized, nil
+ }
+
+ // Pick the main agent, in priority order:
+ // 1. the first used agent that is neither background nor group;
+ // 2. a synthetic "main" agent when some cues carry no agent id;
+ // 3. the first used agent that is not background;
+ // 4. the first used agent of any kind.
+ mainID := ""
+ for _, agentID := range usedOrder {
+ role := p.baseRoleForAgent(agentID)
+ if role != "bg" && role != "group" {
+ mainID = agentID
+ break
+ }
+ }
+ if mainID == "" && sawEmptyCue {
+ mainID = "main"
+ }
+ if mainID == "" {
+ for _, agentID := range usedOrder {
+ if p.baseRoleForAgent(agentID) != "bg" {
+ mainID = agentID
+ break
+ }
+ }
+ }
+ if mainID == "" {
+ mainID = usedOrder[0]
+ }
+
+ // A synthetic main agent was not seen on any cue; list it first.
+ if _, exists := usedSet[mainID]; !exists {
+ usedSet[mainID] = struct{}{}
+ usedOrder = append([]string{mainID}, usedOrder...)
+ }
+
+ // Attribute cues without an agent to the main agent.
+ for i := range normalized {
+ for j := range normalized[i].Cue {
+ if strings.TrimSpace(normalized[i].Cue[j].AgentID) == "" {
+ normalized[i].Cue[j].AgentID = mainID
+ }
+ }
+ }
+
+ agents := make([]model.Agent, 0, len(usedOrder))
+ for _, agentID := range usedOrder {
+ role := p.baseRoleForAgent(agentID)
+ // The chosen main agent overrides whatever base role it would have had.
+ if agentID == mainID {
+ role = "main"
+ }
+ agent := model.Agent{
+ ID: agentID,
+ Role: role,
+ Name: p.agentNameForID(agentID),
+ }
+ agents = append(agents, agent)
+ }
+
+ return normalized, agents
+}
+
+// resolveCueAgentID returns the agent id a cue in the given context belongs
+// to. Outside an "x-bg" role this is the trimmed context agent id (possibly
+// empty). Inside one it is the background id derived from that agent, using
+// "main" as the base when the context names no agent.
+func (p *ttmlParser) resolveCueAgentID(ctx ttmlTimingContext) string {
+	id := strings.TrimSpace(ctx.agentID)
+	if !contextHasRole(ctx.role, "x-bg") {
+		return id
+	}
+	if id == "" {
+		id = "main"
+	}
+	return backgroundAgentID(id)
+}
+
+// baseRoleForAgent classifies an agent id before any main-agent promotion:
+// "bg" for background ids, "group" for a defined agent whose type is "group",
+// and "voice" for everything else, including ids with no definition.
+func (p *ttmlParser) baseRoleForAgent(agentID string) string {
+	if isBackgroundAgentID(agentID) {
+		return "bg"
+	}
+	if agent, defined := p.definedAgents[agentID]; defined && agent.Type == "group" {
+		return "group"
+	}
+	return "voice"
+}
+
+// agentNameForID returns the display name declared for an agent id, or "" when
+// none is known. A background id takes the name of its base agent (the id
+// without the "__bg" suffix); the base id "main" never has a name.
+func (p *ttmlParser) agentNameForID(agentID string) string {
+	lookupID := agentID
+	if isBackgroundAgentID(agentID) {
+		lookupID = strings.TrimSuffix(agentID, "__bg")
+		if lookupID == "main" {
+			return ""
+		}
+	}
+	// A missing map entry yields the zero ttmlDefinedAgent, whose Name is "".
+	return p.definedAgents[lookupID].Name
+}
+
+// ttmlBackgroundAgentSuffix marks an agent id as the background-vocal variant
+// of another agent.
+const ttmlBackgroundAgentSuffix = "__bg"
+
+// backgroundAgentID derives the background agent id for agentID by appending
+// the background suffix.
+func backgroundAgentID(agentID string) string {
+	return agentID + ttmlBackgroundAgentSuffix
+}
+
+// isBackgroundAgentID reports whether agentID ends with the background suffix.
+func isBackgroundAgentID(agentID string) bool {
+	return strings.HasSuffix(agentID, ttmlBackgroundAgentSuffix)
+}
+
+// contextHasRole reports whether role appears among the whitespace-separated
+// entries of roles. Both sides are lower-cased before comparison.
+func contextHasRole(roles string, role string) bool {
+	wanted := strings.ToLower(role)
+	entries := strings.Fields(strings.ToLower(roles))
+	for i := range entries {
+		if entries[i] == wanted {
+			return true
+		}
+	}
+	return false
+}
+
func (p *ttmlParser) addMainLine(lang string, lineKey string, line model.Line) {
lang = normalizeTTMLLang(lang)
if _, ok := p.mainLinesByLang[lang]; !ok {
@@ -495,6 +701,9 @@ func (p *ttmlParser) childContext(attrs []xml.Attr, parent ttmlTimingContext) tt
if lang, ok := attrValue(attrs, "lang"); ok {
ctx.lang = normalizeTTMLLang(lang)
}
+ if agentID, ok := attrValue(attrs, "agent"); ok {
+ ctx.agentID = strings.TrimSpace(agentID)
+ }
if role, ok := attrValue(attrs, "role"); ok {
role = strings.TrimSpace(role)
if role != "" {
@@ -805,6 +1014,55 @@ func attrValue(attrs []xml.Attr, key string) (string, bool) {
return "", false
}
+// attrOrEmpty returns the value attrValue reports for key and discards its
+// found flag, for callers that do not need to tell a missing attribute apart
+// from whatever value attrValue yields in that case.
+func attrOrEmpty(attrs []xml.Attr, key string) string {
+ value, _ := attrValue(attrs, key)
+ return value
+}
+
+// collectElementText reads tokens up to the end of the element opened by start
+// and returns the concatenation of all character data inside it. Text nested
+// in child elements is included, because each child is flattened by a
+// recursive call. A decoder error aborts the read and is returned together
+// with an empty string.
+func (p *ttmlParser) collectElementText(start xml.StartElement) (string, error) {
+ var text strings.Builder
+
+ for {
+ token, err := p.decoder.Token()
+ if err != nil {
+ return "", err
+ }
+
+ switch t := token.(type) {
+ case xml.StartElement:
+ // Recurse so the child's own end tag is consumed before continuing here.
+ value, err := p.collectElementText(t)
+ if err != nil {
+ return "", err
+ }
+ text.WriteString(value)
+ case xml.EndElement:
+ // Compared by local name only, case-insensitively.
+ if strings.EqualFold(t.Name.Local, start.Name.Local) {
+ return text.String(), nil
+ }
+ case xml.CharData:
+ text.WriteString(string(t))
+ }
+ }
+}
+
+// skipElement discards tokens up to and including the end tag that matches an
+// element whose start tag has already been read. Nested elements are tracked
+// by depth only, so the start element itself is not inspected. Any decoder
+// error is returned unchanged.
+func (p *ttmlParser) skipElement(_ xml.StartElement) error {
+	for open := 1; open > 0; {
+		tok, err := p.decoder.Token()
+		if err != nil {
+			return err
+		}
+
+		if _, isStart := tok.(xml.StartElement); isStart {
+			open++
+		} else if _, isEnd := tok.(xml.EndElement); isEnd {
+			open--
+		}
+	}
+	return nil
+}
+
func normalizeTTMLLang(lang string) string {
lang = strings.ToLower(strings.TrimSpace(lang))
if lang == "" {
@@ -840,42 +1098,7 @@ func linesAreSynced(lines []model.Line) bool {
}
func hydrateLineTimingFromTokens(line model.Line) model.Line {
- if len(line.Cue) == 0 {
- return line
- }
-
- var earliestStart *int64
- var latestEnd *int64
- for i := range line.Cue {
- token := line.Cue[i]
- if token.Start != nil {
- if earliestStart == nil || *token.Start < *earliestStart {
- v := *token.Start
- earliestStart = &v
- }
- }
-
- candidateEnd := token.End
- if candidateEnd == nil {
- candidateEnd = token.Start
- }
- if candidateEnd != nil {
- if latestEnd == nil || *candidateEnd > *latestEnd {
- v := *candidateEnd
- latestEnd = &v
- }
- }
- }
-
- if line.Start == nil && earliestStart != nil {
- v := *earliestStart
- line.Start = &v
- }
- if line.End == nil && latestEnd != nil {
- v := *latestEnd
- line.End = &v
- }
- return line
+ return model.NormalizeLineTiming(line)
}
func max(v float64, fallback float64) float64 {
diff --git a/core/lyrics/ttml_test.go b/core/lyrics/ttml_test.go
index 8ec16f679..5fc484a3b 100644
--- a/core/lyrics/ttml_test.go
+++ b/core/lyrics/ttml_test.go
@@ -129,6 +129,10 @@ var _ = Describe("parseTTML", func() {
list, err := parseTTML(content)
Expect(err).ToNot(HaveOccurred())
Expect(list).To(HaveLen(1))
+ Expect(list[0].Agents).To(Equal([]model.Agent{
+ {ID: "main", Role: "main"},
+ {ID: "main__bg", Role: "bg"},
+ }))
Expect(list[0].Line).To(HaveLen(1))
line := list[0].Line[0]
@@ -137,9 +141,41 @@ var _ = Describe("parseTTML", func() {
Expect(line.End).To(Equal(gg.P(int64(3000))))
Expect(line.Cue).To(HaveLen(3))
- Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(1000)), End: gg.P(int64(1400)), Value: "He"}))
- Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(1400)), End: gg.P(int64(1800)), Value: "llo"}))
- Expect(line.Cue[2]).To(Equal(model.Cue{Start: gg.P(int64(2000)), End: gg.P(int64(2500)), Value: "echo", Role: "x-bg"}))
+ Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(1000)), End: gg.P(int64(1400)), Value: "He", AgentID: "main"}))
+ Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(1400)), End: gg.P(int64(1800)), Value: "llo", AgentID: "main"}))
+ Expect(line.Cue[2]).To(Equal(model.Cue{Start: gg.P(int64(2000)), End: gg.P(int64(2500)), Value: "echo", AgentID: "main__bg"}))
+ })
+
+ It("should parse named TTML agents into main, voice, and group roles", func() {
+ content := []byte(`
+
+
+
+ Chris Martin
+ Jin
+ All
+
+
+
+
+
+`)
+
+ list, err := parseTTML(content)
+ Expect(err).ToNot(HaveOccurred())
+ Expect(list).To(HaveLen(1))
+ Expect(list[0].Agents).To(Equal([]model.Agent{
+ {ID: "v1", Role: "main", Name: "Chris Martin"},
+ {ID: "v2", Role: "voice", Name: "Jin"},
+ {ID: "v1000", Role: "group", Name: "All"},
+ }))
+ Expect(list[0].Line[0].Cue[0].AgentID).To(Equal("v1"))
+ Expect(list[0].Line[1].Cue[0].AgentID).To(Equal("v2"))
+ Expect(list[0].Line[2].Cue[0].AgentID).To(Equal("v1000"))
})
})
diff --git a/model/lyrics.go b/model/lyrics.go
index 9fcd4992e..725c3aa94 100644
--- a/model/lyrics.go
+++ b/model/lyrics.go
@@ -12,10 +12,16 @@ import (
)
type Cue struct {
- Start *int64 `structs:"start,omitempty" json:"start,omitempty"`
- End *int64 `structs:"end,omitempty" json:"end,omitempty"`
- Value string `structs:"value" json:"value"`
- Role string `structs:"role,omitempty" json:"role,omitempty"`
+ Start *int64 `structs:"start,omitempty" json:"start,omitempty"`
+ End *int64 `structs:"end,omitempty" json:"end,omitempty"`
+ Value string `structs:"value" json:"value"`
+ AgentID string `structs:"agentId,omitempty" json:"agentId,omitempty"`
+}
+
+// Agent describes one performer (or group of performers) that cues of a
+// Lyrics entry can be attributed to via Cue.AgentID.
+type Agent struct {
+ // ID is the identifier cues refer to.
+ ID string `structs:"id" json:"id"`
+ // Role is the agent's function; the TTML parser emits "main", "voice",
+ // "group" or "bg".
+ Role string `structs:"role" json:"role"`
+ // Name is an optional display name.
+ Name string `structs:"name,omitempty" json:"name,omitempty"`
+}
type Line struct {
@@ -26,13 +32,14 @@ type Line struct {
}
type Lyrics struct {
- DisplayArtist string `structs:"displayArtist,omitempty" json:"displayArtist,omitempty"`
- DisplayTitle string `structs:"displayTitle,omitempty" json:"displayTitle,omitempty"`
- Kind string `structs:"kind,omitempty" json:"kind,omitempty"`
- Lang string `structs:"lang" json:"lang"`
- Line []Line `structs:"line" json:"line"`
- Offset *int64 `structs:"offset,omitempty" json:"offset,omitempty"`
- Synced bool `structs:"synced" json:"synced"`
+ DisplayArtist string `structs:"displayArtist,omitempty" json:"displayArtist,omitempty"`
+ DisplayTitle string `structs:"displayTitle,omitempty" json:"displayTitle,omitempty"`
+ Kind string `structs:"kind,omitempty" json:"kind,omitempty"`
+ Lang string `structs:"lang" json:"lang"`
+ Agents []Agent `structs:"agents,omitempty" json:"agents,omitempty"`
+ Line []Line `structs:"line" json:"line"`
+ Offset *int64 `structs:"offset,omitempty" json:"offset,omitempty"`
+ Synced bool `structs:"synced" json:"synced"`
}
// support the standard [mm:ss.mm], as well as [hh:*] and [*.mmm]
@@ -199,7 +206,7 @@ func ToLyrics(language, text string) (*Lyrics, error) {
DisplayArtist: artist,
DisplayTitle: title,
Lang: language,
- Line: structuredLines,
+ Line: NormalizeCueLines(structuredLines),
Offset: offset,
Synced: synced,
}
@@ -265,11 +272,6 @@ func parseEnhancedCues(text string) []Cue {
Start: &start,
Value: seg.text,
}
- // Derive End from the next cue's Start
- if i+1 < len(segments) {
- end := segments[i+1].start
- cues[i].End = &end
- }
}
return cues
}
@@ -338,3 +340,127 @@ func parseTime(line string, match []int) (int64, error) {
}
type LyricList []Lyrics
+
+// NormalizeLyrics returns a copy of lyrics whose lines have been passed through
+// NormalizeCueLines and whose Agents field is nil whenever it holds no entries.
+func NormalizeLyrics(lyrics Lyrics) Lyrics {
+	normalized := lyrics
+	normalized.Line = NormalizeCueLines(lyrics.Line)
+	if len(normalized.Agents) == 0 {
+		normalized.Agents = nil
+	}
+	return normalized
+}
+
+// NormalizeCueLines returns a new slice in which every line has had its cue
+// timing normalized by normalizeCueLine. An empty input is returned as-is.
+//
+// For each line the fallback end handed to normalizeCueLine is the line's own
+// End when set, otherwise the Start of the following input line when there is
+// one, otherwise nil.
+func NormalizeCueLines(lines []Line) []Line {
+	if len(lines) == 0 {
+		return lines
+	}
+
+	out := make([]Line, len(lines))
+	for i, line := range lines {
+		var fallback *int64
+		switch {
+		case line.End != nil:
+			end := *line.End
+			fallback = &end
+		case i+1 < len(lines) && lines[i+1].Start != nil:
+			next := *lines[i+1].Start
+			fallback = &next
+		}
+		out[i] = normalizeCueLine(line, fallback)
+	}
+	return out
+}
+
+// NormalizeLineTiming fills in a line's missing Start and End from its cues.
+//
+// Start defaults to the earliest cue Start. End defaults to the latest cue
+// boundary, where a cue's boundary is its End or, when that is unset, its
+// Start. Values already present on the line are never overwritten, and a line
+// without cues is returned unchanged. Derived values are stored in freshly
+// allocated integers, so the line never shares pointers with its cues.
+func NormalizeLineTiming(line Line) Line {
+	if len(line.Cue) == 0 {
+		return line
+	}
+
+	var (
+		minStart, maxEnd   int64
+		haveStart, haveEnd bool
+	)
+	for _, cue := range line.Cue {
+		if cue.Start != nil && (!haveStart || *cue.Start < minStart) {
+			minStart, haveStart = *cue.Start, true
+		}
+
+		bound := cue.End
+		if bound == nil {
+			bound = cue.Start
+		}
+		if bound != nil && (!haveEnd || *bound > maxEnd) {
+			maxEnd, haveEnd = *bound, true
+		}
+	}
+
+	if line.Start == nil && haveStart {
+		line.Start = &minStart
+	}
+	if line.End == nil && haveEnd {
+		line.End = &maxEnd
+	}
+	return line
+}
+
+// normalizeCueLine makes the End values of a line's cues consistent and then
+// fills in the line's own timing via NormalizeLineTiming.
+//
+// The outcome is all-or-nothing:
+//   - when no cue has an End, all cues are returned without an End;
+//   - otherwise every missing End is filled from the next cue's Start or,
+//     failing that, from fallbackEnd;
+//   - if some cue still cannot be given an End, every End is cleared.
+//
+// The cues of the input line are never modified: all changes are made on a
+// private copy, so callers that still hold the original slice do not observe
+// partially filled End values.
+func normalizeCueLine(line Line, fallbackEnd *int64) Line {
+	if len(line.Cue) == 0 {
+		return line
+	}
+
+	hasAnyEnd := false
+	for i := range line.Cue {
+		if line.Cue[i].End != nil {
+			hasAnyEnd = true
+			break
+		}
+	}
+	if !hasAnyEnd {
+		line.Cue = clearCueEnds(line.Cue)
+		return NormalizeLineTiming(line)
+	}
+
+	// Fill on a copy; the caller's backing array is shared with its own lines.
+	cues := make([]Cue, len(line.Cue))
+	copy(cues, line.Cue)
+	line.Cue = cues
+
+	for i := range cues {
+		if cues[i].End != nil {
+			continue
+		}
+
+		switch {
+		case i+1 < len(cues) && cues[i+1].Start != nil:
+			v := *cues[i+1].Start
+			cues[i].End = &v
+		case fallbackEnd != nil:
+			v := *fallbackEnd
+			cues[i].End = &v
+		default:
+			// This cue cannot be completed, so fall back to start-only cues.
+			line.Cue = clearCueEnds(cues)
+			return NormalizeLineTiming(line)
+		}
+	}
+
+	return NormalizeLineTiming(line)
+}
+
+// clearCueEnds returns a copy of cues in which every End is nil. The input
+// slice is left untouched.
+func clearCueEnds(cues []Cue) []Cue {
+	cleared := make([]Cue, 0, len(cues))
+	for _, cue := range cues {
+		cue.End = nil // cue is a per-iteration copy
+		cleared = append(cleared, cue)
+	}
+	return cleared
+}
diff --git a/model/lyrics_test.go b/model/lyrics_test.go
index 2228306d0..9aad7d968 100644
--- a/model/lyrics_test.go
+++ b/model/lyrics_test.go
@@ -129,8 +129,8 @@ var _ = Describe("ToLyrics", func() {
Expect(line0.Start).To(Equal(&t1000))
Expect(line0.Value).To(Equal("Some lyrics here"))
Expect(line0.Cue).To(Equal([]Cue{
- {Start: &t1000, End: &t1500, Value: "Some "},
- {Start: &t1500, End: &t2000, Value: "lyrics "},
+ {Start: &t1000, Value: "Some "},
+ {Start: &t1500, Value: "lyrics "},
{Start: &t2000, Value: "here"},
}))
@@ -138,7 +138,7 @@ var _ = Describe("ToLyrics", func() {
Expect(line1.Start).To(Equal(&t3000))
Expect(line1.Value).To(Equal("More words"))
Expect(line1.Cue).To(Equal([]Cue{
- {Start: &t3000, End: &t3500, Value: "More "},
+ {Start: &t3000, Value: "More "},
{Start: &t3500, Value: "words"},
}))
})
@@ -161,7 +161,7 @@ var _ = Describe("ToLyrics", func() {
t1000, t1500, t5000, t5500 := int64(1000), int64(1500), int64(5000), int64(5500)
Expect(lyrics.Line[0].Cue).To(Equal([]Cue{
- {Start: &t1000, End: &t1500, Value: "Some "},
+ {Start: &t1000, Value: "Some "},
{Start: &t1500, Value: "lyrics"},
}))
Expect(lyrics.Line[0].Value).To(Equal("Some lyrics"))
@@ -170,7 +170,7 @@ var _ = Describe("ToLyrics", func() {
Expect(lyrics.Line[1].Value).To(Equal("Plain line"))
Expect(lyrics.Line[2].Cue).To(Equal([]Cue{
- {Start: &t5000, End: &t5500, Value: "More "},
+ {Start: &t5000, Value: "More "},
{Start: &t5500, Value: "words"},
}))
Expect(lyrics.Line[2].Value).To(Equal("More words"))
diff --git a/server/subsonic/helpers.go b/server/subsonic/helpers.go
index 305f1818e..7545a71c0 100644
--- a/server/subsonic/helpers.go
+++ b/server/subsonic/helpers.go
@@ -476,14 +476,22 @@ func mapExplicitStatus(explicitStatus string) string {
return ""
}
-// sanitizeRole strips the TTML x- prefix from role values for the API.
-func sanitizeRole(role string) string {
- return strings.TrimPrefix(role, "x-")
-}
-
func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced bool) responses.StructuredLyric {
lines := make([]responses.Line, len(lyrics.Line))
var cueLines []responses.CueLine
+ agentOrderByID := make(map[string]int, len(lyrics.Agents))
+ agentRoleByID := make(map[string]string, len(lyrics.Agents))
+ responseAgents := make([]responses.Agent, 0, len(lyrics.Agents))
+
+ for i, agent := range lyrics.Agents {
+ agentOrderByID[agent.ID] = i
+ agentRoleByID[agent.ID] = agent.Role
+ responseAgents = append(responseAgents, responses.Agent{
+ ID: agent.ID,
+ Role: agent.Role,
+ Name: agent.Name,
+ })
+ }
for i, line := range lyrics.Line {
lines[i] = responses.Line{
@@ -494,41 +502,50 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced boo
continue
}
- // Group cues by role, preserving order of first appearance
- roleOrder := make([]string, 0, 2)
- cuesByRole := make(map[string][]responses.LyricCue)
+ agentOrder := make([]string, 0, 2)
+ cuesByAgent := make(map[string][]model.Cue)
for _, cue := range line.Cue {
if cue.Start == nil {
continue
}
- role := sanitizeRole(cue.Role)
- if _, exists := cuesByRole[role]; !exists {
- roleOrder = append(roleOrder, role)
+ agentID := strings.TrimSpace(cue.AgentID)
+ if _, exists := cuesByAgent[agentID]; !exists {
+ agentOrder = append(agentOrder, agentID)
}
- cuesByRole[role] = append(cuesByRole[role], responses.LyricCue{
- Start: *cue.Start,
- End: cue.End,
- Value: cue.Value,
- })
+ cuesByAgent[agentID] = append(cuesByAgent[agentID], cue)
}
- // Ensure main vocals (empty role) always comes first
- sort.SliceStable(roleOrder, func(i, j int) bool {
- return roleOrder[i] == "" && roleOrder[j] != ""
+ sort.SliceStable(agentOrder, func(i, j int) bool {
+ leftRole := agentRoleByID[agentOrder[i]]
+ rightRole := agentRoleByID[agentOrder[j]]
+ if leftRole == "main" && rightRole != "main" {
+ return true
+ }
+ if rightRole == "main" && leftRole != "main" {
+ return false
+ }
+
+ leftOrder, leftOK := agentOrderByID[agentOrder[i]]
+ rightOrder, rightOK := agentOrderByID[agentOrder[j]]
+ if leftOK && rightOK && leftOrder != rightOrder {
+ return leftOrder < rightOrder
+ }
+ if leftOK != rightOK {
+ return leftOK
+ }
+ return i < j
})
- // Create a separate CueLine for each role group
- for _, role := range roleOrder {
- cues := cuesByRole[role]
+ for _, agentID := range agentOrder {
cueLine := responses.CueLine{
Index: int32(i),
Start: line.Start,
End: line.End,
Value: line.Value,
- Cue: cues,
+ Cue: buildLyricCues(cuesByAgent[agentID], line.End),
}
- if role != "" {
- cueLine.Role = role
+ if agentID != "" {
+ cueLine.AgentID = agentID
}
cueLines = append(cueLines, cueLine)
}
@@ -550,6 +567,9 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced boo
kind = "main"
}
structured.Kind = kind
+ if len(cueLines) > 0 && len(responseAgents) > 0 {
+ structured.Agents = responseAgents
+ }
}
if structured.DisplayArtist == "" {
@@ -562,6 +582,67 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced boo
return structured
}
+// buildLyricCues converts model cues into response cues for a single cue line.
+//
+// Cues without a Start are dropped. End values are emitted only when at least
+// one input cue has an End; in that case a missing End is taken from the next
+// input cue's Start or, failing that, from lineEnd. Each End is then clamped
+// so it does not run past the next cue's Start and does not precede the cue's
+// own Start. If any emitted cue still lacks an End after this, all End values
+// are removed so the result is consistently start-only. A nil slice is
+// returned for empty input.
+func buildLyricCues(cues []model.Cue, lineEnd *int64) []responses.LyricCue {
+ if len(cues) == 0 {
+ return nil
+ }
+
+ hasAnyEnd := false
+ for i := range cues {
+ if cues[i].End != nil {
+ hasAnyEnd = true
+ break
+ }
+ }
+
+ normalized := make([]responses.LyricCue, 0, len(cues))
+ for i := range cues {
+ if cues[i].Start == nil {
+ continue
+ }
+
+ cue := responses.LyricCue{
+ Start: *cues[i].Start,
+ Value: cues[i].Value,
+ }
+ if hasAnyEnd {
+ end := cues[i].End
+ if end == nil {
+ // Fill the gap from the next cue's start, then from the line end.
+ if i+1 < len(cues) && cues[i+1].Start != nil {
+ v := *cues[i+1].Start
+ end = &v
+ } else if lineEnd != nil {
+ v := *lineEnd
+ end = &v
+ }
+ }
+ // Do not let a cue overlap the one that follows it.
+ if end != nil && i+1 < len(cues) && cues[i+1].Start != nil && *end > *cues[i+1].Start {
+ v := *cues[i+1].Start
+ end = &v
+ }
+ // Never report an end earlier than the cue's own start.
+ if end != nil && *end < cue.Start {
+ v := cue.Start
+ end = &v
+ }
+ cue.End = end
+ }
+ normalized = append(normalized, cue)
+ }
+
+ if hasAnyEnd {
+ // Ends are all-or-nothing: one missing end clears them all.
+ for i := range normalized {
+ if normalized[i].End == nil {
+ for j := range normalized {
+ normalized[j].End = nil
+ }
+ break
+ }
+ }
+ }
+
+ return normalized
+}
+
func buildLyricsList(mf *model.MediaFile, lyricsList model.LyricList, enhanced bool) *responses.LyricsList {
var filtered model.LyricList
if enhanced {
diff --git a/server/subsonic/media_retrieval_test.go b/server/subsonic/media_retrieval_test.go
index 0fdbb3854..5489492ce 100644
--- a/server/subsonic/media_retrieval_test.go
+++ b/server/subsonic/media_retrieval_test.go
@@ -235,6 +235,7 @@ var _ = Describe("MediaRetrievalController", func() {
Expect(realLyric.Kind).To(Equal(expectedLyric.Kind))
Expect(realLyric.Lang).To(Equal(expectedLyric.Lang))
Expect(realLyric.Synced).To(Equal(expectedLyric.Synced))
+ Expect(realLyric.Agents).To(Equal(expectedLyric.Agents))
if expectedLyric.Offset == nil {
Expect(realLyric.Offset).To(BeNil())
@@ -259,7 +260,7 @@ var _ = Describe("MediaRetrievalController", func() {
expectedCueLine := expectedLyric.CueLine[j]
Expect(realCueLine.Index).To(Equal(expectedCueLine.Index))
Expect(realCueLine.Value).To(Equal(expectedCueLine.Value))
- Expect(realCueLine.Role).To(Equal(expectedCueLine.Role))
+ Expect(realCueLine.AgentID).To(Equal(expectedCueLine.AgentID))
if expectedCueLine.Start == nil {
Expect(realCueLine.Start).To(BeNil())
} else {
@@ -542,6 +543,7 @@ var _ = Describe("MediaRetrievalController", func() {
lyricsJson, err := json.Marshal(model.LyricList{
{
Lang: "eng",
+ Agents: []model.Agent{{ID: "lead", Role: "main"}, {ID: "lead__bg", Role: "bg"}},
Synced: true,
Line: []model.Line{
{
@@ -550,15 +552,16 @@ var _ = Describe("MediaRetrievalController", func() {
Value: "Hello echo",
Cue: []model.Cue{
{
- Start: &tokenStartA,
- End: &tokenEndA,
- Value: "Hello",
+ Start: &tokenStartA,
+ End: &tokenEndA,
+ Value: "Hello",
+ AgentID: "lead",
},
{
- Start: &tokenStartB,
- End: &tokenEndB,
- Value: "echo",
- Role: "x-bg",
+ Start: &tokenStartB,
+ End: &tokenEndB,
+ Value: "echo",
+ AgentID: "lead__bg",
},
},
},
@@ -586,6 +589,10 @@ var _ = Describe("MediaRetrievalController", func() {
Kind: "main",
Lang: "eng",
Synced: true,
+ Agents: []responses.Agent{
+ {ID: "lead", Role: "main"},
+ {ID: "lead__bg", Role: "bg"},
+ },
Line: []responses.Line{
{
Start: &lineStart,
@@ -594,10 +601,11 @@ var _ = Describe("MediaRetrievalController", func() {
},
CueLine: []responses.CueLine{
{
- Index: 0,
- Start: &lineStart,
- End: &lineEnd,
- Value: "Hello echo",
+ Index: 0,
+ Start: &lineStart,
+ End: &lineEnd,
+ Value: "Hello echo",
+ AgentID: "lead",
Cue: []responses.LyricCue{
{
Start: tokenStartA,
@@ -607,11 +615,11 @@ var _ = Describe("MediaRetrievalController", func() {
},
},
{
- Index: 0,
- Start: &lineStart,
- End: &lineEnd,
- Value: "Hello echo",
- Role: "bg",
+ Index: 0,
+ Start: &lineStart,
+ End: &lineEnd,
+ Value: "Hello echo",
+ AgentID: "lead__bg",
Cue: []responses.LyricCue{
{
Start: tokenStartB,
diff --git a/server/subsonic/responses/responses.go b/server/subsonic/responses/responses.go
index f5446a961..344dd9999 100644
--- a/server/subsonic/responses/responses.go
+++ b/server/subsonic/responses/responses.go
@@ -543,13 +543,19 @@ type LyricCue struct {
Value string `xml:",chardata" json:"value"`
}
+// Agent describes one agent declared on a StructuredLyric (for example a main
+// or background voice). Cue lines point at an agent through CueLine.AgentID.
+// Name is omitted from the serialized response when empty.
+type Agent struct {
+ ID string `xml:"id,attr" json:"id"`
+ Role string `xml:"role,attr" json:"role"`
+ Name string `xml:"name,attr,omitempty" json:"name,omitempty"`
+}
+
type CueLine struct {
- Index int32 `xml:"index,attr" json:"index"`
- Start *int64 `xml:"start,attr,omitempty" json:"start,omitempty"`
- End *int64 `xml:"end,attr,omitempty" json:"end,omitempty"`
- Value string `xml:"value,attr,omitempty" json:"value,omitempty"`
- Role string `xml:"role,attr,omitempty" json:"role,omitempty"`
- Cue []LyricCue `xml:"cue,omitempty" json:"cue,omitempty"`
+ Index int32 `xml:"index,attr" json:"index"`
+ Start *int64 `xml:"start,attr,omitempty" json:"start,omitempty"`
+ End *int64 `xml:"end,attr,omitempty" json:"end,omitempty"`
+ Value string `xml:"value,attr,omitempty" json:"value,omitempty"`
+ AgentID string `xml:"agentId,attr,omitempty" json:"agentId,omitempty"`
+ Cue []LyricCue `xml:"cue,omitempty" json:"cue,omitempty"`
}
type StructuredLyric struct {
@@ -558,6 +564,7 @@ type StructuredLyric struct {
Kind string `xml:"kind,attr,omitempty" json:"kind,omitempty"`
Lang string `xml:"lang,attr" json:"lang"`
Line []Line `xml:"line" json:"line"`
+ Agents []Agent `xml:"agent,omitempty" json:"agents,omitempty"`
CueLine []CueLine `xml:"cueLine,omitempty" json:"cueLine,omitempty"`
Offset *int64 `xml:"offset,attr,omitempty" json:"offset,omitempty"`
Synced bool `xml:"synced,attr" json:"synced"`
diff --git a/tests/fixtures/test.elrc b/tests/fixtures/test.elrc
new file mode 100644
index 000000000..01c3d2cdd
--- /dev/null
+++ b/tests/fixtures/test.elrc
@@ -0,0 +1,5 @@
+[ar:ELRC Artist]
+[ti:ELRC Song]
+[lang:eng]
+[00:01.00]<00:01.00>Lead <00:01.50>words
+[00:03.00]Fallback line
diff --git a/tests/fixtures/test.srt b/tests/fixtures/test.srt
new file mode 100644
index 000000000..3c9c09a39
--- /dev/null
+++ b/tests/fixtures/test.srt
@@ -0,0 +1,7 @@
+1
+00:00:18,800 --> 00:00:22,800
+We're from subtitles
+
+2
+00:00:22,801 --> 00:00:26,000
+Another subtitle line
diff --git a/ui/src/audioplayer/PlayerToolbar.jsx b/ui/src/audioplayer/PlayerToolbar.jsx
index 869df475d..8487b0655 100644
--- a/ui/src/audioplayer/PlayerToolbar.jsx
+++ b/ui/src/audioplayer/PlayerToolbar.jsx
@@ -108,7 +108,7 @@ const PlayerToolbar = ({
)
const toggleLyricsButton = (
-
+
Array.isArray(lyric.line) &&
lyric.line.some((line) => Number.isFinite(Number(line.start)))
+// Narrow a list of lyric tracks to the ones with timed lines. When no track is
+// timed, the list is returned unchanged so untimed lyrics remain selectable.
+const preferTimedLyrics = (lyrics) => {
+  const timedOnly = lyrics.filter(hasTimedLines)
+  if (timedOnly.length === 0) {
+    return lyrics
+  }
+  return timedOnly
+}
+
const normalizeToken = (token) => {
if (!token) {
return null
@@ -77,10 +82,38 @@ const normalizeToken = (token) => {
}
}
-const normalizeCueLine = (cueLine, fallbackIndex) => {
+// Index the agents declared on a structured lyric by their id.
+// Entries without a non-empty string id are ignored, the first entry wins when
+// an id repeats, and non-string role/name values are stored as ''.
+const buildAgentLookup = (structuredLyric) => {
+  const stringOrEmpty = (value) => (typeof value === 'string' ? value : '')
+  const byId = new Map()
+  const declared = structuredLyric?.agents
+  if (!Array.isArray(declared)) {
+    return byId
+  }
+  for (const entry of declared) {
+    const id = stringOrEmpty(entry?.id)
+    if (id !== '' && !byId.has(id)) {
+      byId.set(id, {
+        id,
+        role: stringOrEmpty(entry?.role),
+        name: stringOrEmpty(entry?.name),
+      })
+    }
+  }
+  return byId
+}
+
+// Translate an agent's declared role into the role value used on UI tokens.
+// A missing role and the 'main' role both become ''; any other role is
+// passed through unchanged.
+const deriveUiRole = (agent) => {
+  const declared = agent?.role
+  return declared && declared !== 'main' ? declared : ''
+}
+
+const normalizeCueLine = (cueLine, fallbackIndex, agentLookup) => {
const index = Number.isFinite(Number(cueLine?.index))
? Number(cueLine.index)
: fallbackIndex
+ const agentId = typeof cueLine?.agentId === 'string' ? cueLine.agentId : ''
+ const agent = agentId ? agentLookup.get(agentId) || null : null
+ const fallbackRole =
+ typeof cueLine?.role === 'string' ? cueLine.role : ''
const tokens = sortTokensByStart(
Array.isArray(cueLine?.cue)
? cueLine.cue.map(normalizeToken).filter(Boolean)
@@ -92,7 +125,10 @@ const normalizeCueLine = (cueLine, fallbackIndex) => {
start: toTime(cueLine?.start),
end: toTime(cueLine?.end),
value: typeof cueLine?.value === 'string' ? cueLine.value : '',
- role: typeof cueLine?.role === 'string' ? cueLine.role : '',
+ role: agent ? deriveUiRole(agent) : fallbackRole,
+ agentId,
+ agentRole: agent?.role || fallbackRole,
+ agentName: agent?.name || '',
tokens,
}
}
@@ -194,6 +230,9 @@ const buildSyntheticWordTokens = (line, token) => {
end: baseStart + (duration * (idx + 1)) / chunks.length,
value: chunk,
role: typeof token?.role === 'string' ? token.role : '',
+ agentId: typeof token?.agentId === 'string' ? token.agentId : '',
+ agentName: typeof token?.agentName === 'string' ? token.agentName : '',
+ agentRole: typeof token?.agentRole === 'string' ? token.agentRole : '',
}))
}
@@ -240,8 +279,8 @@ export const selectLyricLayers = (structuredLyrics, preferredLanguage) => {
}
}
- const synced = structuredLyrics.filter(hasTimedLines)
- if (synced.length === 0) {
+ const available = structuredLyrics.filter(hasStructuredLyricContent)
+ if (available.length === 0) {
return {
main: null,
translation: null,
@@ -255,22 +294,25 @@ export const selectLyricLayers = (structuredLyrics, preferredLanguage) => {
[LYRIC_KIND_PRONUNCIATION]: [],
}
- for (const lyric of synced) {
+ for (const lyric of available) {
grouped[normalizeLyricKind(lyric?.kind)].push(lyric)
}
const mainCandidates = grouped[LYRIC_KIND_MAIN].length
? grouped[LYRIC_KIND_MAIN]
- : synced
+ : available
return {
- main: pickLyricByLanguage(mainCandidates, preferredLanguage),
+ main: pickLyricByLanguage(
+ preferTimedLyrics(mainCandidates),
+ preferredLanguage,
+ ),
translation: pickLyricByLanguage(
- grouped[LYRIC_KIND_TRANSLATION],
+ preferTimedLyrics(grouped[LYRIC_KIND_TRANSLATION]),
preferredLanguage,
),
pronunciation: pickLyricByLanguage(
- grouped[LYRIC_KIND_PRONUNCIATION],
+ preferTimedLyrics(grouped[LYRIC_KIND_PRONUNCIATION]),
preferredLanguage,
),
}
@@ -316,6 +358,7 @@ export const buildKaraokeLines = (structuredLyric) => {
return []
}
+ const agentLookup = buildAgentLookup(structuredLyric)
const baseLines = Array.isArray(structuredLyric.line)
? structuredLyric.line
: []
@@ -328,12 +371,19 @@ export const buildKaraokeLines = (structuredLyric) => {
? (() => {
const normalizedCueLines = rawCueLines.map(
(cueLine, fallbackIndex) => {
- const normalized = normalizeCueLine(cueLine, fallbackIndex)
+ const normalized = normalizeCueLine(
+ cueLine,
+ fallbackIndex,
+ agentLookup,
+ )
return {
...normalized,
tokens: normalized.tokens.map((token) => ({
...token,
role: normalized.role,
+ agentId: normalized.agentId,
+ agentName: normalized.agentName,
+ agentRole: normalized.agentRole,
})),
}
},
@@ -366,6 +416,9 @@ export const buildKaraokeLines = (structuredLyric) => {
start: first.start ?? toTime(baseLine.start) ?? fallbackStart,
end: first.end ?? toTime(baseLine.end) ?? fallbackEnd,
value,
+ agentId: first.agentId,
+ agentName: first.agentName,
+ agentRole: first.agentRole,
tokens,
}
})
diff --git a/ui/src/audioplayer/lyrics.test.js b/ui/src/audioplayer/lyrics.test.js
index 6cb3a1b87..3a5f83b2d 100644
--- a/ui/src/audioplayer/lyrics.test.js
+++ b/ui/src/audioplayer/lyrics.test.js
@@ -124,6 +124,49 @@ describe('lyrics helpers', () => {
expect(layers.pronunciation).toBeNull()
})
+ it('falls back to unsynced lyric content when no timed track exists', () => {
+ const layers = selectLyricLayers(
+ [
+ {
+ lang: 'eng',
+ synced: false,
+ line: [{ value: 'Plain embedded lyric' }],
+ },
+ ],
+ 'eng',
+ )
+
+ expect(layers.main).toEqual({
+ lang: 'eng',
+ synced: false,
+ line: [{ value: 'Plain embedded lyric' }],
+ })
+ })
+
+ it('still prefers timed lyrics when both timed and untimed tracks exist', () => {
+ const layers = selectLyricLayers(
+ [
+ {
+ lang: 'eng',
+ synced: false,
+ line: [{ value: 'Plain lyric' }],
+ },
+ {
+ lang: 'eng',
+ synced: true,
+ line: [{ start: 1000, value: 'Timed lyric' }],
+ },
+ ],
+ 'eng',
+ )
+
+ expect(layers.main).toEqual({
+ lang: 'eng',
+ synced: true,
+ line: [{ start: 1000, value: 'Timed lyric' }],
+ })
+ })
+
it('matches layer line by timing for the active main line', () => {
const mainLines = [
{ index: 0, start: 1000, end: 1800, value: 'Line A', tokens: [] },
@@ -200,43 +243,88 @@ describe('lyrics helpers', () => {
expect(getPreferredLyricLanguage()).toBe('pt-BR')
})
- it('builds karaoke lines from cueLine payload', () => {
+ it('builds karaoke lines from agent-based cueLine payload', () => {
+ const lines = buildKaraokeLines({
+ lang: 'eng',
+ synced: true,
+ line: [{ start: 1000, end: 3000, value: 'Hello world' }],
+ agents: [
+ { id: 'lead', role: 'main', name: 'Lead Vocal' },
+ { id: 'backing', role: 'bg' },
+ ],
+ cueLine: [
+ {
+ index: 0,
+ start: 1000,
+ end: 3000,
+ value: 'Hello world',
+ agentId: 'lead',
+ cue: [{ start: 1000, end: 1500, value: 'Hello' }],
+ },
+ {
+ index: 0,
+ start: 1000,
+ end: 3000,
+ value: 'Hello world',
+ agentId: 'backing',
+ cue: [{ start: 2000, end: 2500, value: 'world' }],
+ },
+ ],
+ })
+
+ expect(lines).toEqual([
+ {
+ agentId: 'lead',
+ agentName: 'Lead Vocal',
+ agentRole: 'main',
+ index: 0,
+ start: 1000,
+ end: 3000,
+ value: 'Hello world',
+ tokens: [
+ {
+ start: 1000,
+ end: 1500,
+ value: 'Hello',
+ role: '',
+ agentId: 'lead',
+ agentName: 'Lead Vocal',
+ agentRole: 'main',
+ },
+ {
+ start: 2000,
+ end: 2500,
+ value: 'world',
+ role: 'bg',
+ agentId: 'backing',
+ agentName: '',
+ agentRole: 'bg',
+ },
+ ],
+ },
+ ])
+ })
+
+ it('falls back to legacy cueLine role values when agents are absent', () => {
const lines = buildKaraokeLines({
lang: 'eng',
synced: true,
line: [{ start: 1000, end: 3000, value: 'Hello world' }],
cueLine: [
- {
- index: 0,
- start: 1000,
- end: 3000,
- value: 'Hello world',
- role: '',
- cue: [{ start: 1000, end: 1500, value: 'Hello' }],
- },
{
index: 0,
start: 1000,
end: 3000,
value: 'Hello world',
role: 'bg',
- cue: [{ start: 2000, end: 2500, value: 'world' }],
+ cue: [{ start: 1000, end: 1500, value: 'Hello' }],
},
],
})
- expect(lines).toEqual([
- {
- index: 0,
- start: 1000,
- end: 3000,
- value: 'Hello world',
- tokens: [
- { start: 1000, end: 1500, value: 'Hello', role: '' },
- { start: 2000, end: 2500, value: 'world', role: 'bg' },
- ],
- },
- ])
+ expect(lines[0].tokens[0].role).toBe('bg')
+ expect(lines[0].tokens[0].agentId).toBe('')
+ expect(lines[0].tokens[0].agentName).toBe('')
})
it('sorts token timing by start to keep playback stable', () => {