diff --git a/conf/configuration.go b/conf/configuration.go index 1d4c3a348..530ebe785 100644 --- a/conf/configuration.go +++ b/conf/configuration.go @@ -677,7 +677,7 @@ func setViperDefaults() { viper.SetDefault("coverartquality", 75) viper.SetDefault("artistartpriority", "artist.*, album/artist.*, external") viper.SetDefault("discartpriority", "disc*.*, cd*.*, cover.*, folder.*, front.*, discsubtitle, embedded") - viper.SetDefault("lyricspriority", ".lrc,.ttml,.txt,embedded") + viper.SetDefault("lyricspriority", ".ttml,.elrc,.lrc,.srt,.txt,embedded") viper.SetDefault("enablegravatar", false) viper.SetDefault("enablefavourites", true) viper.SetDefault("enablestarrating", true) diff --git a/core/lyrics/lyrics_test.go b/core/lyrics/lyrics_test.go index d5f79a4d0..58e8ba82b 100644 --- a/core/lyrics/lyrics_test.go +++ b/core/lyrics/lyrics_test.go @@ -44,6 +44,36 @@ var _ = Describe("sources", func() { }, } + elrcLyrics := model.LyricList{ + model.Lyrics{ + DisplayArtist: "ELRC Artist", + DisplayTitle: "ELRC Song", + Lang: "eng", + Line: []model.Line{ + { + Start: gg.P(int64(1000)), + End: gg.P(int64(1500)), + Value: "Lead words", + Cue: []model.Cue{ + { + Start: gg.P(int64(1000)), + Value: "Lead ", + }, + { + Start: gg.P(int64(1500)), + Value: "words", + }, + }, + }, + { + Start: gg.P(int64(3000)), + Value: "Fallback line", + }, + }, + Synced: true, + }, + } + ttmlLyrics := model.LyricList{ model.Lyrics{ Kind: "main", @@ -88,6 +118,25 @@ var _ = Describe("sources", func() { }, } + srtLyrics := model.LyricList{ + model.Lyrics{ + Lang: "xxx", + Line: []model.Line{ + { + Start: gg.P(int64(18800)), + End: gg.P(int64(22800)), + Value: "We're from subtitles", + }, + { + Start: gg.P(int64(22801)), + End: gg.P(int64(26000)), + Value: "Another subtitle line", + }, + }, + Synced: true, + }, + } + BeforeEach(func() { DeferCleanup(configtest.SetupConfig()) @@ -109,8 +158,10 @@ var _ = Describe("sources", func() { }, Entry("embedded > lrc > txt", "embedded,.lrc,.txt", embeddedLyrics), Entry("lrc > embedded > txt", ".lrc,embedded,.txt", syncedLyrics), + Entry("elrc > lrc > embedded", ".elrc,.lrc,embedded", elrcLyrics), + Entry("srt > txt > embedded", ".srt,.txt,embedded", srtLyrics), Entry("txt > lrc > embedded", ".txt,.lrc,embedded", unsyncedLyrics), - Entry("ttml > lrc > embedded", ".ttml,.lrc,embedded", ttmlLyrics)) + Entry("ttml > elrc > lrc > srt > embedded", ".ttml,.elrc,.lrc,.srt,embedded", ttmlLyrics)) Context("Errors", func() { var RegularUserContext = XContext diff --git a/core/lyrics/sources.go b/core/lyrics/sources.go index 38a71cb8a..7586c944f 100644 --- a/core/lyrics/sources.go +++ b/core/lyrics/sources.go @@ -38,13 +38,20 @@ func fromExternalFile(ctx context.Context, mf *model.MediaFile, suffix string) ( } var list model.LyricList - if strings.EqualFold(suffix, ".ttml") { + switch { + case strings.EqualFold(suffix, ".ttml"): list, err = parseTTML(contents) if err != nil { log.Error(ctx, "error parsing ttml external file", "path", externalLyric, err) return nil, err } - } else { + case strings.EqualFold(suffix, ".srt"): + list, err = parseSRT(contents) + if err != nil { + log.Error(ctx, "error parsing srt external file", "path", externalLyric, err) + return nil, err + } + default: lyrics, err := model.ToLyrics("xxx", string(contents)) if err != nil { log.Error(ctx, "error parsing lyric external file", "path", externalLyric, err) diff --git a/core/lyrics/sources_test.go b/core/lyrics/sources_test.go index 3dd2825e6..a110390d8 100644 --- a/core/lyrics/sources_test.go +++ b/core/lyrics/sources_test.go @@ -106,10 +106,10 @@ var _ = Describe("sources", func() { Expect(lyrics[0].Line[0].Cue).To(HaveLen(3)) Expect(*lyrics[0].Line[0].Cue[0].Start).To(Equal(int64(1000))) Expect(lyrics[0].Line[0].Cue[0].Value).To(Equal("Some ")) - Expect(*lyrics[0].Line[0].Cue[0].End).To(Equal(int64(1500))) + Expect(lyrics[0].Line[0].Cue[0].End).To(BeNil()) Expect(*lyrics[0].Line[0].Cue[1].Start).To(Equal(int64(1500))) Expect(lyrics[0].Line[0].Cue[1].Value).To(Equal("lyrics ")) - Expect(*lyrics[0].Line[0].Cue[1].End).To(Equal(int64(2000))) + Expect(lyrics[0].Line[0].Cue[1].End).To(BeNil()) Expect(*lyrics[0].Line[0].Cue[2].Start).To(Equal(int64(2000))) Expect(lyrics[0].Line[0].Cue[2].Value).To(Equal("here")) Expect(lyrics[0].Line[0].Cue[2].End).To(BeNil()) @@ -125,6 +125,33 @@ var _ = Describe("sources", func() { Expect(lyrics[0].Line[2].Cue).To(BeNil()) }) + It("should return Enhanced LRC lyrics from an ELRC file", func() { + mf := model.MediaFile{Path: "tests/fixtures/test.mp3"} + lyrics, err := fromExternalFile(ctx, &mf, ".elrc") + + Expect(err).To(BeNil()) + Expect(lyrics).To(HaveLen(1)) + Expect(lyrics[0].DisplayArtist).To(Equal("ELRC Artist")) + Expect(lyrics[0].DisplayTitle).To(Equal("ELRC Song")) + Expect(lyrics[0].Lang).To(Equal("eng")) + Expect(lyrics[0].Synced).To(BeTrue()) + Expect(lyrics[0].Line).To(HaveLen(2)) + + Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(1000)))) + Expect(lyrics[0].Line[0].Value).To(Equal("Lead words")) + Expect(lyrics[0].Line[0].Cue).To(HaveLen(2)) + Expect(*lyrics[0].Line[0].Cue[0].Start).To(Equal(int64(1000))) + Expect(lyrics[0].Line[0].Cue[0].Value).To(Equal("Lead ")) + Expect(lyrics[0].Line[0].Cue[0].End).To(BeNil()) + Expect(*lyrics[0].Line[0].Cue[1].Start).To(Equal(int64(1500))) + Expect(lyrics[0].Line[0].Cue[1].Value).To(Equal("words")) + Expect(lyrics[0].Line[0].Cue[1].End).To(BeNil()) + + Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(3000)))) + Expect(lyrics[0].Line[1].Value).To(Equal("Fallback line")) + Expect(lyrics[0].Line[1].Cue).To(BeNil()) + }) + It("should return unsynchronized lyrics from a file", func() { mf := model.MediaFile{Path: "tests/fixtures/test.mp3"} lyrics, err := fromExternalFile(ctx, &mf, ".txt") @@ -146,6 +173,31 @@ var _ = Describe("sources", func() { })) }) + It("should return synchronized lyrics from an SRT file", func() { + mf := model.MediaFile{Path: "tests/fixtures/test.mp3"} + lyrics, err := fromExternalFile(ctx, &mf, ".srt") + + Expect(err).To(BeNil()) + Expect(lyrics).To(Equal(model.LyricList{ + model.Lyrics{ + Lang: "xxx", + Line: []model.Line{ + { + Start: gg.P(int64(18800)), + End: gg.P(int64(22800)), + Value: "We're from subtitles", + }, + { + Start: gg.P(int64(22801)), + End: gg.P(int64(26000)), + Value: "Another subtitle line", + }, + }, + Synced: true, + }, + })) + }) + It("should return synchronized multilingual lyrics from a TTML file", func() { mf := model.MediaFile{Path: "tests/fixtures/test.mp3"} lyrics, err := fromExternalFile(ctx, &mf, ".ttml") diff --git a/core/lyrics/srt.go b/core/lyrics/srt.go new file mode 100644 index 000000000..8fd77abb4 --- /dev/null +++ b/core/lyrics/srt.go @@ -0,0 +1,161 @@ +package lyrics + +import ( + "bytes" + "regexp" + "strconv" + "strings" + + "github.com/navidrome/navidrome/model" + "github.com/navidrome/navidrome/utils/str" +) + +var srtTimeRegex = regexp.MustCompile(`^\s*(\d{1,2}):(\d{2}):(\d{2})[,.](\d{1,3})\s*$`) + +func parseSRT(contents []byte) (model.LyricList, error) { + raw := strings.ReplaceAll(string(contents), "\r\n", "\n") + raw = strings.ReplaceAll(raw, "\r", "\n") + + blocks := splitSRTBlocks(raw) + lines := make([]model.Line, 0, len(blocks)) + + for _, block := range blocks { + line, ok, err := parseSRTBlock(block) + if err != nil { + return nil, err + } + if ok { + lines = append(lines, line) + } + } + + if len(lines) == 0 { + return nil, nil + } + + lyrics := model.NormalizeLyrics(model.Lyrics{ + Lang: "xxx", + Line: lines, + Synced: true, + }) + return model.LyricList{lyrics}, nil +} + +func splitSRTBlocks(raw string) []string { + raw = strings.TrimSpace(raw) + if raw == "" { + return nil + } + + parts := strings.Split(raw, "\n\n") + blocks := make([]string, 0, len(parts)) + for _, part := range parts { + part = strings.TrimSpace(part) + if part != "" { + blocks = append(blocks, part) + } + } + return blocks +} + +func parseSRTBlock(block string) (model.Line, bool, error) { + scanner := bytes.Split([]byte(block), []byte("\n")) + if len(scanner) == 0 { + return model.Line{}, false, nil + } + + lines := make([]string, 0, len(scanner)) + for _, line := range scanner { + lines = append(lines, strings.TrimSpace(string(line))) + } + + if len(lines) == 0 { + return model.Line{}, false, nil + } + + startIdx := 0 + if digitsOnly(lines[0]) { + startIdx = 1 + } + if startIdx >= len(lines) { + return model.Line{}, false, nil + } + + timing := strings.Split(lines[startIdx], "-->") + if len(timing) != 2 { + return model.Line{}, false, nil + } + + startMs, err := parseSRTTime(timing[0]) + if err != nil { + return model.Line{}, false, err + } + endMs, err := parseSRTTime(timing[1]) + if err != nil { + return model.Line{}, false, err + } + + textLines := make([]string, 0, len(lines)-startIdx-1) + for _, line := range lines[startIdx+1:] { + if line == "" { + continue + } + textLines = append(textLines, line) + } + + value := str.SanitizeText(strings.Join(textLines, "\n")) + if value == "" { + return model.Line{}, false, nil + } + + return model.Line{ + Start: &startMs, + End: &endMs, + Value: value, + }, true, nil +} + +func parseSRTTime(value string) (int64, error) { + match := srtTimeRegex.FindStringSubmatch(strings.TrimSpace(value)) + if match == nil { + return 0, strconv.ErrSyntax + } + + hours, err := strconv.ParseInt(match[1], 10, 64) + if err != nil { + return 0, err + } + minutes, err := strconv.ParseInt(match[2], 10, 64) + if err != nil { + return 0, err + } + seconds, err := strconv.ParseInt(match[3], 10, 64) + if err != nil { + return 0, err + } + millis, err := strconv.ParseInt(match[4], 10, 64) + if err != nil { + return 0, err + } + + switch len(match[4]) { + case 1: + millis *= 100 + case 2: + millis *= 10 + } + + return (((hours*60)+minutes)*60+seconds)*1000 + millis, nil +} + +func digitsOnly(value string) bool { + if value == "" { + return false + } + for _, ch := range value { + if ch < '0' || ch > '9' { + return false + } + } + return true +} diff --git a/core/lyrics/ttml.go b/core/lyrics/ttml.go index a0bdcac5a..e79dfe846 100644 --- a/core/lyrics/ttml.go +++ b/core/lyrics/ttml.go @@ -46,6 +46,7 @@ type ttmlTimingParams struct { type ttmlTimingContext struct { lang string role string + agentID string begin int64 hasBegin bool end int64 @@ -70,6 +71,12 @@ type ttmlResolvedMetadataLine struct { line model.Line } +type ttmlDefinedAgent struct { + ID string + Type string + Name string +} + type ttmlParser struct { decoder *xml.Decoder params ttmlTimingParams @@ -86,6 +93,8 @@ type ttmlParser struct { pronunciationLangOrder []string pronunciationEntriesByLg map[string][]ttmlMetadataEntry + definedAgents map[string]ttmlDefinedAgent + metadataSeq int } @@ -103,6 +112,7 @@ func parseTTML(contents []byte) (model.LyricList, error) { mainLineRefsByKey: make(map[string]ttmlLineRef), translationEntriesByLg: make(map[string][]ttmlMetadataEntry), pronunciationEntriesByLg: make(map[string][]ttmlMetadataEntry), + definedAgents: make(map[string]ttmlDefinedAgent), } root := ttmlTimingContext{lang: "xxx"} @@ -140,6 +150,8 @@ func (p *ttmlParser) parseElement(start xml.StartElement, parent ttmlTimingConte return p.parseMetadataTrack(start, parent, ttmlLyricKindTranslation) case "transliteration": return p.parseMetadataTrack(start, parent, ttmlLyricKindPronunciation) + case "agent": + return p.parseAgentDefinition(start) } ctx := p.childContext(start.Attr, parent) @@ -234,6 +246,49 @@ func (p *ttmlParser) parseMetadataTrack(start xml.StartElement, parent ttmlTimin } } +func (p *ttmlParser) parseAgentDefinition(start xml.StartElement) error { + id, ok := attrValue(start.Attr, "id") + id = strings.TrimSpace(id) + if !ok || id == "" { + return p.skipElement(start) + } + + agent := ttmlDefinedAgent{ + ID: id, + Type: strings.ToLower(strings.TrimSpace(attrOrEmpty(start.Attr, "type"))), + } + + for { + token, err := p.decoder.Token() + if err != nil { + return err + } + + switch t := token.(type) { + case xml.StartElement: + if strings.EqualFold(t.Name.Local, "name") { + name, err := p.collectElementText(t) + if err != nil { + return err + } + name = sanitizeTTMLText(name) + if name != "" && agent.Name == "" { + agent.Name = name + } + continue + } + if err := p.skipElement(t); err != nil { + return err + } + case xml.EndElement: + if strings.EqualFold(t.Name.Local, start.Name.Local) { + p.definedAgents[agent.ID] = agent + return nil + } + } + } +} + func (p *ttmlParser) parseMetadataText(start xml.StartElement, parent ttmlTimingContext) (ttmlMetadataEntry, bool, error) { forKey, hasFor := attrValue(start.Attr, "for") forKey = strings.TrimSpace(forKey) @@ -338,8 +393,8 @@ func (p *ttmlParser) parseInlineElement(start xml.StartElement, parent ttmlTimin tokenText := sanitizeTTMLText(value) if local == "span" && hasOwnTiming && !ctx.invalid && tokenText != "" && len(tokens) == 0 { parsedToken := model.Cue{ - Value: tokenText, - Role: ctx.role, + Value: tokenText, + AgentID: p.resolveCueAgentID(ctx), } if ctx.hasBegin { startMs := ctx.begin @@ -366,12 +421,12 @@ func (p *ttmlParser) toLyricList() model.LyricList { if len(lines) == 0 { continue } - res = append(res, model.Lyrics{ + res = append(res, p.finalizeLyrics(model.Lyrics{ Kind: ttmlLyricKindMain, Lang: lang, Line: lines, Synced: linesAreSynced(lines), - }) + })) } res = append(res, p.buildMetadataLyrics(ttmlLyricKindTranslation, p.translationLangOrder, p.translationEntriesByLg)...) @@ -440,17 +495,168 @@ func (p *ttmlParser) buildMetadataLyrics(kind string, langOrder []string, entrie lines[i] = resolved[i].line } - res = append(res, model.Lyrics{ + res = append(res, p.finalizeLyrics(model.Lyrics{ Kind: kind, Lang: lang, Line: lines, Synced: linesAreSynced(lines), - }) + })) } return res } +func (p *ttmlParser) finalizeLyrics(lyrics model.Lyrics) model.Lyrics { + lyrics.Line = model.NormalizeCueLines(lyrics.Line) + lyrics.Line, lyrics.Agents = p.resolveAgents(lyrics.Line) + return model.NormalizeLyrics(lyrics) +} + +func (p *ttmlParser) resolveAgents(lines []model.Line) ([]model.Line, []model.Agent) { + if len(lines) == 0 { + return lines, nil + } + + normalized := model.NormalizeCueLines(lines) + usedOrder := make([]string, 0, 4) + usedSet := make(map[string]struct{}, 4) + sawEmptyCue := false + + for i := range normalized { + for j := range normalized[i].Cue { + agentID := strings.TrimSpace(normalized[i].Cue[j].AgentID) + if agentID == "" { + sawEmptyCue = true + continue + } + if _, exists := usedSet[agentID]; !exists { + usedSet[agentID] = struct{}{} + usedOrder = append(usedOrder, agentID) + } + } + } + + if len(usedOrder) == 0 { + return normalized, nil + } + + mainID := "" + for _, agentID := range usedOrder { + role := p.baseRoleForAgent(agentID) + if role != "bg" && role != "group" { + mainID = agentID + break + } + } + if mainID == "" && sawEmptyCue { + mainID = "main" + } + if mainID == "" { + for _, agentID := range usedOrder { + if p.baseRoleForAgent(agentID) != "bg" { + mainID = agentID + break + } + } + } + if mainID == "" { + mainID = usedOrder[0] + } + + if _, exists := usedSet[mainID]; !exists { + usedSet[mainID] = struct{}{} + usedOrder = append([]string{mainID}, usedOrder...) + } + + for i := range normalized { + for j := range normalized[i].Cue { + if strings.TrimSpace(normalized[i].Cue[j].AgentID) == "" { + normalized[i].Cue[j].AgentID = mainID + } + } + } + + agents := make([]model.Agent, 0, len(usedOrder)) + for _, agentID := range usedOrder { + role := p.baseRoleForAgent(agentID) + if agentID == mainID { + role = "main" + } + agent := model.Agent{ + ID: agentID, + Role: role, + Name: p.agentNameForID(agentID), + } + agents = append(agents, agent) + } + + return normalized, agents +} + +func (p *ttmlParser) resolveCueAgentID(ctx ttmlTimingContext) string { + agentID := strings.TrimSpace(ctx.agentID) + if contextHasRole(ctx.role, "x-bg") { + if agentID == "" { + agentID = "main" + } + return backgroundAgentID(agentID) + } + return agentID +} + +func (p *ttmlParser) baseRoleForAgent(agentID string) string { + if isBackgroundAgentID(agentID) { + return "bg" + } + + if agent, ok := p.definedAgents[agentID]; ok { + switch agent.Type { + case "group": + return "group" + default: + return "voice" + } + } + + return "voice" +} + +func (p *ttmlParser) agentNameForID(agentID string) string { + if isBackgroundAgentID(agentID) { + baseID := strings.TrimSuffix(agentID, "__bg") + if baseID == "main" { + return "" + } + if agent, ok := p.definedAgents[baseID]; ok { + return agent.Name + } + return "" + } + + if agent, ok := p.definedAgents[agentID]; ok { + return agent.Name + } + + return "" +} + +func backgroundAgentID(agentID string) string { + return agentID + "__bg" +} + +func isBackgroundAgentID(agentID string) bool { + return strings.HasSuffix(agentID, "__bg") +} + +func contextHasRole(roles string, role string) bool { + for _, candidate := range strings.Fields(strings.ToLower(roles)) { + if candidate == strings.ToLower(role) { + return true + } + } + return false +} + func (p *ttmlParser) addMainLine(lang string, lineKey string, line model.Line) { lang = normalizeTTMLLang(lang) if _, ok := p.mainLinesByLang[lang]; !ok { @@ -495,6 +701,9 @@ func (p *ttmlParser) childContext(attrs []xml.Attr, parent ttmlTimingContext) tt if lang, ok := attrValue(attrs, "lang"); ok { ctx.lang = normalizeTTMLLang(lang) } + if agentID, ok := attrValue(attrs, "agent"); ok { + ctx.agentID = strings.TrimSpace(agentID) + } if role, ok := attrValue(attrs, "role"); ok { role = strings.TrimSpace(role) if role != "" { @@ -805,6 +1014,55 @@ func attrValue(attrs []xml.Attr, key string) (string, bool) { return "", false } +func attrOrEmpty(attrs []xml.Attr, key string) string { + value, _ := attrValue(attrs, key) + return value +} + +func (p *ttmlParser) collectElementText(start xml.StartElement) (string, error) { + var text strings.Builder + + for { + token, err := p.decoder.Token() + if err != nil { + return "", err + } + + switch t := token.(type) { + case xml.StartElement: + value, err := p.collectElementText(t) + if err != nil { + return "", err + } + text.WriteString(value) + case xml.EndElement: + if strings.EqualFold(t.Name.Local, start.Name.Local) { + return text.String(), nil + } + case xml.CharData: + text.WriteString(string(t)) + } + } +} + +func (p *ttmlParser) skipElement(_ xml.StartElement) error { + depth := 1 + for depth > 0 { + token, err := p.decoder.Token() + if err != nil { + return err + } + + switch token.(type) { + case xml.StartElement: + depth++ + case xml.EndElement: + depth-- + } + } + return nil +} + func normalizeTTMLLang(lang string) string { lang = strings.ToLower(strings.TrimSpace(lang)) if lang == "" { @@ -840,42 +1098,7 @@ func linesAreSynced(lines []model.Line) bool { } func hydrateLineTimingFromTokens(line model.Line) model.Line { - if len(line.Cue) == 0 { - return line - } - - var earliestStart *int64 - var latestEnd *int64 - for i := range line.Cue { - token := line.Cue[i] - if token.Start != nil { - if earliestStart == nil || *token.Start < *earliestStart { - v := *token.Start - earliestStart = &v - } - } - - candidateEnd := token.End - if candidateEnd == nil { - candidateEnd = token.Start - } - if candidateEnd != nil { - if latestEnd == nil || *candidateEnd > *latestEnd { - v := *candidateEnd - latestEnd = &v - } - } - } - - if line.Start == nil && earliestStart != nil { - v := *earliestStart - line.Start = &v - } - if line.End == nil && latestEnd != nil { - v := *latestEnd - line.End = &v - } - return line + return model.NormalizeLineTiming(line) } func max(v float64, fallback float64) float64 { diff --git a/core/lyrics/ttml_test.go b/core/lyrics/ttml_test.go index 8ec16f679..5fc484a3b 100644 --- a/core/lyrics/ttml_test.go +++ b/core/lyrics/ttml_test.go @@ -129,6 +129,10 @@ var _ = Describe("parseTTML", func() { list, err := parseTTML(content) Expect(err).ToNot(HaveOccurred()) Expect(list).To(HaveLen(1)) + Expect(list[0].Agents).To(Equal([]model.Agent{ + {ID: "main", Role: "main"}, + {ID: "main__bg", Role: "bg"}, + })) Expect(list[0].Line).To(HaveLen(1)) line := list[0].Line[0] @@ -137,9 +141,41 @@ var _ = Describe("parseTTML", func() { Expect(line.End).To(Equal(gg.P(int64(3000)))) Expect(line.Cue).To(HaveLen(3)) - Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(1000)), End: gg.P(int64(1400)), Value: "He"})) - Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(1400)), End: gg.P(int64(1800)), Value: "llo"})) - Expect(line.Cue[2]).To(Equal(model.Cue{Start: gg.P(int64(2000)), End: gg.P(int64(2500)), Value: "echo", Role: "x-bg"})) + Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(1000)), End: gg.P(int64(1400)), Value: "He", AgentID: "main"})) + Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(1400)), End: gg.P(int64(1800)), Value: "llo", AgentID: "main"})) + Expect(line.Cue[2]).To(Equal(model.Cue{Start: gg.P(int64(2000)), End: gg.P(int64(2500)), Value: "echo", AgentID: "main__bg"})) + }) + + It("should parse named TTML agents into main, voice, and group roles", func() { + content := []byte(` + + + + Chris Martin + Jin + All + + + +
+

You

+

and

+

All

+
+ +
`) + + list, err := parseTTML(content) + Expect(err).ToNot(HaveOccurred()) + Expect(list).To(HaveLen(1)) + Expect(list[0].Agents).To(Equal([]model.Agent{ + {ID: "v1", Role: "main", Name: "Chris Martin"}, + {ID: "v2", Role: "voice", Name: "Jin"}, + {ID: "v1000", Role: "group", Name: "All"}, + })) + Expect(list[0].Line[0].Cue[0].AgentID).To(Equal("v1")) + Expect(list[0].Line[1].Cue[0].AgentID).To(Equal("v2")) + Expect(list[0].Line[2].Cue[0].AgentID).To(Equal("v1000")) }) }) diff --git a/model/lyrics.go b/model/lyrics.go index 9fcd4992e..725c3aa94 100644 --- a/model/lyrics.go +++ b/model/lyrics.go @@ -12,10 +12,16 @@ import ( ) type Cue struct { - Start *int64 `structs:"start,omitempty" json:"start,omitempty"` - End *int64 `structs:"end,omitempty" json:"end,omitempty"` - Value string `structs:"value" json:"value"` - Role string `structs:"role,omitempty" json:"role,omitempty"` + Start *int64 `structs:"start,omitempty" json:"start,omitempty"` + End *int64 `structs:"end,omitempty" json:"end,omitempty"` + Value string `structs:"value" json:"value"` + AgentID string `structs:"agentId,omitempty" json:"agentId,omitempty"` +} + +type Agent struct { + ID string `structs:"id" json:"id"` + Role string `structs:"role" json:"role"` + Name string `structs:"name,omitempty" json:"name,omitempty"` } type Line struct { @@ -26,13 +32,14 @@ type Line struct { } type Lyrics struct { - DisplayArtist string `structs:"displayArtist,omitempty" json:"displayArtist,omitempty"` - DisplayTitle string `structs:"displayTitle,omitempty" json:"displayTitle,omitempty"` - Kind string `structs:"kind,omitempty" json:"kind,omitempty"` - Lang string `structs:"lang" json:"lang"` - Line []Line `structs:"line" json:"line"` - Offset *int64 `structs:"offset,omitempty" json:"offset,omitempty"` - Synced bool `structs:"synced" json:"synced"` + DisplayArtist string `structs:"displayArtist,omitempty" json:"displayArtist,omitempty"` + DisplayTitle string `structs:"displayTitle,omitempty" json:"displayTitle,omitempty"` + Kind string `structs:"kind,omitempty" json:"kind,omitempty"` + Lang string `structs:"lang" json:"lang"` + Agents []Agent `structs:"agents,omitempty" json:"agents,omitempty"` + Line []Line `structs:"line" json:"line"` + Offset *int64 `structs:"offset,omitempty" json:"offset,omitempty"` + Synced bool `structs:"synced" json:"synced"` } // support the standard [mm:ss.mm], as well as [hh:*] and [*.mmm] @@ -199,7 +206,7 @@ func ToLyrics(language, text string) (*Lyrics, error) { DisplayArtist: artist, DisplayTitle: title, Lang: language, - Line: structuredLines, + Line: NormalizeCueLines(structuredLines), Offset: offset, Synced: synced, } @@ -265,11 +272,6 @@ func parseEnhancedCues(text string) []Cue { Start: &start, Value: seg.text, } - // Derive End from the next cue's Start - if i+1 < len(segments) { - end := segments[i+1].start - cues[i].End = &end - } } return cues } @@ -338,3 +340,127 @@ func parseTime(line string, match []int) (int64, error) { } type LyricList []Lyrics + +func NormalizeLyrics(lyrics Lyrics) Lyrics { + lyrics.Line = NormalizeCueLines(lyrics.Line) + if len(lyrics.Agents) == 0 { + lyrics.Agents = nil + } + return lyrics +} + +func NormalizeCueLines(lines []Line) []Line { + if len(lines) == 0 { + return lines + } + + normalized := make([]Line, len(lines)) + copy(normalized, lines) + + for i := range normalized { + var fallbackEnd *int64 + if normalized[i].End != nil { + v := *normalized[i].End + fallbackEnd = &v + } else if i+1 < len(normalized) && normalized[i+1].Start != nil { + v := *normalized[i+1].Start + fallbackEnd = &v + } + + normalized[i] = normalizeCueLine(normalized[i], fallbackEnd) + } + + return normalized +} + +func NormalizeLineTiming(line Line) Line { + if len(line.Cue) == 0 { + return line + } + + var earliestStart *int64 + var latestEnd *int64 + for i := range line.Cue { + token := line.Cue[i] + if token.Start != nil { + if earliestStart == nil || *token.Start < *earliestStart { + v := *token.Start + earliestStart = &v + } + } + + candidateEnd := token.End + if candidateEnd == nil { + candidateEnd = token.Start + } + if candidateEnd != nil { + if latestEnd == nil || *candidateEnd > *latestEnd { + v := *candidateEnd + latestEnd = &v + } + } + } + + if line.Start == nil && earliestStart != nil { + v := *earliestStart + line.Start = &v + } + if line.End == nil && latestEnd != nil { + v := *latestEnd + line.End = &v + } + return line +} + +func normalizeCueLine(line Line, fallbackEnd *int64) Line { + if len(line.Cue) == 0 { + return line + } + + hasAnyEnd := false + for i := range line.Cue { + if line.Cue[i].End != nil { + hasAnyEnd = true + break + } + } + if !hasAnyEnd { + line.Cue = clearCueEnds(line.Cue) + return NormalizeLineTiming(line) + } + + for i := range line.Cue { + if line.Cue[i].End != nil { + continue + } + + if i+1 < len(line.Cue) && line.Cue[i+1].Start != nil { + v := *line.Cue[i+1].Start + line.Cue[i].End = &v + continue + } + + if fallbackEnd != nil { + v := *fallbackEnd + line.Cue[i].End = &v + } + } + + for i := range line.Cue { + if line.Cue[i].End == nil { + line.Cue = clearCueEnds(line.Cue) + return NormalizeLineTiming(line) + } + } + + return NormalizeLineTiming(line) +} + +func clearCueEnds(cues []Cue) []Cue { + normalized := make([]Cue, len(cues)) + copy(normalized, cues) + for i := range normalized { + normalized[i].End = nil + } + return normalized +} diff --git a/model/lyrics_test.go b/model/lyrics_test.go index 2228306d0..9aad7d968 100644 --- a/model/lyrics_test.go +++ b/model/lyrics_test.go @@ -129,8 +129,8 @@ var _ = Describe("ToLyrics", func() { Expect(line0.Start).To(Equal(&t1000)) Expect(line0.Value).To(Equal("Some lyrics here")) Expect(line0.Cue).To(Equal([]Cue{ - {Start: &t1000, End: &t1500, Value: "Some "}, - {Start: &t1500, End: &t2000, Value: "lyrics "}, + {Start: &t1000, Value: "Some "}, + {Start: &t1500, Value: "lyrics "}, {Start: &t2000, Value: "here"}, })) @@ -138,7 +138,7 @@ var _ = Describe("ToLyrics", func() { Expect(line1.Start).To(Equal(&t3000)) Expect(line1.Value).To(Equal("More words")) Expect(line1.Cue).To(Equal([]Cue{ - {Start: &t3000, End: &t3500, Value: "More "}, + {Start: &t3000, Value: "More "}, {Start: &t3500, Value: "words"}, })) }) @@ -161,7 +161,7 @@ var _ = Describe("ToLyrics", func() { t1000, t1500, t5000, t5500 := int64(1000), int64(1500), int64(5000), int64(5500) Expect(lyrics.Line[0].Cue).To(Equal([]Cue{ - {Start: &t1000, End: &t1500, Value: "Some "}, + {Start: &t1000, Value: "Some "}, {Start: &t1500, Value: "lyrics"}, })) Expect(lyrics.Line[0].Value).To(Equal("Some lyrics")) @@ -170,7 +170,7 @@ var _ = Describe("ToLyrics", func() { Expect(lyrics.Line[1].Value).To(Equal("Plain line")) Expect(lyrics.Line[2].Cue).To(Equal([]Cue{ - {Start: &t5000, End: &t5500, Value: "More "}, + {Start: &t5000, Value: "More "}, {Start: &t5500, Value: "words"}, })) Expect(lyrics.Line[2].Value).To(Equal("More words")) diff --git a/server/subsonic/helpers.go b/server/subsonic/helpers.go index 305f1818e..7545a71c0 100644 --- a/server/subsonic/helpers.go +++ b/server/subsonic/helpers.go @@ -476,14 +476,22 @@ func mapExplicitStatus(explicitStatus string) string { return "" } -// sanitizeRole strips the TTML x- prefix from role values for the API. -func sanitizeRole(role string) string { - return strings.TrimPrefix(role, "x-") -} - func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced bool) responses.StructuredLyric { lines := make([]responses.Line, len(lyrics.Line)) var cueLines []responses.CueLine + agentOrderByID := make(map[string]int, len(lyrics.Agents)) + agentRoleByID := make(map[string]string, len(lyrics.Agents)) + responseAgents := make([]responses.Agent, 0, len(lyrics.Agents)) + + for i, agent := range lyrics.Agents { + agentOrderByID[agent.ID] = i + agentRoleByID[agent.ID] = agent.Role + responseAgents = append(responseAgents, responses.Agent{ + ID: agent.ID, + Role: agent.Role, + Name: agent.Name, + }) + } for i, line := range lyrics.Line { lines[i] = responses.Line{ @@ -494,41 +502,50 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced boo continue } - // Group cues by role, preserving order of first appearance - roleOrder := make([]string, 0, 2) - cuesByRole := make(map[string][]responses.LyricCue) + agentOrder := make([]string, 0, 2) + cuesByAgent := make(map[string][]model.Cue) for _, cue := range line.Cue { if cue.Start == nil { continue } - role := sanitizeRole(cue.Role) - if _, exists := cuesByRole[role]; !exists { - roleOrder = append(roleOrder, role) + agentID := strings.TrimSpace(cue.AgentID) + if _, exists := cuesByAgent[agentID]; !exists { + agentOrder = append(agentOrder, agentID) } - cuesByRole[role] = append(cuesByRole[role], responses.LyricCue{ - Start: *cue.Start, - End: cue.End, - Value: cue.Value, - }) + cuesByAgent[agentID] = append(cuesByAgent[agentID], cue) } - // Ensure main vocals (empty role) always comes first - sort.SliceStable(roleOrder, func(i, j int) bool { - return roleOrder[i] == "" && roleOrder[j] != "" + sort.SliceStable(agentOrder, func(i, j int) bool { + leftRole := agentRoleByID[agentOrder[i]] + rightRole := agentRoleByID[agentOrder[j]] + if leftRole == "main" && rightRole != "main" { + return true + } + if rightRole == "main" && leftRole != "main" { + return false + } + + leftOrder, leftOK := agentOrderByID[agentOrder[i]] + rightOrder, rightOK := agentOrderByID[agentOrder[j]] + if leftOK && rightOK && leftOrder != rightOrder { + return leftOrder < rightOrder + } + if leftOK != rightOK { + return leftOK + } + return i < j }) - // Create a separate CueLine for each role group - for _, role := range roleOrder { - cues := cuesByRole[role] + for _, agentID := range agentOrder { cueLine := responses.CueLine{ Index: int32(i), Start: line.Start, End: line.End, Value: line.Value, - Cue: cues, + Cue: buildLyricCues(cuesByAgent[agentID], line.End), } - if role != "" { - cueLine.Role = role + if agentID != "" { + cueLine.AgentID = agentID } cueLines = append(cueLines, cueLine) } @@ -550,6 +567,9 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced boo kind = "main" } structured.Kind = kind + if len(cueLines) > 0 && len(responseAgents) > 0 { + structured.Agents = responseAgents + } } if structured.DisplayArtist == "" { @@ -562,6 +582,67 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced boo return structured } +func buildLyricCues(cues []model.Cue, lineEnd *int64) []responses.LyricCue { + if len(cues) == 0 { + return nil + } + + hasAnyEnd := false + for i := range cues { + if cues[i].End != nil { + hasAnyEnd = true + break + } + } + + normalized := make([]responses.LyricCue, 0, len(cues)) + for i := range cues { + if cues[i].Start == nil { + continue + } + + cue := responses.LyricCue{ + Start: *cues[i].Start, + Value: cues[i].Value, + } + if hasAnyEnd { + end := cues[i].End + if end == nil { + if i+1 < len(cues) && cues[i+1].Start != nil { + v := *cues[i+1].Start + end = &v + } else if lineEnd != nil { + v := *lineEnd + end = &v + } + } + if end != nil && i+1 < len(cues) && cues[i+1].Start != nil && *end > *cues[i+1].Start { + v := *cues[i+1].Start + end = &v + } + if end != nil && *end < cue.Start { + v := cue.Start + end = &v + } + cue.End = end + } + normalized = append(normalized, cue) + } + + if hasAnyEnd { + for i := range normalized { + if normalized[i].End == nil { + for j := range normalized { + normalized[j].End = nil + } + break + } + } + } + + return normalized +} + func buildLyricsList(mf *model.MediaFile, lyricsList model.LyricList, enhanced bool) *responses.LyricsList { var filtered model.LyricList if enhanced { diff --git a/server/subsonic/media_retrieval_test.go b/server/subsonic/media_retrieval_test.go index 0fdbb3854..5489492ce 100644 --- a/server/subsonic/media_retrieval_test.go +++ b/server/subsonic/media_retrieval_test.go @@ -235,6 +235,7 @@ var _ = Describe("MediaRetrievalController", func() { Expect(realLyric.Kind).To(Equal(expectedLyric.Kind)) Expect(realLyric.Lang).To(Equal(expectedLyric.Lang)) Expect(realLyric.Synced).To(Equal(expectedLyric.Synced)) + Expect(realLyric.Agents).To(Equal(expectedLyric.Agents)) if expectedLyric.Offset == nil { Expect(realLyric.Offset).To(BeNil()) @@ -259,7 +260,7 @@ var _ = Describe("MediaRetrievalController", func() { expectedCueLine := expectedLyric.CueLine[j] Expect(realCueLine.Index).To(Equal(expectedCueLine.Index)) Expect(realCueLine.Value).To(Equal(expectedCueLine.Value)) - Expect(realCueLine.Role).To(Equal(expectedCueLine.Role)) + Expect(realCueLine.AgentID).To(Equal(expectedCueLine.AgentID)) if expectedCueLine.Start == nil { Expect(realCueLine.Start).To(BeNil()) } else { @@ -542,6 +543,7 @@ var _ = Describe("MediaRetrievalController", func() { lyricsJson, err := json.Marshal(model.LyricList{ { Lang: "eng", + Agents: []model.Agent{{ID: "lead", Role: "main"}, {ID: "lead__bg", Role: "bg"}}, Synced: true, Line: []model.Line{ { @@ -550,15 +552,16 @@ var _ = Describe("MediaRetrievalController", func() { Value: "Hello echo", Cue: []model.Cue{ { - Start: &tokenStartA, - End: &tokenEndA, - Value: "Hello", + Start: &tokenStartA, + End: &tokenEndA, + Value: "Hello", + AgentID: "lead", }, { - Start: &tokenStartB, - End: &tokenEndB, - Value: "echo", - Role: "x-bg", + Start: &tokenStartB, + End: &tokenEndB, + Value: "echo", + AgentID: "lead__bg", }, }, }, @@ -586,6 +589,10 @@ var _ = Describe("MediaRetrievalController", func() { Kind: "main", Lang: "eng", Synced: true, + Agents: []responses.Agent{ + {ID: "lead", Role: "main"}, + {ID: "lead__bg", Role: "bg"}, + }, Line: []responses.Line{ { Start: &lineStart, @@ -594,10 +601,11 @@ var _ = Describe("MediaRetrievalController", func() { }, CueLine: []responses.CueLine{ { - Index: 0, - Start: &lineStart, - End: &lineEnd, - Value: "Hello echo", + Index: 0, + Start: &lineStart, + End: &lineEnd, + Value: "Hello echo", + AgentID: "lead", Cue: []responses.LyricCue{ { Start: tokenStartA, @@ -607,11 +615,11 @@ var _ = Describe("MediaRetrievalController", func() { }, }, { - Index: 0, - Start: &lineStart, - End: &lineEnd, - Value: "Hello echo", - Role: "bg", + Index: 0, + Start: &lineStart, + End: &lineEnd, + Value: "Hello echo", + AgentID: "lead__bg", Cue: []responses.LyricCue{ { Start: tokenStartB, diff --git a/server/subsonic/responses/responses.go b/server/subsonic/responses/responses.go index f5446a961..344dd9999 100644 --- a/server/subsonic/responses/responses.go +++ b/server/subsonic/responses/responses.go @@ -543,13 +543,19 @@ type LyricCue struct { Value string `xml:",chardata" json:"value"` } +type Agent struct { + ID string `xml:"id,attr" json:"id"` + Role string `xml:"role,attr" json:"role"` + Name string `xml:"name,attr,omitempty" json:"name,omitempty"` +} + type CueLine struct { - Index int32 `xml:"index,attr" json:"index"` - Start *int64 `xml:"start,attr,omitempty" json:"start,omitempty"` - End *int64 `xml:"end,attr,omitempty" json:"end,omitempty"` - Value string `xml:"value,attr,omitempty" json:"value,omitempty"` - Role string `xml:"role,attr,omitempty" json:"role,omitempty"` - Cue []LyricCue `xml:"cue,omitempty" json:"cue,omitempty"` + Index int32 `xml:"index,attr" json:"index"` + Start *int64 `xml:"start,attr,omitempty" json:"start,omitempty"` + End *int64 `xml:"end,attr,omitempty" json:"end,omitempty"` + Value string `xml:"value,attr,omitempty" json:"value,omitempty"` + AgentID string `xml:"agentId,attr,omitempty" json:"agentId,omitempty"` + Cue []LyricCue `xml:"cue,omitempty" json:"cue,omitempty"` } type StructuredLyric struct { @@ -558,6 +564,7 @@ type StructuredLyric struct { Kind string `xml:"kind,attr,omitempty" json:"kind,omitempty"` Lang string `xml:"lang,attr" json:"lang"` Line []Line `xml:"line" json:"line"` + Agents []Agent `xml:"agent,omitempty" json:"agents,omitempty"` CueLine []CueLine `xml:"cueLine,omitempty" json:"cueLine,omitempty"` Offset *int64 `xml:"offset,attr,omitempty" json:"offset,omitempty"` Synced bool `xml:"synced,attr" json:"synced"` diff --git a/tests/fixtures/test.elrc b/tests/fixtures/test.elrc new file mode 100644 index 000000000..01c3d2cdd --- /dev/null +++ b/tests/fixtures/test.elrc @@ -0,0 +1,5 @@ +[ar:ELRC Artist] +[ti:ELRC Song] +[lang:eng] +[00:01.00]<00:01.00>Lead <00:01.50>words +[00:03.00]Fallback line diff --git a/tests/fixtures/test.srt b/tests/fixtures/test.srt new file mode 100644 index 000000000..3c9c09a39 --- /dev/null +++ b/tests/fixtures/test.srt @@ -0,0 +1,7 @@ +1 +00:00:18,800 --> 00:00:22,800 +We're from subtitles + +2 +00:00:22,801 --> 00:00:26,000 +Another subtitle line diff --git a/ui/src/audioplayer/PlayerToolbar.jsx b/ui/src/audioplayer/PlayerToolbar.jsx index 869df475d..8487b0655 100644 --- a/ui/src/audioplayer/PlayerToolbar.jsx +++ b/ui/src/audioplayer/PlayerToolbar.jsx @@ -108,7 +108,7 @@ const PlayerToolbar = ({ ) const toggleLyricsButton = ( - + Array.isArray(lyric.line) && lyric.line.some((line) => Number.isFinite(Number(line.start))) +const preferTimedLyrics = (lyrics) => { + const timed = lyrics.filter(hasTimedLines) + return timed.length > 0 ? timed : lyrics +} + const normalizeToken = (token) => { if (!token) { return null @@ -77,10 +82,38 @@ const normalizeToken = (token) => { } } -const normalizeCueLine = (cueLine, fallbackIndex) => { +const buildAgentLookup = (structuredLyric) => { + const lookup = new Map() + const agents = Array.isArray(structuredLyric?.agents) ? structuredLyric.agents : [] + for (const agent of agents) { + const id = typeof agent?.id === 'string' ? agent.id : '' + if (!id || lookup.has(id)) { + continue + } + lookup.set(id, { + id, + role: typeof agent?.role === 'string' ? agent.role : '', + name: typeof agent?.name === 'string' ? agent.name : '', + }) + } + return lookup +} + +const deriveUiRole = (agent) => { + if (!agent?.role || agent.role === 'main') { + return '' + } + return agent.role +} + +const normalizeCueLine = (cueLine, fallbackIndex, agentLookup) => { const index = Number.isFinite(Number(cueLine?.index)) ? Number(cueLine.index) : fallbackIndex + const agentId = typeof cueLine?.agentId === 'string' ? cueLine.agentId : '' + const agent = agentId ? agentLookup.get(agentId) || null : null + const fallbackRole = + typeof cueLine?.role === 'string' ? cueLine.role : '' const tokens = sortTokensByStart( Array.isArray(cueLine?.cue) ? cueLine.cue.map(normalizeToken).filter(Boolean) @@ -92,7 +125,10 @@ const normalizeCueLine = (cueLine, fallbackIndex) => { start: toTime(cueLine?.start), end: toTime(cueLine?.end), value: typeof cueLine?.value === 'string' ? cueLine.value : '', - role: typeof cueLine?.role === 'string' ? cueLine.role : '', + role: agent ? deriveUiRole(agent) : fallbackRole, + agentId, + agentRole: agent?.role || fallbackRole, + agentName: agent?.name || '', tokens, } } @@ -194,6 +230,9 @@ const buildSyntheticWordTokens = (line, token) => { end: baseStart + (duration * (idx + 1)) / chunks.length, value: chunk, role: typeof token?.role === 'string' ? token.role : '', + agentId: typeof token?.agentId === 'string' ? token.agentId : '', + agentName: typeof token?.agentName === 'string' ? token.agentName : '', + agentRole: typeof token?.agentRole === 'string' ? token.agentRole : '', })) } @@ -240,8 +279,8 @@ export const selectLyricLayers = (structuredLyrics, preferredLanguage) => { } } - const synced = structuredLyrics.filter(hasTimedLines) - if (synced.length === 0) { + const available = structuredLyrics.filter(hasStructuredLyricContent) + if (available.length === 0) { return { main: null, translation: null, @@ -255,22 +294,25 @@ export const selectLyricLayers = (structuredLyrics, preferredLanguage) => { [LYRIC_KIND_PRONUNCIATION]: [], } - for (const lyric of synced) { + for (const lyric of available) { grouped[normalizeLyricKind(lyric?.kind)].push(lyric) } const mainCandidates = grouped[LYRIC_KIND_MAIN].length ? grouped[LYRIC_KIND_MAIN] - : synced + : available return { - main: pickLyricByLanguage(mainCandidates, preferredLanguage), + main: pickLyricByLanguage( + preferTimedLyrics(mainCandidates), + preferredLanguage, + ), translation: pickLyricByLanguage( - grouped[LYRIC_KIND_TRANSLATION], + preferTimedLyrics(grouped[LYRIC_KIND_TRANSLATION]), preferredLanguage, ), pronunciation: pickLyricByLanguage( - grouped[LYRIC_KIND_PRONUNCIATION], + preferTimedLyrics(grouped[LYRIC_KIND_PRONUNCIATION]), preferredLanguage, ), } @@ -316,6 +358,7 @@ export const buildKaraokeLines = (structuredLyric) => { return [] } + const agentLookup = buildAgentLookup(structuredLyric) const baseLines = Array.isArray(structuredLyric.line) ? structuredLyric.line : [] @@ -328,12 +371,19 @@ export const buildKaraokeLines = (structuredLyric) => { ? (() => { const normalizedCueLines = rawCueLines.map( (cueLine, fallbackIndex) => { - const normalized = normalizeCueLine(cueLine, fallbackIndex) + const normalized = normalizeCueLine( + cueLine, + fallbackIndex, + agentLookup, + ) return { ...normalized, tokens: normalized.tokens.map((token) => ({ ...token, role: normalized.role, + agentId: normalized.agentId, + agentName: normalized.agentName, + agentRole: normalized.agentRole, })), } }, @@ -366,6 +416,9 @@ export const buildKaraokeLines = (structuredLyric) => { start: first.start ?? toTime(baseLine.start) ?? fallbackStart, end: first.end ?? toTime(baseLine.end) ?? fallbackEnd, value, + agentId: first.agentId, + agentName: first.agentName, + agentRole: first.agentRole, tokens, } }) diff --git a/ui/src/audioplayer/lyrics.test.js b/ui/src/audioplayer/lyrics.test.js index 6cb3a1b87..3a5f83b2d 100644 --- a/ui/src/audioplayer/lyrics.test.js +++ b/ui/src/audioplayer/lyrics.test.js @@ -124,6 +124,49 @@ describe('lyrics helpers', () => { expect(layers.pronunciation).toBeNull() }) + it('falls back to unsynced lyric content when no timed track exists', () => { + const layers = selectLyricLayers( + [ + { + lang: 'eng', + synced: false, + line: [{ value: 'Plain embedded lyric' }], + }, + ], + 'eng', + ) + + expect(layers.main).toEqual({ + lang: 'eng', + synced: false, + line: [{ value: 'Plain embedded lyric' }], + }) + }) + + it('still prefers timed lyrics when both timed and untimed tracks exist', () => { + const layers = selectLyricLayers( + [ + { + lang: 'eng', + synced: false, + line: [{ value: 'Plain lyric' }], + }, + { + lang: 'eng', + synced: true, + line: [{ start: 1000, value: 'Timed lyric' }], + }, + ], + 'eng', + ) + + expect(layers.main).toEqual({ + lang: 'eng', + synced: true, + line: [{ start: 1000, value: 'Timed lyric' }], + }) + }) + it('matches layer line by timing for the active main line', () => { const mainLines = [ { index: 0, start: 1000, end: 1800, value: 'Line A', tokens: [] }, @@ -200,43 +243,88 @@ describe('lyrics helpers', () => { expect(getPreferredLyricLanguage()).toBe('pt-BR') }) - it('builds karaoke lines from cueLine payload', () => { + it('builds karaoke lines from agent-based cueLine payload', () => { + const lines = buildKaraokeLines({ + lang: 'eng', + synced: true, + line: [{ start: 1000, end: 3000, value: 'Hello world' }], + agents: [ + { id: 'lead', role: 'main', name: 'Lead Vocal' }, + { id: 'backing', role: 'bg' }, + ], + cueLine: [ + { + index: 0, + start: 1000, + end: 3000, + value: 'Hello world', + agentId: 'lead', + cue: [{ start: 1000, end: 1500, value: 'Hello' }], + }, + { + index: 0, + start: 1000, + end: 3000, + value: 'Hello world', + agentId: 'backing', + cue: [{ start: 2000, end: 2500, value: 'world' }], + }, + ], + }) + + expect(lines).toEqual([ + { + agentId: 'lead', + agentName: 'Lead Vocal', + agentRole: 'main', + index: 0, + start: 1000, + end: 3000, + value: 'Hello world', + tokens: [ + { + start: 1000, + end: 1500, + value: 'Hello', + role: '', + agentId: 'lead', + agentName: 'Lead Vocal', + agentRole: 'main', + }, + { + start: 2000, + end: 2500, + value: 'world', + role: 'bg', + agentId: 'backing', + agentName: '', + agentRole: 'bg', + }, + ], + }, + ]) + }) + + it('falls back to legacy cueLine role values when agents are absent', () => { const lines = buildKaraokeLines({ lang: 'eng', synced: true, line: [{ start: 1000, end: 3000, value: 'Hello world' }], cueLine: [ - { - index: 0, - start: 1000, - end: 3000, - value: 'Hello world', - role: '', - cue: [{ start: 1000, end: 1500, value: 'Hello' }], - }, { index: 0, start: 1000, end: 3000, value: 'Hello world', role: 'bg', - cue: [{ start: 2000, end: 2500, value: 'world' }], + cue: [{ start: 1000, end: 1500, value: 'Hello' }], }, ], }) - expect(lines).toEqual([ - { - index: 0, - start: 1000, - end: 3000, - value: 'Hello world', - tokens: [ - { start: 1000, end: 1500, value: 'Hello', role: '' }, - { start: 2000, end: 2500, value: 'world', role: 'bg' }, - ], - }, - ]) + expect(lines[0].tokens[0].role).toBe('bg') + expect(lines[0].tokens[0].agentId).toBe('') + expect(lines[0].tokens[0].agentName).toBe('') }) it('sorts token timing by start to keep playback stable', () => {