mirror of
https://github.com/navidrome/navidrome.git
synced 2026-05-03 06:51:16 +00:00
feat(lyrics): support agent-based lyric layers
This commit is contained in:
parent
ff40c030d9
commit
d6a684e60e
@ -677,7 +677,7 @@ func setViperDefaults() {
|
||||
viper.SetDefault("coverartquality", 75)
|
||||
viper.SetDefault("artistartpriority", "artist.*, album/artist.*, external")
|
||||
viper.SetDefault("discartpriority", "disc*.*, cd*.*, cover.*, folder.*, front.*, discsubtitle, embedded")
|
||||
viper.SetDefault("lyricspriority", ".lrc,.ttml,.txt,embedded")
|
||||
viper.SetDefault("lyricspriority", ".ttml,.elrc,.lrc,.srt,.txt,embedded")
|
||||
viper.SetDefault("enablegravatar", false)
|
||||
viper.SetDefault("enablefavourites", true)
|
||||
viper.SetDefault("enablestarrating", true)
|
||||
|
||||
@ -44,6 +44,36 @@ var _ = Describe("sources", func() {
|
||||
},
|
||||
}
|
||||
|
||||
elrcLyrics := model.LyricList{
|
||||
model.Lyrics{
|
||||
DisplayArtist: "ELRC Artist",
|
||||
DisplayTitle: "ELRC Song",
|
||||
Lang: "eng",
|
||||
Line: []model.Line{
|
||||
{
|
||||
Start: gg.P(int64(1000)),
|
||||
End: gg.P(int64(1500)),
|
||||
Value: "Lead words",
|
||||
Cue: []model.Cue{
|
||||
{
|
||||
Start: gg.P(int64(1000)),
|
||||
Value: "Lead ",
|
||||
},
|
||||
{
|
||||
Start: gg.P(int64(1500)),
|
||||
Value: "words",
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Start: gg.P(int64(3000)),
|
||||
Value: "Fallback line",
|
||||
},
|
||||
},
|
||||
Synced: true,
|
||||
},
|
||||
}
|
||||
|
||||
ttmlLyrics := model.LyricList{
|
||||
model.Lyrics{
|
||||
Kind: "main",
|
||||
@ -88,6 +118,25 @@ var _ = Describe("sources", func() {
|
||||
},
|
||||
}
|
||||
|
||||
srtLyrics := model.LyricList{
|
||||
model.Lyrics{
|
||||
Lang: "xxx",
|
||||
Line: []model.Line{
|
||||
{
|
||||
Start: gg.P(int64(18800)),
|
||||
End: gg.P(int64(22800)),
|
||||
Value: "We're from subtitles",
|
||||
},
|
||||
{
|
||||
Start: gg.P(int64(22801)),
|
||||
End: gg.P(int64(26000)),
|
||||
Value: "Another subtitle line",
|
||||
},
|
||||
},
|
||||
Synced: true,
|
||||
},
|
||||
}
|
||||
|
||||
BeforeEach(func() {
|
||||
DeferCleanup(configtest.SetupConfig())
|
||||
|
||||
@ -109,8 +158,10 @@ var _ = Describe("sources", func() {
|
||||
},
|
||||
Entry("embedded > lrc > txt", "embedded,.lrc,.txt", embeddedLyrics),
|
||||
Entry("lrc > embedded > txt", ".lrc,embedded,.txt", syncedLyrics),
|
||||
Entry("elrc > lrc > embedded", ".elrc,.lrc,embedded", elrcLyrics),
|
||||
Entry("srt > txt > embedded", ".srt,.txt,embedded", srtLyrics),
|
||||
Entry("txt > lrc > embedded", ".txt,.lrc,embedded", unsyncedLyrics),
|
||||
Entry("ttml > lrc > embedded", ".ttml,.lrc,embedded", ttmlLyrics))
|
||||
Entry("ttml > elrc > lrc > srt > embedded", ".ttml,.elrc,.lrc,.srt,embedded", ttmlLyrics))
|
||||
|
||||
Context("Errors", func() {
|
||||
var RegularUserContext = XContext
|
||||
|
||||
@ -38,13 +38,20 @@ func fromExternalFile(ctx context.Context, mf *model.MediaFile, suffix string) (
|
||||
}
|
||||
|
||||
var list model.LyricList
|
||||
if strings.EqualFold(suffix, ".ttml") {
|
||||
switch {
|
||||
case strings.EqualFold(suffix, ".ttml"):
|
||||
list, err = parseTTML(contents)
|
||||
if err != nil {
|
||||
log.Error(ctx, "error parsing ttml external file", "path", externalLyric, err)
|
||||
return nil, err
|
||||
}
|
||||
} else {
|
||||
case strings.EqualFold(suffix, ".srt"):
|
||||
list, err = parseSRT(contents)
|
||||
if err != nil {
|
||||
log.Error(ctx, "error parsing srt external file", "path", externalLyric, err)
|
||||
return nil, err
|
||||
}
|
||||
default:
|
||||
lyrics, err := model.ToLyrics("xxx", string(contents))
|
||||
if err != nil {
|
||||
log.Error(ctx, "error parsing lyric external file", "path", externalLyric, err)
|
||||
|
||||
@ -106,10 +106,10 @@ var _ = Describe("sources", func() {
|
||||
Expect(lyrics[0].Line[0].Cue).To(HaveLen(3))
|
||||
Expect(*lyrics[0].Line[0].Cue[0].Start).To(Equal(int64(1000)))
|
||||
Expect(lyrics[0].Line[0].Cue[0].Value).To(Equal("Some "))
|
||||
Expect(*lyrics[0].Line[0].Cue[0].End).To(Equal(int64(1500)))
|
||||
Expect(lyrics[0].Line[0].Cue[0].End).To(BeNil())
|
||||
Expect(*lyrics[0].Line[0].Cue[1].Start).To(Equal(int64(1500)))
|
||||
Expect(lyrics[0].Line[0].Cue[1].Value).To(Equal("lyrics "))
|
||||
Expect(*lyrics[0].Line[0].Cue[1].End).To(Equal(int64(2000)))
|
||||
Expect(lyrics[0].Line[0].Cue[1].End).To(BeNil())
|
||||
Expect(*lyrics[0].Line[0].Cue[2].Start).To(Equal(int64(2000)))
|
||||
Expect(lyrics[0].Line[0].Cue[2].Value).To(Equal("here"))
|
||||
Expect(lyrics[0].Line[0].Cue[2].End).To(BeNil())
|
||||
@ -125,6 +125,33 @@ var _ = Describe("sources", func() {
|
||||
Expect(lyrics[0].Line[2].Cue).To(BeNil())
|
||||
})
|
||||
|
||||
It("should return Enhanced LRC lyrics from an ELRC file", func() {
|
||||
mf := model.MediaFile{Path: "tests/fixtures/test.mp3"}
|
||||
lyrics, err := fromExternalFile(ctx, &mf, ".elrc")
|
||||
|
||||
Expect(err).To(BeNil())
|
||||
Expect(lyrics).To(HaveLen(1))
|
||||
Expect(lyrics[0].DisplayArtist).To(Equal("ELRC Artist"))
|
||||
Expect(lyrics[0].DisplayTitle).To(Equal("ELRC Song"))
|
||||
Expect(lyrics[0].Lang).To(Equal("eng"))
|
||||
Expect(lyrics[0].Synced).To(BeTrue())
|
||||
Expect(lyrics[0].Line).To(HaveLen(2))
|
||||
|
||||
Expect(lyrics[0].Line[0].Start).To(Equal(gg.P(int64(1000))))
|
||||
Expect(lyrics[0].Line[0].Value).To(Equal("Lead words"))
|
||||
Expect(lyrics[0].Line[0].Cue).To(HaveLen(2))
|
||||
Expect(*lyrics[0].Line[0].Cue[0].Start).To(Equal(int64(1000)))
|
||||
Expect(lyrics[0].Line[0].Cue[0].Value).To(Equal("Lead "))
|
||||
Expect(lyrics[0].Line[0].Cue[0].End).To(BeNil())
|
||||
Expect(*lyrics[0].Line[0].Cue[1].Start).To(Equal(int64(1500)))
|
||||
Expect(lyrics[0].Line[0].Cue[1].Value).To(Equal("words"))
|
||||
Expect(lyrics[0].Line[0].Cue[1].End).To(BeNil())
|
||||
|
||||
Expect(lyrics[0].Line[1].Start).To(Equal(gg.P(int64(3000))))
|
||||
Expect(lyrics[0].Line[1].Value).To(Equal("Fallback line"))
|
||||
Expect(lyrics[0].Line[1].Cue).To(BeNil())
|
||||
})
|
||||
|
||||
It("should return unsynchronized lyrics from a file", func() {
|
||||
mf := model.MediaFile{Path: "tests/fixtures/test.mp3"}
|
||||
lyrics, err := fromExternalFile(ctx, &mf, ".txt")
|
||||
@ -146,6 +173,31 @@ var _ = Describe("sources", func() {
|
||||
}))
|
||||
})
|
||||
|
||||
It("should return synchronized lyrics from an SRT file", func() {
|
||||
mf := model.MediaFile{Path: "tests/fixtures/test.mp3"}
|
||||
lyrics, err := fromExternalFile(ctx, &mf, ".srt")
|
||||
|
||||
Expect(err).To(BeNil())
|
||||
Expect(lyrics).To(Equal(model.LyricList{
|
||||
model.Lyrics{
|
||||
Lang: "xxx",
|
||||
Line: []model.Line{
|
||||
{
|
||||
Start: gg.P(int64(18800)),
|
||||
End: gg.P(int64(22800)),
|
||||
Value: "We're from subtitles",
|
||||
},
|
||||
{
|
||||
Start: gg.P(int64(22801)),
|
||||
End: gg.P(int64(26000)),
|
||||
Value: "Another subtitle line",
|
||||
},
|
||||
},
|
||||
Synced: true,
|
||||
},
|
||||
}))
|
||||
})
|
||||
|
||||
It("should return synchronized multilingual lyrics from a TTML file", func() {
|
||||
mf := model.MediaFile{Path: "tests/fixtures/test.mp3"}
|
||||
lyrics, err := fromExternalFile(ctx, &mf, ".ttml")
|
||||
|
||||
161
core/lyrics/srt.go
Normal file
161
core/lyrics/srt.go
Normal file
@ -0,0 +1,161 @@
|
||||
package lyrics
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/navidrome/navidrome/model"
|
||||
"github.com/navidrome/navidrome/utils/str"
|
||||
)
|
||||
|
||||
var srtTimeRegex = regexp.MustCompile(`^\s*(\d{1,2}):(\d{2}):(\d{2})[,.](\d{1,3})\s*$`)
|
||||
|
||||
func parseSRT(contents []byte) (model.LyricList, error) {
|
||||
raw := strings.ReplaceAll(string(contents), "\r\n", "\n")
|
||||
raw = strings.ReplaceAll(raw, "\r", "\n")
|
||||
|
||||
blocks := splitSRTBlocks(raw)
|
||||
lines := make([]model.Line, 0, len(blocks))
|
||||
|
||||
for _, block := range blocks {
|
||||
line, ok, err := parseSRTBlock(block)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if ok {
|
||||
lines = append(lines, line)
|
||||
}
|
||||
}
|
||||
|
||||
if len(lines) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
lyrics := model.NormalizeLyrics(model.Lyrics{
|
||||
Lang: "xxx",
|
||||
Line: lines,
|
||||
Synced: true,
|
||||
})
|
||||
return model.LyricList{lyrics}, nil
|
||||
}
|
||||
|
||||
// splitSRTBlocks breaks SRT text whose line endings are already "\n" into cue
// blocks, each trimmed of surrounding whitespace. It returns nil when the text
// holds nothing but whitespace (and an optional leading byte order mark).
//
// An empty line always ends the current block. A whitespace-only line ends it
// only when what follows looks like the start of a new cue; otherwise the line
// is kept as part of the cue text. Without that rule, files whose separator
// lines carry stray spaces would have the next cue's index and timing lines
// folded into the previous cue's text.
func splitSRTBlocks(raw string) []string {
	// A UTF-8 byte order mark is not whitespace to TrimSpace; left in place it
	// would keep the first cue's index line from being recognized.
	raw = strings.TrimSpace(strings.TrimPrefix(raw, "\ufeff"))
	if raw == "" {
		return nil
	}

	lines := strings.Split(raw, "\n")
	blocks := make([]string, 0, len(lines)/3+1)
	blockStart := 0
	emit := func(end int) {
		block := strings.TrimSpace(strings.Join(lines[blockStart:end], "\n"))
		if block != "" {
			blocks = append(blocks, block)
		}
	}

	for i, line := range lines {
		if strings.TrimSpace(line) != "" {
			continue
		}
		if line == "" || looksLikeSRTCueStart(lines[i+1:]) {
			emit(i)
			blockStart = i + 1
		}
	}
	emit(len(lines))

	return blocks
}

// looksLikeSRTCueStart reports whether the first non-blank line of lines is a
// timing line (contains "-->"), or a numeric cue index immediately followed by
// a timing line.
func looksLikeSRTCueStart(lines []string) bool {
	for i, line := range lines {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}
		if strings.Contains(line, "-->") {
			return true
		}
		isIndex := strings.Trim(line, "0123456789") == ""
		return isIndex && i+1 < len(lines) && strings.Contains(lines[i+1], "-->")
	}
	return false
}
|
||||
|
||||
func parseSRTBlock(block string) (model.Line, bool, error) {
|
||||
scanner := bytes.Split([]byte(block), []byte("\n"))
|
||||
if len(scanner) == 0 {
|
||||
return model.Line{}, false, nil
|
||||
}
|
||||
|
||||
lines := make([]string, 0, len(scanner))
|
||||
for _, line := range scanner {
|
||||
lines = append(lines, strings.TrimSpace(string(line)))
|
||||
}
|
||||
|
||||
if len(lines) == 0 {
|
||||
return model.Line{}, false, nil
|
||||
}
|
||||
|
||||
startIdx := 0
|
||||
if digitsOnly(lines[0]) {
|
||||
startIdx = 1
|
||||
}
|
||||
if startIdx >= len(lines) {
|
||||
return model.Line{}, false, nil
|
||||
}
|
||||
|
||||
timing := strings.Split(lines[startIdx], "-->")
|
||||
if len(timing) != 2 {
|
||||
return model.Line{}, false, nil
|
||||
}
|
||||
|
||||
startMs, err := parseSRTTime(timing[0])
|
||||
if err != nil {
|
||||
return model.Line{}, false, err
|
||||
}
|
||||
endMs, err := parseSRTTime(timing[1])
|
||||
if err != nil {
|
||||
return model.Line{}, false, err
|
||||
}
|
||||
|
||||
textLines := make([]string, 0, len(lines)-startIdx-1)
|
||||
for _, line := range lines[startIdx+1:] {
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
textLines = append(textLines, line)
|
||||
}
|
||||
|
||||
value := str.SanitizeText(strings.Join(textLines, "\n"))
|
||||
if value == "" {
|
||||
return model.Line{}, false, nil
|
||||
}
|
||||
|
||||
return model.Line{
|
||||
Start: &startMs,
|
||||
End: &endMs,
|
||||
Value: value,
|
||||
}, true, nil
|
||||
}
|
||||
|
||||
func parseSRTTime(value string) (int64, error) {
|
||||
match := srtTimeRegex.FindStringSubmatch(strings.TrimSpace(value))
|
||||
if match == nil {
|
||||
return 0, strconv.ErrSyntax
|
||||
}
|
||||
|
||||
hours, err := strconv.ParseInt(match[1], 10, 64)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
minutes, err := strconv.ParseInt(match[2], 10, 64)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
seconds, err := strconv.ParseInt(match[3], 10, 64)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
millis, err := strconv.ParseInt(match[4], 10, 64)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
switch len(match[4]) {
|
||||
case 1:
|
||||
millis *= 100
|
||||
case 2:
|
||||
millis *= 10
|
||||
}
|
||||
|
||||
return (((hours*60)+minutes)*60+seconds)*1000 + millis, nil
|
||||
}
|
||||
|
||||
// digitsOnly reports whether value is non-empty and made up solely of the
// ASCII digits '0' through '9'.
func digitsOnly(value string) bool {
	// Scanning bytes is sufficient: every byte of a multi-byte rune is above
	// '9', so any non-ASCII character is rejected just the same.
	for i := 0; i < len(value); i++ {
		if c := value[i]; c < '0' || c > '9' {
			return false
		}
	}
	return len(value) > 0
}
|
||||
@ -46,6 +46,7 @@ type ttmlTimingParams struct {
|
||||
type ttmlTimingContext struct {
|
||||
lang string
|
||||
role string
|
||||
agentID string
|
||||
begin int64
|
||||
hasBegin bool
|
||||
end int64
|
||||
@ -70,6 +71,12 @@ type ttmlResolvedMetadataLine struct {
|
||||
line model.Line
|
||||
}
|
||||
|
||||
// ttmlDefinedAgent is one agent declaration collected from a TTML document.
// ID is the trimmed "id" attribute, Type the lower-cased "type" attribute and
// Name the text of the first non-empty name child element.
type ttmlDefinedAgent struct {
	ID, Type, Name string
}
|
||||
|
||||
type ttmlParser struct {
|
||||
decoder *xml.Decoder
|
||||
params ttmlTimingParams
|
||||
@ -86,6 +93,8 @@ type ttmlParser struct {
|
||||
pronunciationLangOrder []string
|
||||
pronunciationEntriesByLg map[string][]ttmlMetadataEntry
|
||||
|
||||
definedAgents map[string]ttmlDefinedAgent
|
||||
|
||||
metadataSeq int
|
||||
}
|
||||
|
||||
@ -103,6 +112,7 @@ func parseTTML(contents []byte) (model.LyricList, error) {
|
||||
mainLineRefsByKey: make(map[string]ttmlLineRef),
|
||||
translationEntriesByLg: make(map[string][]ttmlMetadataEntry),
|
||||
pronunciationEntriesByLg: make(map[string][]ttmlMetadataEntry),
|
||||
definedAgents: make(map[string]ttmlDefinedAgent),
|
||||
}
|
||||
|
||||
root := ttmlTimingContext{lang: "xxx"}
|
||||
@ -140,6 +150,8 @@ func (p *ttmlParser) parseElement(start xml.StartElement, parent ttmlTimingConte
|
||||
return p.parseMetadataTrack(start, parent, ttmlLyricKindTranslation)
|
||||
case "transliteration":
|
||||
return p.parseMetadataTrack(start, parent, ttmlLyricKindPronunciation)
|
||||
case "agent":
|
||||
return p.parseAgentDefinition(start)
|
||||
}
|
||||
|
||||
ctx := p.childContext(start.Attr, parent)
|
||||
@ -234,6 +246,49 @@ func (p *ttmlParser) parseMetadataTrack(start xml.StartElement, parent ttmlTimin
|
||||
}
|
||||
}
|
||||
|
||||
func (p *ttmlParser) parseAgentDefinition(start xml.StartElement) error {
|
||||
id, ok := attrValue(start.Attr, "id")
|
||||
id = strings.TrimSpace(id)
|
||||
if !ok || id == "" {
|
||||
return p.skipElement(start)
|
||||
}
|
||||
|
||||
agent := ttmlDefinedAgent{
|
||||
ID: id,
|
||||
Type: strings.ToLower(strings.TrimSpace(attrOrEmpty(start.Attr, "type"))),
|
||||
}
|
||||
|
||||
for {
|
||||
token, err := p.decoder.Token()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
switch t := token.(type) {
|
||||
case xml.StartElement:
|
||||
if strings.EqualFold(t.Name.Local, "name") {
|
||||
name, err := p.collectElementText(t)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
name = sanitizeTTMLText(name)
|
||||
if name != "" && agent.Name == "" {
|
||||
agent.Name = name
|
||||
}
|
||||
continue
|
||||
}
|
||||
if err := p.skipElement(t); err != nil {
|
||||
return err
|
||||
}
|
||||
case xml.EndElement:
|
||||
if strings.EqualFold(t.Name.Local, start.Name.Local) {
|
||||
p.definedAgents[agent.ID] = agent
|
||||
return nil
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (p *ttmlParser) parseMetadataText(start xml.StartElement, parent ttmlTimingContext) (ttmlMetadataEntry, bool, error) {
|
||||
forKey, hasFor := attrValue(start.Attr, "for")
|
||||
forKey = strings.TrimSpace(forKey)
|
||||
@ -338,8 +393,8 @@ func (p *ttmlParser) parseInlineElement(start xml.StartElement, parent ttmlTimin
|
||||
tokenText := sanitizeTTMLText(value)
|
||||
if local == "span" && hasOwnTiming && !ctx.invalid && tokenText != "" && len(tokens) == 0 {
|
||||
parsedToken := model.Cue{
|
||||
Value: tokenText,
|
||||
Role: ctx.role,
|
||||
Value: tokenText,
|
||||
AgentID: p.resolveCueAgentID(ctx),
|
||||
}
|
||||
if ctx.hasBegin {
|
||||
startMs := ctx.begin
|
||||
@ -366,12 +421,12 @@ func (p *ttmlParser) toLyricList() model.LyricList {
|
||||
if len(lines) == 0 {
|
||||
continue
|
||||
}
|
||||
res = append(res, model.Lyrics{
|
||||
res = append(res, p.finalizeLyrics(model.Lyrics{
|
||||
Kind: ttmlLyricKindMain,
|
||||
Lang: lang,
|
||||
Line: lines,
|
||||
Synced: linesAreSynced(lines),
|
||||
})
|
||||
}))
|
||||
}
|
||||
|
||||
res = append(res, p.buildMetadataLyrics(ttmlLyricKindTranslation, p.translationLangOrder, p.translationEntriesByLg)...)
|
||||
@ -440,17 +495,168 @@ func (p *ttmlParser) buildMetadataLyrics(kind string, langOrder []string, entrie
|
||||
lines[i] = resolved[i].line
|
||||
}
|
||||
|
||||
res = append(res, model.Lyrics{
|
||||
res = append(res, p.finalizeLyrics(model.Lyrics{
|
||||
Kind: kind,
|
||||
Lang: lang,
|
||||
Line: lines,
|
||||
Synced: linesAreSynced(lines),
|
||||
})
|
||||
}))
|
||||
}
|
||||
|
||||
return res
|
||||
}
|
||||
|
||||
func (p *ttmlParser) finalizeLyrics(lyrics model.Lyrics) model.Lyrics {
|
||||
lyrics.Line = model.NormalizeCueLines(lyrics.Line)
|
||||
lyrics.Line, lyrics.Agents = p.resolveAgents(lyrics.Line)
|
||||
return model.NormalizeLyrics(lyrics)
|
||||
}
|
||||
|
||||
func (p *ttmlParser) resolveAgents(lines []model.Line) ([]model.Line, []model.Agent) {
|
||||
if len(lines) == 0 {
|
||||
return lines, nil
|
||||
}
|
||||
|
||||
normalized := model.NormalizeCueLines(lines)
|
||||
usedOrder := make([]string, 0, 4)
|
||||
usedSet := make(map[string]struct{}, 4)
|
||||
sawEmptyCue := false
|
||||
|
||||
for i := range normalized {
|
||||
for j := range normalized[i].Cue {
|
||||
agentID := strings.TrimSpace(normalized[i].Cue[j].AgentID)
|
||||
if agentID == "" {
|
||||
sawEmptyCue = true
|
||||
continue
|
||||
}
|
||||
if _, exists := usedSet[agentID]; !exists {
|
||||
usedSet[agentID] = struct{}{}
|
||||
usedOrder = append(usedOrder, agentID)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(usedOrder) == 0 {
|
||||
return normalized, nil
|
||||
}
|
||||
|
||||
mainID := ""
|
||||
for _, agentID := range usedOrder {
|
||||
role := p.baseRoleForAgent(agentID)
|
||||
if role != "bg" && role != "group" {
|
||||
mainID = agentID
|
||||
break
|
||||
}
|
||||
}
|
||||
if mainID == "" && sawEmptyCue {
|
||||
mainID = "main"
|
||||
}
|
||||
if mainID == "" {
|
||||
for _, agentID := range usedOrder {
|
||||
if p.baseRoleForAgent(agentID) != "bg" {
|
||||
mainID = agentID
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
if mainID == "" {
|
||||
mainID = usedOrder[0]
|
||||
}
|
||||
|
||||
if _, exists := usedSet[mainID]; !exists {
|
||||
usedSet[mainID] = struct{}{}
|
||||
usedOrder = append([]string{mainID}, usedOrder...)
|
||||
}
|
||||
|
||||
for i := range normalized {
|
||||
for j := range normalized[i].Cue {
|
||||
if strings.TrimSpace(normalized[i].Cue[j].AgentID) == "" {
|
||||
normalized[i].Cue[j].AgentID = mainID
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
agents := make([]model.Agent, 0, len(usedOrder))
|
||||
for _, agentID := range usedOrder {
|
||||
role := p.baseRoleForAgent(agentID)
|
||||
if agentID == mainID {
|
||||
role = "main"
|
||||
}
|
||||
agent := model.Agent{
|
||||
ID: agentID,
|
||||
Role: role,
|
||||
Name: p.agentNameForID(agentID),
|
||||
}
|
||||
agents = append(agents, agent)
|
||||
}
|
||||
|
||||
return normalized, agents
|
||||
}
|
||||
|
||||
func (p *ttmlParser) resolveCueAgentID(ctx ttmlTimingContext) string {
|
||||
agentID := strings.TrimSpace(ctx.agentID)
|
||||
if contextHasRole(ctx.role, "x-bg") {
|
||||
if agentID == "" {
|
||||
agentID = "main"
|
||||
}
|
||||
return backgroundAgentID(agentID)
|
||||
}
|
||||
return agentID
|
||||
}
|
||||
|
||||
func (p *ttmlParser) baseRoleForAgent(agentID string) string {
|
||||
if isBackgroundAgentID(agentID) {
|
||||
return "bg"
|
||||
}
|
||||
|
||||
if agent, ok := p.definedAgents[agentID]; ok {
|
||||
switch agent.Type {
|
||||
case "group":
|
||||
return "group"
|
||||
default:
|
||||
return "voice"
|
||||
}
|
||||
}
|
||||
|
||||
return "voice"
|
||||
}
|
||||
|
||||
func (p *ttmlParser) agentNameForID(agentID string) string {
|
||||
if isBackgroundAgentID(agentID) {
|
||||
baseID := strings.TrimSuffix(agentID, "__bg")
|
||||
if baseID == "main" {
|
||||
return ""
|
||||
}
|
||||
if agent, ok := p.definedAgents[baseID]; ok {
|
||||
return agent.Name
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
if agent, ok := p.definedAgents[agentID]; ok {
|
||||
return agent.Name
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
||||
|
||||
// backgroundAgentID derives the ID of the background-vocal variant of an
// agent by appending the "__bg" marker.
func backgroundAgentID(agentID string) string {
	const marker = "__bg"
	return agentID + marker
}
|
||||
|
||||
// isBackgroundAgentID reports whether agentID carries the "__bg" marker that
// identifies a background-vocal agent.
func isBackgroundAgentID(agentID string) bool {
	const marker = "__bg"
	return strings.HasSuffix(agentID, marker)
}
|
||||
|
||||
// contextHasRole reports whether role appears in roles, a whitespace-separated
// list of role names. The comparison ignores case.
func contextHasRole(roles string, role string) bool {
	// Lower-case the wanted role once instead of on every comparison.
	wanted := strings.ToLower(role)
	for _, candidate := range strings.Fields(strings.ToLower(roles)) {
		if candidate == wanted {
			return true
		}
	}
	return false
}
|
||||
|
||||
func (p *ttmlParser) addMainLine(lang string, lineKey string, line model.Line) {
|
||||
lang = normalizeTTMLLang(lang)
|
||||
if _, ok := p.mainLinesByLang[lang]; !ok {
|
||||
@ -495,6 +701,9 @@ func (p *ttmlParser) childContext(attrs []xml.Attr, parent ttmlTimingContext) tt
|
||||
if lang, ok := attrValue(attrs, "lang"); ok {
|
||||
ctx.lang = normalizeTTMLLang(lang)
|
||||
}
|
||||
if agentID, ok := attrValue(attrs, "agent"); ok {
|
||||
ctx.agentID = strings.TrimSpace(agentID)
|
||||
}
|
||||
if role, ok := attrValue(attrs, "role"); ok {
|
||||
role = strings.TrimSpace(role)
|
||||
if role != "" {
|
||||
@ -805,6 +1014,55 @@ func attrValue(attrs []xml.Attr, key string) (string, bool) {
|
||||
return "", false
|
||||
}
|
||||
|
||||
func attrOrEmpty(attrs []xml.Attr, key string) string {
|
||||
value, _ := attrValue(attrs, key)
|
||||
return value
|
||||
}
|
||||
|
||||
func (p *ttmlParser) collectElementText(start xml.StartElement) (string, error) {
|
||||
var text strings.Builder
|
||||
|
||||
for {
|
||||
token, err := p.decoder.Token()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
switch t := token.(type) {
|
||||
case xml.StartElement:
|
||||
value, err := p.collectElementText(t)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
text.WriteString(value)
|
||||
case xml.EndElement:
|
||||
if strings.EqualFold(t.Name.Local, start.Name.Local) {
|
||||
return text.String(), nil
|
||||
}
|
||||
case xml.CharData:
|
||||
text.WriteString(string(t))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (p *ttmlParser) skipElement(_ xml.StartElement) error {
|
||||
depth := 1
|
||||
for depth > 0 {
|
||||
token, err := p.decoder.Token()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
switch token.(type) {
|
||||
case xml.StartElement:
|
||||
depth++
|
||||
case xml.EndElement:
|
||||
depth--
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func normalizeTTMLLang(lang string) string {
|
||||
lang = strings.ToLower(strings.TrimSpace(lang))
|
||||
if lang == "" {
|
||||
@ -840,42 +1098,7 @@ func linesAreSynced(lines []model.Line) bool {
|
||||
}
|
||||
|
||||
func hydrateLineTimingFromTokens(line model.Line) model.Line {
|
||||
if len(line.Cue) == 0 {
|
||||
return line
|
||||
}
|
||||
|
||||
var earliestStart *int64
|
||||
var latestEnd *int64
|
||||
for i := range line.Cue {
|
||||
token := line.Cue[i]
|
||||
if token.Start != nil {
|
||||
if earliestStart == nil || *token.Start < *earliestStart {
|
||||
v := *token.Start
|
||||
earliestStart = &v
|
||||
}
|
||||
}
|
||||
|
||||
candidateEnd := token.End
|
||||
if candidateEnd == nil {
|
||||
candidateEnd = token.Start
|
||||
}
|
||||
if candidateEnd != nil {
|
||||
if latestEnd == nil || *candidateEnd > *latestEnd {
|
||||
v := *candidateEnd
|
||||
latestEnd = &v
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if line.Start == nil && earliestStart != nil {
|
||||
v := *earliestStart
|
||||
line.Start = &v
|
||||
}
|
||||
if line.End == nil && latestEnd != nil {
|
||||
v := *latestEnd
|
||||
line.End = &v
|
||||
}
|
||||
return line
|
||||
return model.NormalizeLineTiming(line)
|
||||
}
|
||||
|
||||
func max(v float64, fallback float64) float64 {
|
||||
|
||||
@ -129,6 +129,10 @@ var _ = Describe("parseTTML", func() {
|
||||
list, err := parseTTML(content)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(list).To(HaveLen(1))
|
||||
Expect(list[0].Agents).To(Equal([]model.Agent{
|
||||
{ID: "main", Role: "main"},
|
||||
{ID: "main__bg", Role: "bg"},
|
||||
}))
|
||||
Expect(list[0].Line).To(HaveLen(1))
|
||||
|
||||
line := list[0].Line[0]
|
||||
@ -137,9 +141,41 @@ var _ = Describe("parseTTML", func() {
|
||||
Expect(line.End).To(Equal(gg.P(int64(3000))))
|
||||
Expect(line.Cue).To(HaveLen(3))
|
||||
|
||||
Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(1000)), End: gg.P(int64(1400)), Value: "He"}))
|
||||
Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(1400)), End: gg.P(int64(1800)), Value: "llo"}))
|
||||
Expect(line.Cue[2]).To(Equal(model.Cue{Start: gg.P(int64(2000)), End: gg.P(int64(2500)), Value: "echo", Role: "x-bg"}))
|
||||
Expect(line.Cue[0]).To(Equal(model.Cue{Start: gg.P(int64(1000)), End: gg.P(int64(1400)), Value: "He", AgentID: "main"}))
|
||||
Expect(line.Cue[1]).To(Equal(model.Cue{Start: gg.P(int64(1400)), End: gg.P(int64(1800)), Value: "llo", AgentID: "main"}))
|
||||
Expect(line.Cue[2]).To(Equal(model.Cue{Start: gg.P(int64(2000)), End: gg.P(int64(2500)), Value: "echo", AgentID: "main__bg"}))
|
||||
})
|
||||
|
||||
It("should parse named TTML agents into main, voice, and group roles", func() {
|
||||
content := []byte(`<?xml version="1.0" encoding="UTF-8"?>
|
||||
<tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttm="http://www.w3.org/ns/ttml#metadata">
|
||||
<head>
|
||||
<metadata>
|
||||
<ttm:agent xml:id="v1" type="person"><ttm:name>Chris Martin</ttm:name></ttm:agent>
|
||||
<ttm:agent xml:id="v2" type="person"><ttm:name>Jin</ttm:name></ttm:agent>
|
||||
<ttm:agent xml:id="v1000" type="group"><ttm:name>All</ttm:name></ttm:agent>
|
||||
</metadata>
|
||||
</head>
|
||||
<body xml:lang="eng">
|
||||
<div>
|
||||
<p begin="1s" end="2s" ttm:agent="v1"><span begin="1s" end="1.5s">You</span></p>
|
||||
<p begin="2s" end="3s" ttm:agent="v2"><span begin="2s" end="2.5s">and</span></p>
|
||||
<p begin="3s" end="4s" ttm:agent="v1000"><span begin="3s" end="3.5s">All</span></p>
|
||||
</div>
|
||||
</body>
|
||||
</tt>`)
|
||||
|
||||
list, err := parseTTML(content)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(list).To(HaveLen(1))
|
||||
Expect(list[0].Agents).To(Equal([]model.Agent{
|
||||
{ID: "v1", Role: "main", Name: "Chris Martin"},
|
||||
{ID: "v2", Role: "voice", Name: "Jin"},
|
||||
{ID: "v1000", Role: "group", Name: "All"},
|
||||
}))
|
||||
Expect(list[0].Line[0].Cue[0].AgentID).To(Equal("v1"))
|
||||
Expect(list[0].Line[1].Cue[0].AgentID).To(Equal("v2"))
|
||||
Expect(list[0].Line[2].Cue[0].AgentID).To(Equal("v1000"))
|
||||
})
|
||||
})
|
||||
|
||||
|
||||
160
model/lyrics.go
160
model/lyrics.go
@ -12,10 +12,16 @@ import (
|
||||
)
|
||||
|
||||
type Cue struct {
|
||||
Start *int64 `structs:"start,omitempty" json:"start,omitempty"`
|
||||
End *int64 `structs:"end,omitempty" json:"end,omitempty"`
|
||||
Value string `structs:"value" json:"value"`
|
||||
Role string `structs:"role,omitempty" json:"role,omitempty"`
|
||||
Start *int64 `structs:"start,omitempty" json:"start,omitempty"`
|
||||
End *int64 `structs:"end,omitempty" json:"end,omitempty"`
|
||||
Value string `structs:"value" json:"value"`
|
||||
AgentID string `structs:"agentId,omitempty" json:"agentId,omitempty"`
|
||||
}
|
||||
|
||||
// Agent describes a performer that the cues of a lyrics entry can refer to.
type Agent struct {
	// ID is the identifier that cues carry in their AgentID field.
	ID string `structs:"id" json:"id"`
	// Role classifies the agent. The TTML parser assigns "main", "voice",
	// "group" or "bg".
	Role string `structs:"role" json:"role"`
	// Name is an optional display name; it is omitted from output when empty.
	Name string `structs:"name,omitempty" json:"name,omitempty"`
}
|
||||
|
||||
type Line struct {
|
||||
@ -26,13 +32,14 @@ type Line struct {
|
||||
}
|
||||
|
||||
type Lyrics struct {
|
||||
DisplayArtist string `structs:"displayArtist,omitempty" json:"displayArtist,omitempty"`
|
||||
DisplayTitle string `structs:"displayTitle,omitempty" json:"displayTitle,omitempty"`
|
||||
Kind string `structs:"kind,omitempty" json:"kind,omitempty"`
|
||||
Lang string `structs:"lang" json:"lang"`
|
||||
Line []Line `structs:"line" json:"line"`
|
||||
Offset *int64 `structs:"offset,omitempty" json:"offset,omitempty"`
|
||||
Synced bool `structs:"synced" json:"synced"`
|
||||
DisplayArtist string `structs:"displayArtist,omitempty" json:"displayArtist,omitempty"`
|
||||
DisplayTitle string `structs:"displayTitle,omitempty" json:"displayTitle,omitempty"`
|
||||
Kind string `structs:"kind,omitempty" json:"kind,omitempty"`
|
||||
Lang string `structs:"lang" json:"lang"`
|
||||
Agents []Agent `structs:"agents,omitempty" json:"agents,omitempty"`
|
||||
Line []Line `structs:"line" json:"line"`
|
||||
Offset *int64 `structs:"offset,omitempty" json:"offset,omitempty"`
|
||||
Synced bool `structs:"synced" json:"synced"`
|
||||
}
|
||||
|
||||
// support the standard [mm:ss.mm], as well as [hh:*] and [*.mmm]
|
||||
@ -199,7 +206,7 @@ func ToLyrics(language, text string) (*Lyrics, error) {
|
||||
DisplayArtist: artist,
|
||||
DisplayTitle: title,
|
||||
Lang: language,
|
||||
Line: structuredLines,
|
||||
Line: NormalizeCueLines(structuredLines),
|
||||
Offset: offset,
|
||||
Synced: synced,
|
||||
}
|
||||
@ -265,11 +272,6 @@ func parseEnhancedCues(text string) []Cue {
|
||||
Start: &start,
|
||||
Value: seg.text,
|
||||
}
|
||||
// Derive End from the next cue's Start
|
||||
if i+1 < len(segments) {
|
||||
end := segments[i+1].start
|
||||
cues[i].End = &end
|
||||
}
|
||||
}
|
||||
return cues
|
||||
}
|
||||
@ -338,3 +340,127 @@ func parseTime(line string, match []int) (int64, error) {
|
||||
}
|
||||
|
||||
type LyricList []Lyrics
|
||||
|
||||
func NormalizeLyrics(lyrics Lyrics) Lyrics {
|
||||
lyrics.Line = NormalizeCueLines(lyrics.Line)
|
||||
if len(lyrics.Agents) == 0 {
|
||||
lyrics.Agents = nil
|
||||
}
|
||||
return lyrics
|
||||
}
|
||||
|
||||
func NormalizeCueLines(lines []Line) []Line {
|
||||
if len(lines) == 0 {
|
||||
return lines
|
||||
}
|
||||
|
||||
normalized := make([]Line, len(lines))
|
||||
copy(normalized, lines)
|
||||
|
||||
for i := range normalized {
|
||||
var fallbackEnd *int64
|
||||
if normalized[i].End != nil {
|
||||
v := *normalized[i].End
|
||||
fallbackEnd = &v
|
||||
} else if i+1 < len(normalized) && normalized[i+1].Start != nil {
|
||||
v := *normalized[i+1].Start
|
||||
fallbackEnd = &v
|
||||
}
|
||||
|
||||
normalized[i] = normalizeCueLine(normalized[i], fallbackEnd)
|
||||
}
|
||||
|
||||
return normalized
|
||||
}
|
||||
|
||||
func NormalizeLineTiming(line Line) Line {
|
||||
if len(line.Cue) == 0 {
|
||||
return line
|
||||
}
|
||||
|
||||
var earliestStart *int64
|
||||
var latestEnd *int64
|
||||
for i := range line.Cue {
|
||||
token := line.Cue[i]
|
||||
if token.Start != nil {
|
||||
if earliestStart == nil || *token.Start < *earliestStart {
|
||||
v := *token.Start
|
||||
earliestStart = &v
|
||||
}
|
||||
}
|
||||
|
||||
candidateEnd := token.End
|
||||
if candidateEnd == nil {
|
||||
candidateEnd = token.Start
|
||||
}
|
||||
if candidateEnd != nil {
|
||||
if latestEnd == nil || *candidateEnd > *latestEnd {
|
||||
v := *candidateEnd
|
||||
latestEnd = &v
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if line.Start == nil && earliestStart != nil {
|
||||
v := *earliestStart
|
||||
line.Start = &v
|
||||
}
|
||||
if line.End == nil && latestEnd != nil {
|
||||
v := *latestEnd
|
||||
line.End = &v
|
||||
}
|
||||
return line
|
||||
}
|
||||
|
||||
func normalizeCueLine(line Line, fallbackEnd *int64) Line {
|
||||
if len(line.Cue) == 0 {
|
||||
return line
|
||||
}
|
||||
|
||||
hasAnyEnd := false
|
||||
for i := range line.Cue {
|
||||
if line.Cue[i].End != nil {
|
||||
hasAnyEnd = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !hasAnyEnd {
|
||||
line.Cue = clearCueEnds(line.Cue)
|
||||
return NormalizeLineTiming(line)
|
||||
}
|
||||
|
||||
for i := range line.Cue {
|
||||
if line.Cue[i].End != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
if i+1 < len(line.Cue) && line.Cue[i+1].Start != nil {
|
||||
v := *line.Cue[i+1].Start
|
||||
line.Cue[i].End = &v
|
||||
continue
|
||||
}
|
||||
|
||||
if fallbackEnd != nil {
|
||||
v := *fallbackEnd
|
||||
line.Cue[i].End = &v
|
||||
}
|
||||
}
|
||||
|
||||
for i := range line.Cue {
|
||||
if line.Cue[i].End == nil {
|
||||
line.Cue = clearCueEnds(line.Cue)
|
||||
return NormalizeLineTiming(line)
|
||||
}
|
||||
}
|
||||
|
||||
return NormalizeLineTiming(line)
|
||||
}
|
||||
|
||||
func clearCueEnds(cues []Cue) []Cue {
|
||||
normalized := make([]Cue, len(cues))
|
||||
copy(normalized, cues)
|
||||
for i := range normalized {
|
||||
normalized[i].End = nil
|
||||
}
|
||||
return normalized
|
||||
}
|
||||
|
||||
@ -129,8 +129,8 @@ var _ = Describe("ToLyrics", func() {
|
||||
Expect(line0.Start).To(Equal(&t1000))
|
||||
Expect(line0.Value).To(Equal("Some lyrics here"))
|
||||
Expect(line0.Cue).To(Equal([]Cue{
|
||||
{Start: &t1000, End: &t1500, Value: "Some "},
|
||||
{Start: &t1500, End: &t2000, Value: "lyrics "},
|
||||
{Start: &t1000, Value: "Some "},
|
||||
{Start: &t1500, Value: "lyrics "},
|
||||
{Start: &t2000, Value: "here"},
|
||||
}))
|
||||
|
||||
@ -138,7 +138,7 @@ var _ = Describe("ToLyrics", func() {
|
||||
Expect(line1.Start).To(Equal(&t3000))
|
||||
Expect(line1.Value).To(Equal("More words"))
|
||||
Expect(line1.Cue).To(Equal([]Cue{
|
||||
{Start: &t3000, End: &t3500, Value: "More "},
|
||||
{Start: &t3000, Value: "More "},
|
||||
{Start: &t3500, Value: "words"},
|
||||
}))
|
||||
})
|
||||
@ -161,7 +161,7 @@ var _ = Describe("ToLyrics", func() {
|
||||
t1000, t1500, t5000, t5500 := int64(1000), int64(1500), int64(5000), int64(5500)
|
||||
|
||||
Expect(lyrics.Line[0].Cue).To(Equal([]Cue{
|
||||
{Start: &t1000, End: &t1500, Value: "Some "},
|
||||
{Start: &t1000, Value: "Some "},
|
||||
{Start: &t1500, Value: "lyrics"},
|
||||
}))
|
||||
Expect(lyrics.Line[0].Value).To(Equal("Some lyrics"))
|
||||
@ -170,7 +170,7 @@ var _ = Describe("ToLyrics", func() {
|
||||
Expect(lyrics.Line[1].Value).To(Equal("Plain line"))
|
||||
|
||||
Expect(lyrics.Line[2].Cue).To(Equal([]Cue{
|
||||
{Start: &t5000, End: &t5500, Value: "More "},
|
||||
{Start: &t5000, Value: "More "},
|
||||
{Start: &t5500, Value: "words"},
|
||||
}))
|
||||
Expect(lyrics.Line[2].Value).To(Equal("More words"))
|
||||
|
||||
@ -476,14 +476,22 @@ func mapExplicitStatus(explicitStatus string) string {
|
||||
return ""
|
||||
}
|
||||
|
||||
// sanitizeRole removes a single leading TTML "x-" prefix from a role value
// before it is exposed through the API. Roles without the prefix are returned
// as-is.
func sanitizeRole(role string) string {
	const extensionPrefix = "x-"
	if !strings.HasPrefix(role, extensionPrefix) {
		return role
	}
	return role[len(extensionPrefix):]
}
|
||||
|
||||
func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced bool) responses.StructuredLyric {
|
||||
lines := make([]responses.Line, len(lyrics.Line))
|
||||
var cueLines []responses.CueLine
|
||||
agentOrderByID := make(map[string]int, len(lyrics.Agents))
|
||||
agentRoleByID := make(map[string]string, len(lyrics.Agents))
|
||||
responseAgents := make([]responses.Agent, 0, len(lyrics.Agents))
|
||||
|
||||
for i, agent := range lyrics.Agents {
|
||||
agentOrderByID[agent.ID] = i
|
||||
agentRoleByID[agent.ID] = agent.Role
|
||||
responseAgents = append(responseAgents, responses.Agent{
|
||||
ID: agent.ID,
|
||||
Role: agent.Role,
|
||||
Name: agent.Name,
|
||||
})
|
||||
}
|
||||
|
||||
for i, line := range lyrics.Line {
|
||||
lines[i] = responses.Line{
|
||||
@ -494,41 +502,50 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced boo
|
||||
continue
|
||||
}
|
||||
|
||||
// Group cues by role, preserving order of first appearance
|
||||
roleOrder := make([]string, 0, 2)
|
||||
cuesByRole := make(map[string][]responses.LyricCue)
|
||||
agentOrder := make([]string, 0, 2)
|
||||
cuesByAgent := make(map[string][]model.Cue)
|
||||
for _, cue := range line.Cue {
|
||||
if cue.Start == nil {
|
||||
continue
|
||||
}
|
||||
role := sanitizeRole(cue.Role)
|
||||
if _, exists := cuesByRole[role]; !exists {
|
||||
roleOrder = append(roleOrder, role)
|
||||
agentID := strings.TrimSpace(cue.AgentID)
|
||||
if _, exists := cuesByAgent[agentID]; !exists {
|
||||
agentOrder = append(agentOrder, agentID)
|
||||
}
|
||||
cuesByRole[role] = append(cuesByRole[role], responses.LyricCue{
|
||||
Start: *cue.Start,
|
||||
End: cue.End,
|
||||
Value: cue.Value,
|
||||
})
|
||||
cuesByAgent[agentID] = append(cuesByAgent[agentID], cue)
|
||||
}
|
||||
|
||||
// Ensure main vocals (empty role) always comes first
|
||||
sort.SliceStable(roleOrder, func(i, j int) bool {
|
||||
return roleOrder[i] == "" && roleOrder[j] != ""
|
||||
sort.SliceStable(agentOrder, func(i, j int) bool {
|
||||
leftRole := agentRoleByID[agentOrder[i]]
|
||||
rightRole := agentRoleByID[agentOrder[j]]
|
||||
if leftRole == "main" && rightRole != "main" {
|
||||
return true
|
||||
}
|
||||
if rightRole == "main" && leftRole != "main" {
|
||||
return false
|
||||
}
|
||||
|
||||
leftOrder, leftOK := agentOrderByID[agentOrder[i]]
|
||||
rightOrder, rightOK := agentOrderByID[agentOrder[j]]
|
||||
if leftOK && rightOK && leftOrder != rightOrder {
|
||||
return leftOrder < rightOrder
|
||||
}
|
||||
if leftOK != rightOK {
|
||||
return leftOK
|
||||
}
|
||||
return i < j
|
||||
})
|
||||
|
||||
// Create a separate CueLine for each role group
|
||||
for _, role := range roleOrder {
|
||||
cues := cuesByRole[role]
|
||||
for _, agentID := range agentOrder {
|
||||
cueLine := responses.CueLine{
|
||||
Index: int32(i),
|
||||
Start: line.Start,
|
||||
End: line.End,
|
||||
Value: line.Value,
|
||||
Cue: cues,
|
||||
Cue: buildLyricCues(cuesByAgent[agentID], line.End),
|
||||
}
|
||||
if role != "" {
|
||||
cueLine.Role = role
|
||||
if agentID != "" {
|
||||
cueLine.AgentID = agentID
|
||||
}
|
||||
cueLines = append(cueLines, cueLine)
|
||||
}
|
||||
@ -550,6 +567,9 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced boo
|
||||
kind = "main"
|
||||
}
|
||||
structured.Kind = kind
|
||||
if len(cueLines) > 0 && len(responseAgents) > 0 {
|
||||
structured.Agents = responseAgents
|
||||
}
|
||||
}
|
||||
|
||||
if structured.DisplayArtist == "" {
|
||||
@ -562,6 +582,67 @@ func buildStructuredLyric(mf *model.MediaFile, lyrics model.Lyrics, enhanced boo
|
||||
return structured
|
||||
}
|
||||
|
||||
func buildLyricCues(cues []model.Cue, lineEnd *int64) []responses.LyricCue {
|
||||
if len(cues) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
hasAnyEnd := false
|
||||
for i := range cues {
|
||||
if cues[i].End != nil {
|
||||
hasAnyEnd = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
normalized := make([]responses.LyricCue, 0, len(cues))
|
||||
for i := range cues {
|
||||
if cues[i].Start == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
cue := responses.LyricCue{
|
||||
Start: *cues[i].Start,
|
||||
Value: cues[i].Value,
|
||||
}
|
||||
if hasAnyEnd {
|
||||
end := cues[i].End
|
||||
if end == nil {
|
||||
if i+1 < len(cues) && cues[i+1].Start != nil {
|
||||
v := *cues[i+1].Start
|
||||
end = &v
|
||||
} else if lineEnd != nil {
|
||||
v := *lineEnd
|
||||
end = &v
|
||||
}
|
||||
}
|
||||
if end != nil && i+1 < len(cues) && cues[i+1].Start != nil && *end > *cues[i+1].Start {
|
||||
v := *cues[i+1].Start
|
||||
end = &v
|
||||
}
|
||||
if end != nil && *end < cue.Start {
|
||||
v := cue.Start
|
||||
end = &v
|
||||
}
|
||||
cue.End = end
|
||||
}
|
||||
normalized = append(normalized, cue)
|
||||
}
|
||||
|
||||
if hasAnyEnd {
|
||||
for i := range normalized {
|
||||
if normalized[i].End == nil {
|
||||
for j := range normalized {
|
||||
normalized[j].End = nil
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return normalized
|
||||
}
|
||||
|
||||
func buildLyricsList(mf *model.MediaFile, lyricsList model.LyricList, enhanced bool) *responses.LyricsList {
|
||||
var filtered model.LyricList
|
||||
if enhanced {
|
||||
|
||||
@ -235,6 +235,7 @@ var _ = Describe("MediaRetrievalController", func() {
|
||||
Expect(realLyric.Kind).To(Equal(expectedLyric.Kind))
|
||||
Expect(realLyric.Lang).To(Equal(expectedLyric.Lang))
|
||||
Expect(realLyric.Synced).To(Equal(expectedLyric.Synced))
|
||||
Expect(realLyric.Agents).To(Equal(expectedLyric.Agents))
|
||||
|
||||
if expectedLyric.Offset == nil {
|
||||
Expect(realLyric.Offset).To(BeNil())
|
||||
@ -259,7 +260,7 @@ var _ = Describe("MediaRetrievalController", func() {
|
||||
expectedCueLine := expectedLyric.CueLine[j]
|
||||
Expect(realCueLine.Index).To(Equal(expectedCueLine.Index))
|
||||
Expect(realCueLine.Value).To(Equal(expectedCueLine.Value))
|
||||
Expect(realCueLine.Role).To(Equal(expectedCueLine.Role))
|
||||
Expect(realCueLine.AgentID).To(Equal(expectedCueLine.AgentID))
|
||||
if expectedCueLine.Start == nil {
|
||||
Expect(realCueLine.Start).To(BeNil())
|
||||
} else {
|
||||
@ -542,6 +543,7 @@ var _ = Describe("MediaRetrievalController", func() {
|
||||
lyricsJson, err := json.Marshal(model.LyricList{
|
||||
{
|
||||
Lang: "eng",
|
||||
Agents: []model.Agent{{ID: "lead", Role: "main"}, {ID: "lead__bg", Role: "bg"}},
|
||||
Synced: true,
|
||||
Line: []model.Line{
|
||||
{
|
||||
@ -550,15 +552,16 @@ var _ = Describe("MediaRetrievalController", func() {
|
||||
Value: "Hello echo",
|
||||
Cue: []model.Cue{
|
||||
{
|
||||
Start: &tokenStartA,
|
||||
End: &tokenEndA,
|
||||
Value: "Hello",
|
||||
Start: &tokenStartA,
|
||||
End: &tokenEndA,
|
||||
Value: "Hello",
|
||||
AgentID: "lead",
|
||||
},
|
||||
{
|
||||
Start: &tokenStartB,
|
||||
End: &tokenEndB,
|
||||
Value: "echo",
|
||||
Role: "x-bg",
|
||||
Start: &tokenStartB,
|
||||
End: &tokenEndB,
|
||||
Value: "echo",
|
||||
AgentID: "lead__bg",
|
||||
},
|
||||
},
|
||||
},
|
||||
@ -586,6 +589,10 @@ var _ = Describe("MediaRetrievalController", func() {
|
||||
Kind: "main",
|
||||
Lang: "eng",
|
||||
Synced: true,
|
||||
Agents: []responses.Agent{
|
||||
{ID: "lead", Role: "main"},
|
||||
{ID: "lead__bg", Role: "bg"},
|
||||
},
|
||||
Line: []responses.Line{
|
||||
{
|
||||
Start: &lineStart,
|
||||
@ -594,10 +601,11 @@ var _ = Describe("MediaRetrievalController", func() {
|
||||
},
|
||||
CueLine: []responses.CueLine{
|
||||
{
|
||||
Index: 0,
|
||||
Start: &lineStart,
|
||||
End: &lineEnd,
|
||||
Value: "Hello echo",
|
||||
Index: 0,
|
||||
Start: &lineStart,
|
||||
End: &lineEnd,
|
||||
Value: "Hello echo",
|
||||
AgentID: "lead",
|
||||
Cue: []responses.LyricCue{
|
||||
{
|
||||
Start: tokenStartA,
|
||||
@ -607,11 +615,11 @@ var _ = Describe("MediaRetrievalController", func() {
|
||||
},
|
||||
},
|
||||
{
|
||||
Index: 0,
|
||||
Start: &lineStart,
|
||||
End: &lineEnd,
|
||||
Value: "Hello echo",
|
||||
Role: "bg",
|
||||
Index: 0,
|
||||
Start: &lineStart,
|
||||
End: &lineEnd,
|
||||
Value: "Hello echo",
|
||||
AgentID: "lead__bg",
|
||||
Cue: []responses.LyricCue{
|
||||
{
|
||||
Start: tokenStartB,
|
||||
|
||||
@ -543,13 +543,19 @@ type LyricCue struct {
|
||||
Value string `xml:",chardata" json:"value"`
|
||||
}
|
||||
|
||||
// Agent is the response representation of a lyric agent. It is serialized
// with XML attributes and JSON keys as given by the field tags.
type Agent struct {
	// ID is the identifier of the agent; always serialized.
	ID string `xml:"id,attr" json:"id"`
	// Role is the role of the agent; always serialized.
	Role string `xml:"role,attr" json:"role"`
	// Name is an optional name; omitted from the output when empty.
	Name string `xml:"name,attr,omitempty" json:"name,omitempty"`
}
|
||||
|
||||
type CueLine struct {
|
||||
Index int32 `xml:"index,attr" json:"index"`
|
||||
Start *int64 `xml:"start,attr,omitempty" json:"start,omitempty"`
|
||||
End *int64 `xml:"end,attr,omitempty" json:"end,omitempty"`
|
||||
Value string `xml:"value,attr,omitempty" json:"value,omitempty"`
|
||||
Role string `xml:"role,attr,omitempty" json:"role,omitempty"`
|
||||
Cue []LyricCue `xml:"cue,omitempty" json:"cue,omitempty"`
|
||||
Index int32 `xml:"index,attr" json:"index"`
|
||||
Start *int64 `xml:"start,attr,omitempty" json:"start,omitempty"`
|
||||
End *int64 `xml:"end,attr,omitempty" json:"end,omitempty"`
|
||||
Value string `xml:"value,attr,omitempty" json:"value,omitempty"`
|
||||
AgentID string `xml:"agentId,attr,omitempty" json:"agentId,omitempty"`
|
||||
Cue []LyricCue `xml:"cue,omitempty" json:"cue,omitempty"`
|
||||
}
|
||||
|
||||
type StructuredLyric struct {
|
||||
@ -558,6 +564,7 @@ type StructuredLyric struct {
|
||||
Kind string `xml:"kind,attr,omitempty" json:"kind,omitempty"`
|
||||
Lang string `xml:"lang,attr" json:"lang"`
|
||||
Line []Line `xml:"line" json:"line"`
|
||||
Agents []Agent `xml:"agent,omitempty" json:"agents,omitempty"`
|
||||
CueLine []CueLine `xml:"cueLine,omitempty" json:"cueLine,omitempty"`
|
||||
Offset *int64 `xml:"offset,attr,omitempty" json:"offset,omitempty"`
|
||||
Synced bool `xml:"synced,attr" json:"synced"`
|
||||
|
||||
5
tests/fixtures/test.elrc
vendored
Normal file
5
tests/fixtures/test.elrc
vendored
Normal file
@ -0,0 +1,5 @@
|
||||
[ar:ELRC Artist]
|
||||
[ti:ELRC Song]
|
||||
[lang:eng]
|
||||
[00:01.00]<00:01.00>Lead <00:01.50>words
|
||||
[00:03.00]Fallback line
|
||||
7
tests/fixtures/test.srt
vendored
Normal file
7
tests/fixtures/test.srt
vendored
Normal file
@ -0,0 +1,7 @@
|
||||
1
|
||||
00:00:18,800 --> 00:00:22,800
|
||||
We're from subtitles
|
||||
|
||||
2
|
||||
00:00:22,801 --> 00:00:26,000
|
||||
Another subtitle line
|
||||
@ -108,7 +108,7 @@ const PlayerToolbar = ({
|
||||
)
|
||||
|
||||
const toggleLyricsButton = (
|
||||
<Tooltip title="Toggle synchronized lyrics">
|
||||
<Tooltip title="Toggle lyrics">
|
||||
<span>
|
||||
<IconButton
|
||||
size={isDesktop ? 'small' : undefined}
|
||||
|
||||
@ -62,6 +62,11 @@ const hasTimedLines = (lyric) =>
|
||||
Array.isArray(lyric.line) &&
|
||||
lyric.line.some((line) => Number.isFinite(Number(line.start)))
|
||||
|
||||
// Narrows a list of lyrics to the entries accepted by hasTimedLines.
// When no entry qualifies, the original list is returned unchanged.
const preferTimedLyrics = (lyrics) => {
  const timedOnly = lyrics.filter((lyric) => hasTimedLines(lyric))
  if (timedOnly.length === 0) {
    return lyrics
  }
  return timedOnly
}
|
||||
|
||||
const normalizeToken = (token) => {
|
||||
if (!token) {
|
||||
return null
|
||||
@ -77,10 +82,38 @@ const normalizeToken = (token) => {
|
||||
}
|
||||
}
|
||||
|
||||
const normalizeCueLine = (cueLine, fallbackIndex) => {
|
||||
// Builds a Map from agent id to { id, role, name } out of
// structuredLyric.agents. Entries without a non-empty string id are skipped,
// and the first entry wins when an id appears more than once. Non-string
// role/name values are stored as ''. A missing or non-array agents field
// yields an empty Map.
const buildAgentLookup = (structuredLyric) => {
  const stringOrEmpty = (value) => (typeof value === 'string' ? value : '')

  const source = structuredLyric?.agents
  if (!Array.isArray(source)) {
    return new Map()
  }

  return source.reduce((byId, entry) => {
    const id = stringOrEmpty(entry?.id)
    if (id !== '' && !byId.has(id)) {
      byId.set(id, {
        id,
        role: stringOrEmpty(entry?.role),
        name: stringOrEmpty(entry?.name),
      })
    }
    return byId
  }, new Map())
}
|
||||
|
||||
// Maps an agent to the role value used by the UI: an agent with no role, or
// with the role 'main', yields ''; any other role is returned as-is.
const deriveUiRole = (agent) => {
  const role = agent?.role
  return role && role !== 'main' ? role : ''
}
|
||||
|
||||
const normalizeCueLine = (cueLine, fallbackIndex, agentLookup) => {
|
||||
const index = Number.isFinite(Number(cueLine?.index))
|
||||
? Number(cueLine.index)
|
||||
: fallbackIndex
|
||||
const agentId = typeof cueLine?.agentId === 'string' ? cueLine.agentId : ''
|
||||
const agent = agentId ? agentLookup.get(agentId) || null : null
|
||||
const fallbackRole =
|
||||
typeof cueLine?.role === 'string' ? cueLine.role : ''
|
||||
const tokens = sortTokensByStart(
|
||||
Array.isArray(cueLine?.cue)
|
||||
? cueLine.cue.map(normalizeToken).filter(Boolean)
|
||||
@ -92,7 +125,10 @@ const normalizeCueLine = (cueLine, fallbackIndex) => {
|
||||
start: toTime(cueLine?.start),
|
||||
end: toTime(cueLine?.end),
|
||||
value: typeof cueLine?.value === 'string' ? cueLine.value : '',
|
||||
role: typeof cueLine?.role === 'string' ? cueLine.role : '',
|
||||
role: agent ? deriveUiRole(agent) : fallbackRole,
|
||||
agentId,
|
||||
agentRole: agent?.role || fallbackRole,
|
||||
agentName: agent?.name || '',
|
||||
tokens,
|
||||
}
|
||||
}
|
||||
@ -194,6 +230,9 @@ const buildSyntheticWordTokens = (line, token) => {
|
||||
end: baseStart + (duration * (idx + 1)) / chunks.length,
|
||||
value: chunk,
|
||||
role: typeof token?.role === 'string' ? token.role : '',
|
||||
agentId: typeof token?.agentId === 'string' ? token.agentId : '',
|
||||
agentName: typeof token?.agentName === 'string' ? token.agentName : '',
|
||||
agentRole: typeof token?.agentRole === 'string' ? token.agentRole : '',
|
||||
}))
|
||||
}
|
||||
|
||||
@ -240,8 +279,8 @@ export const selectLyricLayers = (structuredLyrics, preferredLanguage) => {
|
||||
}
|
||||
}
|
||||
|
||||
const synced = structuredLyrics.filter(hasTimedLines)
|
||||
if (synced.length === 0) {
|
||||
const available = structuredLyrics.filter(hasStructuredLyricContent)
|
||||
if (available.length === 0) {
|
||||
return {
|
||||
main: null,
|
||||
translation: null,
|
||||
@ -255,22 +294,25 @@ export const selectLyricLayers = (structuredLyrics, preferredLanguage) => {
|
||||
[LYRIC_KIND_PRONUNCIATION]: [],
|
||||
}
|
||||
|
||||
for (const lyric of synced) {
|
||||
for (const lyric of available) {
|
||||
grouped[normalizeLyricKind(lyric?.kind)].push(lyric)
|
||||
}
|
||||
|
||||
const mainCandidates = grouped[LYRIC_KIND_MAIN].length
|
||||
? grouped[LYRIC_KIND_MAIN]
|
||||
: synced
|
||||
: available
|
||||
|
||||
return {
|
||||
main: pickLyricByLanguage(mainCandidates, preferredLanguage),
|
||||
main: pickLyricByLanguage(
|
||||
preferTimedLyrics(mainCandidates),
|
||||
preferredLanguage,
|
||||
),
|
||||
translation: pickLyricByLanguage(
|
||||
grouped[LYRIC_KIND_TRANSLATION],
|
||||
preferTimedLyrics(grouped[LYRIC_KIND_TRANSLATION]),
|
||||
preferredLanguage,
|
||||
),
|
||||
pronunciation: pickLyricByLanguage(
|
||||
grouped[LYRIC_KIND_PRONUNCIATION],
|
||||
preferTimedLyrics(grouped[LYRIC_KIND_PRONUNCIATION]),
|
||||
preferredLanguage,
|
||||
),
|
||||
}
|
||||
@ -316,6 +358,7 @@ export const buildKaraokeLines = (structuredLyric) => {
|
||||
return []
|
||||
}
|
||||
|
||||
const agentLookup = buildAgentLookup(structuredLyric)
|
||||
const baseLines = Array.isArray(structuredLyric.line)
|
||||
? structuredLyric.line
|
||||
: []
|
||||
@ -328,12 +371,19 @@ export const buildKaraokeLines = (structuredLyric) => {
|
||||
? (() => {
|
||||
const normalizedCueLines = rawCueLines.map(
|
||||
(cueLine, fallbackIndex) => {
|
||||
const normalized = normalizeCueLine(cueLine, fallbackIndex)
|
||||
const normalized = normalizeCueLine(
|
||||
cueLine,
|
||||
fallbackIndex,
|
||||
agentLookup,
|
||||
)
|
||||
return {
|
||||
...normalized,
|
||||
tokens: normalized.tokens.map((token) => ({
|
||||
...token,
|
||||
role: normalized.role,
|
||||
agentId: normalized.agentId,
|
||||
agentName: normalized.agentName,
|
||||
agentRole: normalized.agentRole,
|
||||
})),
|
||||
}
|
||||
},
|
||||
@ -366,6 +416,9 @@ export const buildKaraokeLines = (structuredLyric) => {
|
||||
start: first.start ?? toTime(baseLine.start) ?? fallbackStart,
|
||||
end: first.end ?? toTime(baseLine.end) ?? fallbackEnd,
|
||||
value,
|
||||
agentId: first.agentId,
|
||||
agentName: first.agentName,
|
||||
agentRole: first.agentRole,
|
||||
tokens,
|
||||
}
|
||||
})
|
||||
|
||||
@ -124,6 +124,49 @@ describe('lyrics helpers', () => {
|
||||
expect(layers.pronunciation).toBeNull()
|
||||
})
|
||||
|
||||
it('falls back to unsynced lyric content when no timed track exists', () => {
|
||||
const layers = selectLyricLayers(
|
||||
[
|
||||
{
|
||||
lang: 'eng',
|
||||
synced: false,
|
||||
line: [{ value: 'Plain embedded lyric' }],
|
||||
},
|
||||
],
|
||||
'eng',
|
||||
)
|
||||
|
||||
expect(layers.main).toEqual({
|
||||
lang: 'eng',
|
||||
synced: false,
|
||||
line: [{ value: 'Plain embedded lyric' }],
|
||||
})
|
||||
})
|
||||
|
||||
it('still prefers timed lyrics when both timed and untimed tracks exist', () => {
|
||||
const layers = selectLyricLayers(
|
||||
[
|
||||
{
|
||||
lang: 'eng',
|
||||
synced: false,
|
||||
line: [{ value: 'Plain lyric' }],
|
||||
},
|
||||
{
|
||||
lang: 'eng',
|
||||
synced: true,
|
||||
line: [{ start: 1000, value: 'Timed lyric' }],
|
||||
},
|
||||
],
|
||||
'eng',
|
||||
)
|
||||
|
||||
expect(layers.main).toEqual({
|
||||
lang: 'eng',
|
||||
synced: true,
|
||||
line: [{ start: 1000, value: 'Timed lyric' }],
|
||||
})
|
||||
})
|
||||
|
||||
it('matches layer line by timing for the active main line', () => {
|
||||
const mainLines = [
|
||||
{ index: 0, start: 1000, end: 1800, value: 'Line A', tokens: [] },
|
||||
@ -200,43 +243,88 @@ describe('lyrics helpers', () => {
|
||||
expect(getPreferredLyricLanguage()).toBe('pt-BR')
|
||||
})
|
||||
|
||||
it('builds karaoke lines from cueLine payload', () => {
|
||||
it('builds karaoke lines from agent-based cueLine payload', () => {
|
||||
const lines = buildKaraokeLines({
|
||||
lang: 'eng',
|
||||
synced: true,
|
||||
line: [{ start: 1000, end: 3000, value: 'Hello world' }],
|
||||
agents: [
|
||||
{ id: 'lead', role: 'main', name: 'Lead Vocal' },
|
||||
{ id: 'backing', role: 'bg' },
|
||||
],
|
||||
cueLine: [
|
||||
{
|
||||
index: 0,
|
||||
start: 1000,
|
||||
end: 3000,
|
||||
value: 'Hello world',
|
||||
agentId: 'lead',
|
||||
cue: [{ start: 1000, end: 1500, value: 'Hello' }],
|
||||
},
|
||||
{
|
||||
index: 0,
|
||||
start: 1000,
|
||||
end: 3000,
|
||||
value: 'Hello world',
|
||||
agentId: 'backing',
|
||||
cue: [{ start: 2000, end: 2500, value: 'world' }],
|
||||
},
|
||||
],
|
||||
})
|
||||
|
||||
expect(lines).toEqual([
|
||||
{
|
||||
agentId: 'lead',
|
||||
agentName: 'Lead Vocal',
|
||||
agentRole: 'main',
|
||||
index: 0,
|
||||
start: 1000,
|
||||
end: 3000,
|
||||
value: 'Hello world',
|
||||
tokens: [
|
||||
{
|
||||
start: 1000,
|
||||
end: 1500,
|
||||
value: 'Hello',
|
||||
role: '',
|
||||
agentId: 'lead',
|
||||
agentName: 'Lead Vocal',
|
||||
agentRole: 'main',
|
||||
},
|
||||
{
|
||||
start: 2000,
|
||||
end: 2500,
|
||||
value: 'world',
|
||||
role: 'bg',
|
||||
agentId: 'backing',
|
||||
agentName: '',
|
||||
agentRole: 'bg',
|
||||
},
|
||||
],
|
||||
},
|
||||
])
|
||||
})
|
||||
|
||||
it('falls back to legacy cueLine role values when agents are absent', () => {
|
||||
const lines = buildKaraokeLines({
|
||||
lang: 'eng',
|
||||
synced: true,
|
||||
line: [{ start: 1000, end: 3000, value: 'Hello world' }],
|
||||
cueLine: [
|
||||
{
|
||||
index: 0,
|
||||
start: 1000,
|
||||
end: 3000,
|
||||
value: 'Hello world',
|
||||
role: '',
|
||||
cue: [{ start: 1000, end: 1500, value: 'Hello' }],
|
||||
},
|
||||
{
|
||||
index: 0,
|
||||
start: 1000,
|
||||
end: 3000,
|
||||
value: 'Hello world',
|
||||
role: 'bg',
|
||||
cue: [{ start: 2000, end: 2500, value: 'world' }],
|
||||
cue: [{ start: 1000, end: 1500, value: 'Hello' }],
|
||||
},
|
||||
],
|
||||
})
|
||||
|
||||
expect(lines).toEqual([
|
||||
{
|
||||
index: 0,
|
||||
start: 1000,
|
||||
end: 3000,
|
||||
value: 'Hello world',
|
||||
tokens: [
|
||||
{ start: 1000, end: 1500, value: 'Hello', role: '' },
|
||||
{ start: 2000, end: 2500, value: 'world', role: 'bg' },
|
||||
],
|
||||
},
|
||||
])
|
||||
expect(lines[0].tokens[0].role).toBe('bg')
|
||||
expect(lines[0].tokens[0].agentId).toBe('')
|
||||
expect(lines[0].tokens[0].agentName).toBe('')
|
||||
})
|
||||
|
||||
it('sorts token timing by start to keep playback stable', () => {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user