feat: parse embedded rich lyrics

This commit is contained in:
ranokay 2026-05-26 19:59:41 +03:00
parent e9f969e3d2
commit 719fa5dc99
No known key found for this signature in database
7 changed files with 283 additions and 6 deletions

View File

@ -52,7 +52,7 @@ A share of the revenue helps fund the development of Navidrome at no additional
- **Multi-platform**, runs on macOS, Linux and Windows. **Docker** images are also provided
- Ready to use binaries for all major platforms, including **Raspberry Pi**
- Automatically **monitors your library** for changes, importing new files and reloading new metadata
- Supports lyrics from sidecar **.ttml**, **.elrc**, **.lrc**, **.srt**, **.txt** files and embedded tags (via `lyricspriority`)
- Supports lyrics from sidecar **.ttml**, **.elrc**, **.lrc**, **.srt**, **.txt** files and embedded **TTML**, **Enhanced LRC**, **LRC**, **SRT**, and plain-text tags (via `lyricspriority`)
- **Themeable**, modern and responsive **Web interface** based on [Material UI](https://material-ui.com)
- **Compatible** with all Subsonic/Madsonic/Airsonic [clients](https://www.navidrome.org/docs/overview/#apps)
- **Transcoding** on the fly. Can be set per user/player. **Opus encoding is supported**

64
core/lyrics/embedded.go Normal file
View File

@ -0,0 +1,64 @@
package lyrics
import (
"encoding/xml"
"strings"
"github.com/navidrome/navidrome/log"
"github.com/navidrome/navidrome/model"
)
// ParseEmbedded turns the raw text of a lyrics metadata tag into a lyric list.
//
// Formats are tried from most to least structured: TTML (only when the text
// looks like a <tt> XML document), then SRT, and finally the generic LRC /
// plain-text conversion done by model.ToLyrics. Rich formats are probed first
// because text sanitization would otherwise strip the TTML XML markup.
//
// language is the language taken from the tag and is handed to each parser.
// A leading UTF-8 byte order mark in text is ignored. Failures of the TTML and
// SRT attempts are only logged; the only error returned is the one reported by
// model.ToLyrics. When nothing usable is found the result is (nil, nil).
func ParseEmbedded(language, text string) (model.LyricList, error) {
	text = strings.TrimPrefix(text, "\ufeff")

	if isTTMLDocument(text) {
		ttmlList, ttmlErr := parseTTMLWithDefaultLang([]byte(text), language)
		switch {
		case ttmlErr != nil:
			log.Warn("Error parsing embedded TTML lyrics, falling back to plain lyrics", "error", ttmlErr)
		case len(ttmlList) > 0:
			return ttmlList, nil
		}
	}

	srtList, srtErr := parseSRTWithLanguage([]byte(text), language)
	if srtErr != nil {
		// Only complain when the text contains the SRT timing arrow; other
		// payloads are expected to fail here and move on silently.
		if strings.Contains(text, "-->") {
			log.Warn("Error parsing embedded SRT lyrics, falling back to plain lyrics", "error", srtErr)
		}
	} else if len(srtList) > 0 {
		return srtList, nil
	}

	plain, plainErr := model.ToLyrics(language, text)
	switch {
	case plainErr != nil:
		return nil, plainErr
	case plain == nil || plain.IsEmpty():
		return nil, nil
	}
	return model.LyricList{*plain}, nil
}
// isTTMLDocument reports whether text is XML whose first element is <tt>. The
// element's local name is compared case-insensitively and its namespace is not
// checked. Surrounding whitespace is ignored; comments and other tokens that
// precede the first element are skipped.
//
// A leading XML declaration is dropped before tokenizing. encoding/xml rejects
// any declared encoding other than UTF-8 when no CharsetReader is configured,
// which would make a TTML payload declaring e.g. ISO-8859-1 look like plain
// text, even though the TTML parser rewrites that declaration to UTF-8 itself.
// As a consequence the declaration is not validated by this sniff; the real
// parser still receives the complete document.
func isTTMLDocument(text string) bool {
	doc := strings.TrimSpace(text)
	if strings.HasPrefix(doc, "<?xml") {
		if end := strings.Index(doc, "?>"); end >= 0 {
			doc = doc[end+len("?>"):]
		}
	}

	decoder := xml.NewDecoder(strings.NewReader(doc))
	for {
		token, err := decoder.Token()
		if err != nil {
			// Either malformed input or EOF reached before any element.
			return false
		}
		if start, ok := token.(xml.StartElement); ok {
			return strings.EqualFold(start.Name.Local, "tt")
		}
	}
}
// normalizeEmbeddedLanguage returns language with surrounding whitespace
// removed and all letters lower-cased. When nothing is left after trimming,
// the placeholder code "xxx" is returned instead.
func normalizeEmbeddedLanguage(language string) string {
	if normalized := strings.ToLower(strings.TrimSpace(language)); normalized != "" {
		return normalized
	}
	return "xxx"
}

View File

@ -0,0 +1,157 @@
package lyrics
import (
"strings"
"github.com/navidrome/navidrome/model"
"github.com/navidrome/navidrome/utils/gg"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// Specs for ParseEmbedded: one case per supported embedded payload (TTML,
// TTML with extra tracks, SRT, enhanced LRC) plus the fallback taken when a
// TTML document cannot be parsed.
var _ = Describe("ParseEmbedded", func() {
// The TTML below declares no xml:lang, so the tag language ("ENG") is expected
// to be used, reported in lower case.
It("should parse embedded TTML with the tag language as the default", func() {
content := `<tt xmlns="http://www.w3.org/ns/ttml" xmlns:ttm="http://www.w3.org/ns/ttml#metadata">
<head>
<metadata>
<ttm:agent xml:id="lead" ttm:type="person">
<ttm:name>Lead Vocal</ttm:name>
</ttm:agent>
</metadata>
</head>
<body>
<div>
<p begin="00:00:01.000" end="00:00:03.000">
<span begin="00:00:01.000" end="00:00:02.000" ttm:agent="lead">Hello </span><span begin="00:00:02.000" end="00:00:03.000" ttm:agent="lead">world</span>
</p>
</div>
</body>
</tt>`
list, err := ParseEmbedded("ENG", content)
Expect(err).ToNot(HaveOccurred())
Expect(list).To(HaveLen(1))
Expect(list[0].Kind).To(Equal("main"))
Expect(list[0].Lang).To(Equal("eng"))
Expect(list[0].Synced).To(BeTrue())
Expect(list[0].Agents).To(Equal([]model.Agent{{ID: "lead", Role: "main", Name: "Lead Vocal"}}))
// The two <span> elements collapse into one line carrying two cues.
Expect(list[0].Line).To(HaveLen(1))
Expect(list[0].Line[0].Start).To(Equal(gg.P(int64(1000))))
Expect(list[0].Line[0].End).To(Equal(gg.P(int64(3000))))
Expect(list[0].Line[0].Value).To(Equal("Hello world"))
Expect(list[0].Line[0].Cue).To(HaveLen(2))
Expect(list[0].Line[0].Cue[0].AgentID).To(Equal("lead"))
Expect(list[0].Line[0].Cue[0].ByteStart).To(Equal(0))
Expect(list[0].Line[0].Cue[0].ByteEnd).To(Equal(5))
Expect(list[0].Line[0].Cue[1].ByteStart).To(Equal(6))
Expect(list[0].Line[0].Cue[1].ByteEnd).To(Equal(10))
})
// A single document is expected to yield three tracks, in order: main,
// translation and pronunciation. The body's xml:lang="ja" is expected to win
// over the "eng" tag language passed to ParseEmbedded.
It("should preserve embedded TTML translation and pronunciation tracks", func() {
content := `<tt xmlns="http://www.w3.org/ns/ttml" xmlns:itunes="http://music.apple.com/lyric-ttml-internal">
<head>
<metadata>
<iTunesMetadata xmlns="http://music.apple.com/lyric-ttml-internal">
<translations>
<translation xml:lang="es">
<text for="L1">Hola</text>
</translation>
</translations>
<transliterations>
<transliteration xml:lang="ja-Latn">
<text for="L1"><span begin="00:00:01.000" end="00:00:01.300" xmlns="http://www.w3.org/ns/ttml">ko</span><span begin="00:00:01.300" end="00:00:01.600" xmlns="http://www.w3.org/ns/ttml">nni</span></text>
</transliteration>
</transliterations>
</iTunesMetadata>
</metadata>
</head>
<body xml:lang="ja">
<div>
<p begin="00:00:01.000" end="00:00:02.000" itunes:key="L1">こんにちは</p>
</div>
</body>
</tt>`
list, err := ParseEmbedded("eng", content)
Expect(err).ToNot(HaveOccurred())
Expect(list).To(HaveLen(3))
Expect(list[0].Kind).To(Equal("main"))
Expect(list[0].Lang).To(Equal("ja"))
Expect(list[0].Line[0].Value).To(Equal("こんにちは"))
Expect(list[1].Kind).To(Equal("translation"))
Expect(list[1].Lang).To(Equal("es"))
Expect(list[1].Line[0].Value).To(Equal("Hola"))
Expect(list[2].Kind).To(Equal("pronunciation"))
// The transliteration language "ja-Latn" is expected back lower-cased.
Expect(list[2].Lang).To(Equal("ja-latn"))
Expect(list[2].Line[0].Value).To(Equal("konni"))
Expect(list[2].Line[0].Cue).To(HaveLen(2))
})
// SRT carries no language of its own, so the tag language ("POR") is expected
// on the resulting track, lower-cased.
It("should parse embedded SRT with the tag language", func() {
content := `1
00:00:18,800 --> 00:00:22,800
We're from subtitles
2
00:00:22,801 --> 00:00:26,000
Another subtitle line`
list, err := ParseEmbedded("POR", content)
Expect(err).ToNot(HaveOccurred())
Expect(list).To(Equal(model.LyricList{
{
Lang: "por",
Line: []model.Line{
{
Start: gg.P(int64(18800)),
End: gg.P(int64(22800)),
Value: "We're from subtitles",
},
{
Start: gg.P(int64(22801)),
End: gg.P(int64(26000)),
Value: "Another subtitle line",
},
},
Synced: true,
},
}))
})
// Enhanced LRC is neither TTML nor SRT, so this exercises the final
// model.ToLyrics path; the inline <mm:ss.xx> markers are expected to become cues.
It("should keep embedded enhanced LRC cues", func() {
content := "[00:01.00]<00:01.00>Lead <00:01.50>words"
list, err := ParseEmbedded("eng", content)
Expect(err).ToNot(HaveOccurred())
Expect(list).To(HaveLen(1))
Expect(list[0].Lang).To(Equal("eng"))
Expect(list[0].Synced).To(BeTrue())
Expect(list[0].Line[0].Value).To(Equal("Lead words"))
Expect(list[0].Line[0].Cue).To(HaveLen(2))
})
// begin="not-a-time" makes the document unusable as TTML; the expectation is
// an unsynced plain-text track that still contains the paragraph text.
It("should fall back to plain lyrics when embedded TTML is invalid", func() {
content := `<tt xmlns="http://www.w3.org/ns/ttml">
<body>
<p begin="not-a-time">Broken</p>
</body>
</tt>`
list, err := ParseEmbedded("eng", content)
Expect(err).ToNot(HaveOccurred())
Expect(list).To(HaveLen(1))
Expect(list[0].Lang).To(Equal("eng"))
Expect(list[0].Synced).To(BeFalse())
Expect(list[0].Line).ToNot(BeEmpty())
// Join every line so the assertion does not depend on how the fallback
// splits the text into lines.
values := make([]string, 0, len(list[0].Line))
for _, line := range list[0].Line {
values = append(values, line.Value)
}
Expect(strings.Join(values, "\n")).To(ContainSubstring("Broken"))
})
})

View File

@ -13,6 +13,10 @@ import (
var srtTimeRegex = regexp.MustCompile(`^\s*(\d{1,2}):(\d{2}):(\d{2})[,.](\d{1,3})\s*$`)
// parseSRT parses SRT contents that carry no language information. It
// delegates to parseSRTWithLanguage using the placeholder language "xxx".
func parseSRT(contents []byte) (model.LyricList, error) {
	const defaultLang = "xxx"
	return parseSRTWithLanguage(contents, defaultLang)
}
func parseSRTWithLanguage(contents []byte, language string) (model.LyricList, error) {
raw := strings.ReplaceAll(string(contents), "\r\n", "\n")
raw = strings.ReplaceAll(raw, "\r", "\n")
@ -34,7 +38,7 @@ func parseSRT(contents []byte) (model.LyricList, error) {
}
lyrics := model.NormalizeLyrics(model.Lyrics{
Lang: "xxx",
Lang: normalizeEmbeddedLanguage(language),
Line: lines,
Synced: true,
})

View File

@ -106,6 +106,10 @@ type ttmlParser struct {
}
// parseTTML parses a TTML document without an externally supplied language.
// It delegates to parseTTMLWithDefaultLang with the placeholder language "xxx"
// as the default.
func parseTTML(contents []byte) (model.LyricList, error) {
	const defaultLang = "xxx"
	return parseTTMLWithDefaultLang(contents, defaultLang)
}
func parseTTMLWithDefaultLang(contents []byte, defaultLang string) (model.LyricList, error) {
contents = xmlEncodingRegex.ReplaceAll(contents, []byte(`<?xml$1encoding="UTF-8"$2?>`))
p := ttmlParser{
@ -122,7 +126,7 @@ func parseTTML(contents []byte) (model.LyricList, error) {
definedAgents: make(map[string]ttmlDefinedAgent),
}
root := ttmlTimingContext{lang: "xxx"}
root := ttmlTimingContext{lang: normalizeTTMLLang(defaultLang)}
for {
token, err := p.decoder.Token()

View File

@ -8,6 +8,7 @@ import (
"strconv"
"github.com/navidrome/navidrome/conf"
lyricssvc "github.com/navidrome/navidrome/core/lyrics"
"github.com/navidrome/navidrome/log"
"github.com/navidrome/navidrome/model"
"github.com/navidrome/navidrome/utils/str"
@ -137,13 +138,15 @@ func (md Metadata) mapLyrics() string {
lang := raw.Key()
text := raw.Value()
lyrics, err := model.ToLyrics(lang, text)
lyrics, err := lyricssvc.ParseEmbedded(lang, text)
if err != nil {
log.Warn("Unexpected failure occurred when parsing lyrics", "file", md.filePath, err)
continue
}
if !lyrics.IsEmpty() {
lyricList = append(lyricList, *lyrics)
for _, lyric := range lyrics {
if !lyric.IsEmpty() {
lyricList = append(lyricList, lyric)
}
}
}

View File

@ -116,5 +116,50 @@ var _ = Describe("ToMediaFile", func() {
sort.Slice(expected, func(i, j int) bool { return expected[i].Lang < expected[j].Lang })
Expect(actual).To(Equal(expected))
})
// Feeds a raw LYRICS:ENG tag holding a TTML document through toMediaFile and
// decodes the resulting mf.Lyrics JSON. Expects a single synced "main" track
// in "eng" whose one line keeps the <p> timing (1000-2500 ms) and text.
It("should parse embedded TTML lyrics before sanitizing XML tags", func() {
mf = toMediaFile(model.RawTags{
"LYRICS:ENG": {`<tt xmlns="http://www.w3.org/ns/ttml">
<body>
<div>
<p begin="00:00:01.000" end="00:00:02.500">Embedded TTML line</p>
</div>
</body>
</tt>`},
})
var actual model.LyricList
err := json.Unmarshal([]byte(mf.Lyrics), &actual)
Expect(err).ToNot(HaveOccurred())
Expect(actual).To(Equal(model.LyricList{
{
Kind: "main",
Lang: "eng",
Line: []model.Line{{Start: P(int64(1000)), End: P(int64(2500)), Value: "Embedded TTML line"}},
Synced: true,
},
}))
})
// Feeds a raw LYRICS:POR tag holding a one-cue SRT payload through toMediaFile
// and decodes the resulting mf.Lyrics JSON. Expects a single synced track in
// "por" with the cue's start/end converted to milliseconds.
It("should parse embedded SRT lyrics with the tag language", func() {
mf = toMediaFile(model.RawTags{
"LYRICS:POR": {`1
00:00:18,800 --> 00:00:22,800
Estamos nas legendas`},
})
var actual model.LyricList
err := json.Unmarshal([]byte(mf.Lyrics), &actual)
Expect(err).ToNot(HaveOccurred())
Expect(actual).To(Equal(model.LyricList{
{
Lang: "por",
Line: []model.Line{
{Start: P(int64(18800)), End: P(int64(22800)), Value: "Estamos nas legendas"},
},
Synced: true,
},
}))
})
})
})