// sqliteOptions groups the SQLite tuning knobs exposed under the SQLite
// section of the server configuration.
//
// NOTE(review): the "Default" values quoted on the fields are the documented
// defaults; this struct does not set them itself, so confirm they are
// registered wherever the other configuration defaults are defined. The
// database setup code only issues a PRAGMA for a field that is non-empty
// (strings) or greater than zero (ints).
type sqliteOptions struct {
	// JournalMode sets the SQLite journal mode (WAL, DELETE, etc).
	// Default: WAL - provides better concurrency but may not work on network filesystems.
	// An empty value means no journal_mode PRAGMA is issued.
	JournalMode string `json:",omitzero"`

	// BusyTimeout sets how long SQLite should wait for locks to clear, in milliseconds.
	// Default: 5000 - waits up to 5 seconds before returning "database is locked".
	// A value of zero or less means no busy_timeout PRAGMA is issued.
	BusyTimeout int `json:",omitzero"`

	// SyncMode controls how aggressively SQLite writes to disk (PRAGMA synchronous).
	// Default: NORMAL - good balance of safety and performance.
	// An empty value means no synchronous PRAGMA is issued.
	SyncMode string `json:",omitzero"`

	// MaxConnections limits concurrent database connections.
	// Default: 0, which falls back to max(4, runtime.NumCPU()); only values
	// greater than zero override that fallback.
	MaxConnections int `json:",omitzero"`
}
"overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(sqlite_lock_wait_duration_seconds_sum[5m])", + "interval": "", + "legendFormat": "{{operation}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "SQLite Lock Wait Duration (5m rate)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + 
"pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(sqlite_lock_errors_total[5m])", + "interval": "", + "legendFormat": "{{operation}} - {{type}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "SQLite Lock Errors (5m rate)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(sqlite_operation_retries_total[5m])", + "interval": "", + "legendFormat": "{{operation}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "SQLite Operation Retries (5m rate)", + "tooltip": { + "shared": 
true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "schemaVersion": 27, + "style": "dark", + "tags": ["navidrome", "sqlite"], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Navidrome SQLite Monitoring", + "version": 1 +} diff --git a/core/metrics/sqlite.go b/core/metrics/sqlite.go new file mode 100644 index 000000000..4d44d9a2e --- /dev/null +++ b/core/metrics/sqlite.go @@ -0,0 +1,53 @@ +package metrics + +import ( + "github.com/prometheus/client_golang/prometheus" +) + +var ( + sqliteLockWaitDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "sqlite_lock_wait_duration_seconds", + Help: "Time spent waiting for SQLite locks to be released", + Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10}, + }, + []string{"operation"}, + ) + + sqliteLockErrors = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "sqlite_lock_errors_total", + Help: "Number of SQLite lock-related errors", + }, + []string{"operation", "type"}, + ) + + sqliteRetries = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "sqlite_operation_retries_total", + Help: "Number of retried SQLite operations", + }, + []string{"operation"}, + ) +) + +func init() { + prometheus.MustRegister(sqliteLockWaitDuration) + prometheus.MustRegister(sqliteLockErrors) + prometheus.MustRegister(sqliteRetries) +} + +// ObserveSQLiteLockWait records the duration spent waiting for a lock +func ObserveSQLiteLockWait(operation string, duration float64) { + 
sqliteLockWaitDuration.WithLabelValues(operation).Observe(duration) +} + +// IncrementSQLiteLockError increments the counter for lock-related errors +func IncrementSQLiteLockError(operation, errType string) { + sqliteLockErrors.WithLabelValues(operation, errType).Inc() +} + +// IncrementSQLiteRetry increments the counter for operation retries +func IncrementSQLiteRetry(operation string) { + sqliteRetries.WithLabelValues(operation).Inc() +} diff --git a/db/db.go b/db/db.go index 0945d1a00..350a4ebcb 100644 --- a/db/db.go +++ b/db/db.go @@ -41,7 +41,32 @@ func Db() *sql.DB { } log.Debug("Opening DataBase", "dbPath", Path, "driver", Driver) db, err := sql.Open(Driver, Path) - db.SetMaxOpenConns(max(4, runtime.NumCPU())) + + maxConns := max(4, runtime.NumCPU()) + if conf.Server.SQLite.MaxConnections > 0 { + maxConns = conf.Server.SQLite.MaxConnections + } + db.SetMaxOpenConns(maxConns) + + // Configure SQLite PRAGMAs to improve concurrency and reduce "database is locked" errors + // WAL allows concurrent readers while a writer is active + // busy_timeout tells SQLite how long to wait for a lock before error + // Note: some network filesystems (NFS/CIFS) may not fully support WAL + if conf.Server.SQLite.JournalMode != "" { + if _, err := db.Exec("PRAGMA journal_mode=" + conf.Server.SQLite.JournalMode + ";"); err != nil { + log.Error("Error setting PRAGMA journal_mode", "mode", conf.Server.SQLite.JournalMode, err) + } + } + if conf.Server.SQLite.BusyTimeout > 0 { + if _, err := db.Exec(fmt.Sprintf("PRAGMA busy_timeout=%d;", conf.Server.SQLite.BusyTimeout)); err != nil { + log.Error("Error setting PRAGMA busy_timeout", "timeout", conf.Server.SQLite.BusyTimeout, err) + } + } + if conf.Server.SQLite.SyncMode != "" { + if _, err := db.Exec("PRAGMA synchronous=" + conf.Server.SQLite.SyncMode + ";"); err != nil { + log.Error("Error setting PRAGMA synchronous", "mode", conf.Server.SQLite.SyncMode, err) + } + } if err != nil { log.Fatal("Error opening database", err) } diff 
-- +migrate Up
-- Seed the SQLite tuning properties with their default values. Every
-- statement is an insert-if-absent, so a value that is already stored is left
-- untouched and re-running the migration is harmless.
--
-- The journal-mode statement was previously an UPDATE guarded by
-- "name = 'sqlite_journal_mode' AND NOT EXISTS (row with that same name)".
-- Those two conditions can never hold together, so it matched no rows and the
-- property was never created. It now mirrors the two statements below it.
--
-- NOTE(review): this file has a .go name but contains SQL and uses "+migrate"
-- annotations; confirm the migration runner in use actually picks it up.
-- NOTE(review): confirm that something reads these rows; the database setup
-- code shown alongside this migration takes its values from the config file.
INSERT INTO user_property (name, value)
SELECT 'sqlite_journal_mode', 'WAL'
WHERE NOT EXISTS (
    SELECT 1 FROM user_property WHERE name = 'sqlite_journal_mode'
);

INSERT INTO user_property (name, value)
SELECT 'sqlite_busy_timeout', '5000'
WHERE NOT EXISTS (
    SELECT 1 FROM user_property WHERE name = 'sqlite_busy_timeout'
);

INSERT INTO user_property (name, value)
SELECT 'sqlite_sync_mode', 'NORMAL'
WHERE NOT EXISTS (
    SELECT 1 FROM user_property WHERE name = 'sqlite_sync_mode'
);

-- +migrate Down
-- Remove the three properties seeded above.
DELETE FROM user_property WHERE name IN (
    'sqlite_journal_mode',
    'sqlite_busy_timeout',
    'sqlite_sync_mode'
);
+ +### SyncMode + +- **Default:** `"NORMAL"` +- **Options:** `"OFF"`, `"NORMAL"`, `"FULL"`, `"EXTRA"` +- **Description:** Controls how aggressively SQLite writes data to disk. NORMAL provides a good balance between safety and performance. + +### MaxConnections + +- **Default:** `0` (uses max(4, number of CPU cores)) +- **Description:** Maximum number of concurrent database connections. Lower this if you experience "database is locked" errors, especially on network filesystems. + +## Example Configuration + +```toml +[SQLite] +JournalMode = "WAL" # Enable Write-Ahead Logging for better concurrency +BusyTimeout = 5000 # Wait up to 5 seconds for locks to clear +SyncMode = "NORMAL" # Good balance of durability and performance +MaxConnections = 4 # Limit concurrent connections if needed +``` + +## Network Filesystem Considerations + +If your database is on a network filesystem (NFS, CIFS, etc.): + +1. Consider moving the database to local storage +2. If using network storage is required: + - Set `JournalMode = "DELETE"` + - Lower `MaxConnections` to reduce contention + - Increase `BusyTimeout` for better reliability diff --git a/docs/config/sqlite_monitoring.md b/docs/config/sqlite_monitoring.md new file mode 100644 index 000000000..35b36aab7 --- /dev/null +++ b/docs/config/sqlite_monitoring.md @@ -0,0 +1,92 @@ +# SQLite Monitoring + +Navidrome provides several Prometheus metrics to monitor SQLite database performance and lock contention: + +## Available Metrics + +### Lock Wait Duration + +- **Metric**: `sqlite_lock_wait_duration_seconds` +- **Type**: Histogram +- **Labels**: `operation` +- **Description**: Time spent waiting for SQLite locks to be released +- **Use Case**: Identify operations that are frequently blocked by locks + +### Lock Errors + +- **Metric**: `sqlite_lock_errors_total` +- **Type**: Counter +- **Labels**: `operation`, `type` +- **Description**: Number of SQLite lock-related errors +- **Types**: + - `retryable`: Temporary lock errors that can be 
retried + - `non_retryable`: Fatal errors that cannot be retried + +### Operation Retries + +- **Metric**: `sqlite_operation_retries_total` +- **Type**: Counter +- **Labels**: `operation` +- **Description**: Number of retried SQLite operations +- **Use Case**: Track which operations require frequent retries + +## Grafana Dashboard + +A pre-configured Grafana dashboard is available at `contrib/grafana/sqlite-dashboard.json`. +This dashboard provides visualizations for: + +- Lock wait duration trends +- Lock error rates by operation +- Retry rates by operation + +## Alerting Recommendations + +Consider setting up alerts for: + +1. High lock wait durations (> 1s) +2. Increasing error rates +3. Frequent retries on specific operations + +Example Prometheus alert rules: + +```yaml +groups: + - name: SQLiteAlerts + rules: + - alert: SQLiteLongLockWaits + expr: rate(sqlite_lock_wait_duration_seconds_sum[5m]) / rate(sqlite_lock_wait_duration_seconds_count[5m]) > 1 + for: 5m + labels: + severity: warning + annotations: + description: "SQLite operations are waiting a long time for locks" + + - alert: SQLiteHighErrorRate + expr: rate(sqlite_lock_errors_total[5m]) > 0.1 + for: 5m + labels: + severity: warning + annotations: + description: "High rate of SQLite lock errors" +``` + +## Troubleshooting with Metrics + +If you see: + +1. High lock wait durations: + + - Consider adjusting `SQLite.BusyTimeout` + - Review concurrent operations + - Check whether the database is on network storage + +2. Many retryable errors: + + - Increase `SQLite.BusyTimeout` + - Consider reducing `SQLite.MaxConnections` + - Enable WAL mode if not using network storage + +3. 
High retry rates: + - Review operations causing contention + - Consider batching updates + - Check for long-running transactions diff --git a/persistence/playlist_repository.go b/persistence/playlist_repository.go index 4152505d2..d6d3a6754 100644 --- a/persistence/playlist_repository.go +++ b/persistence/playlist_repository.go @@ -111,7 +111,13 @@ func (r *playlistRepository) Put(p *model.Playlist, cols ...string) error { } pls.UpdatedAt = time.Now() - id, err := r.put(pls.ID, pls) + var id string + err := RetryWithBackoff(r.ctx, "playlist_put", func() error { + var putErr error + id, putErr = r.put(pls.ID, pls) + return putErr + }, 3, 100*time.Millisecond, 2*time.Second) + if err != nil { return err } diff --git a/persistence/retry.go b/persistence/retry.go new file mode 100644 index 000000000..80a8805d3 --- /dev/null +++ b/persistence/retry.go @@ -0,0 +1,81 @@ +package persistence + +import ( + "context" + "database/sql" + "errors" + "math/rand" + "strings" + "time" + + "github.com/navidrome/navidrome/core/metrics" + + "github.com/navidrome/navidrome/log" +) + +// RetryWithBackoff attempts an operation with exponential backoff +// maxAttempts: maximum number of attempts (minimum 1) +// initialDelay: delay before first retry +// maxDelay: maximum delay between retries +func RetryWithBackoff(ctx context.Context, operation string, op func() error, maxAttempts int, initialDelay, maxDelay time.Duration) error { + var lastErr error + delay := initialDelay + startTime := time.Now() + + for attempt := 1; attempt <= maxAttempts; attempt++ { + err := op() + if err == nil { + if attempt > 1 { + // Record successful retry + metrics.IncrementSQLiteRetry(operation) + } + return nil + } + + lastErr = err + if !isRetryableError(err) { + // Record non-retryable error + metrics.IncrementSQLiteLockError(operation, "non_retryable") + return err + } + + metrics.IncrementSQLiteLockError(operation, "retryable") + if attempt == maxAttempts { + break + } + + // Use exponential backoff 
with jitter + jitter := time.Duration(float64(delay) * (0.5 + rand.Float64())) // 50-150% of base delay + if jitter > maxDelay { + jitter = maxDelay + } + + log.Debug(ctx, "Retrying operation after error", + "operation", operation, + "attempt", attempt, + "maxAttempts", maxAttempts, + "delay", jitter, + "error", err) + + select { + case <-time.After(jitter): + metrics.ObserveSQLiteLockWait(operation, jitter.Seconds()) + case <-ctx.Done(): + return ctx.Err() + } + + delay *= 2 + } + + return lastErr +} + +func isRetryableError(err error) bool { + if err == nil { + return false + } + errStr := err.Error() + return strings.Contains(errStr, "database is locked") || + strings.Contains(errStr, "busy") || + errors.Is(err, sql.ErrConnDone) +}