mirror of
https://github.com/navidrome/navidrome.git
synced 2026-05-03 06:51:16 +00:00
Merge 70f43437b5da8a1cd7440b3d3922267591cc1026 into 94eb6c522b63198bdc4565442d86918ad43156e5
This commit is contained in:
commit
5ed4704044
@ -110,6 +110,7 @@ type configOptions struct {
|
||||
PID pidOptions `json:",omitzero"`
|
||||
Inspect inspectOptions `json:",omitzero"`
|
||||
Subsonic subsonicOptions `json:",omitzero"`
|
||||
SQLite sqliteOptions `json:",omitzero"`
|
||||
LastFM lastfmOptions `json:",omitzero"`
|
||||
Deezer deezerOptions `json:",omitzero"`
|
||||
ListenBrainz listenBrainzOptions `json:",omitzero"`
|
||||
|
||||
20
conf/sqlite_options.go
Normal file
20
conf/sqlite_options.go
Normal file
@ -0,0 +1,20 @@
|
||||
package conf
|
||||
|
||||
// sqliteOptions groups the tunables that are applied to the SQLite database
// handle when it is opened.
type sqliteOptions struct {
	// JournalMode is the value sent as PRAGMA journal_mode (for example WAL
	// or DELETE). An empty string means no journal_mode PRAGMA is issued.
	// Documented default: WAL, which gives better concurrency but may not
	// work on network filesystems.
	JournalMode string `json:",omitzero"`

	// BusyTimeout is the value sent as PRAGMA busy_timeout, in milliseconds:
	// how long SQLite waits for a lock before reporting "database is locked".
	// Zero or a negative value means no busy_timeout PRAGMA is issued.
	// Documented default: 5000.
	BusyTimeout int `json:",omitzero"`

	// SyncMode is the value sent as PRAGMA synchronous, which controls how
	// aggressively SQLite flushes writes to disk. An empty string means no
	// synchronous PRAGMA is issued. Documented default: NORMAL.
	SyncMode string `json:",omitzero"`

	// MaxConnections caps the number of open database connections. Zero or
	// a negative value falls back to max(4, runtime.NumCPU()).
	MaxConnections int `json:",omitzero"`
}
|
||||
311
contrib/grafana/sqlite-dashboard.json
Normal file
311
contrib/grafana/sqlite-dashboard.json
Normal file
@ -0,0 +1,311 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"editable": true,
|
||||
"gnetId": null,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "Prometheus",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 2,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"nullPointMode": "null",
|
||||
"options": {
|
||||
"alertThreshold": true
|
||||
},
|
||||
"percentage": false,
|
||||
"pluginVersion": "7.4.0",
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"spaceLength": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(sqlite_lock_wait_duration_seconds_sum[5m])",
|
||||
"interval": "",
|
||||
"legendFormat": "{{operation}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeRegions": [],
|
||||
"timeShift": null,
|
||||
"title": "SQLite Lock Wait Duration (5m rate)",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "s",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false,
|
||||
"alignLevel": null
|
||||
}
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "Prometheus",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 4,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"nullPointMode": "null",
|
||||
"options": {
|
||||
"alertThreshold": true
|
||||
},
|
||||
"percentage": false,
|
||||
"pluginVersion": "7.4.0",
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"spaceLength": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(sqlite_lock_errors_total[5m])",
|
||||
"interval": "",
|
||||
"legendFormat": "{{operation}} - {{type}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeRegions": [],
|
||||
"timeShift": null,
|
||||
"title": "SQLite Lock Errors (5m rate)",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false,
|
||||
"alignLevel": null
|
||||
}
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "Prometheus",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 8
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 6,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"nullPointMode": "null",
|
||||
"options": {
|
||||
"alertThreshold": true
|
||||
},
|
||||
"percentage": false,
|
||||
"pluginVersion": "7.4.0",
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"spaceLength": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(sqlite_operation_retries_total[5m])",
|
||||
"interval": "",
|
||||
"legendFormat": "{{operation}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeRegions": [],
|
||||
"timeShift": null,
|
||||
"title": "SQLite Operation Retries (5m rate)",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false,
|
||||
"alignLevel": null
|
||||
}
|
||||
}
|
||||
],
|
||||
"schemaVersion": 27,
|
||||
"style": "dark",
|
||||
"tags": ["navidrome", "sqlite"],
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"time": {
|
||||
"from": "now-6h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "Navidrome SQLite Monitoring",
|
||||
"version": 1
|
||||
}
|
||||
53
core/metrics/sqlite.go
Normal file
53
core/metrics/sqlite.go
Normal file
@ -0,0 +1,53 @@
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
)
|
||||
|
||||
var (
|
||||
sqliteLockWaitDuration = prometheus.NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Name: "sqlite_lock_wait_duration_seconds",
|
||||
Help: "Time spent waiting for SQLite locks to be released",
|
||||
Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10},
|
||||
},
|
||||
[]string{"operation"},
|
||||
)
|
||||
|
||||
sqliteLockErrors = prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Name: "sqlite_lock_errors_total",
|
||||
Help: "Number of SQLite lock-related errors",
|
||||
},
|
||||
[]string{"operation", "type"},
|
||||
)
|
||||
|
||||
sqliteRetries = prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Name: "sqlite_operation_retries_total",
|
||||
Help: "Number of retried SQLite operations",
|
||||
},
|
||||
[]string{"operation"},
|
||||
)
|
||||
)
|
||||
|
||||
func init() {
|
||||
prometheus.MustRegister(sqliteLockWaitDuration)
|
||||
prometheus.MustRegister(sqliteLockErrors)
|
||||
prometheus.MustRegister(sqliteRetries)
|
||||
}
|
||||
|
||||
// ObserveSQLiteLockWait records the duration spent waiting for a lock
|
||||
func ObserveSQLiteLockWait(operation string, duration float64) {
|
||||
sqliteLockWaitDuration.WithLabelValues(operation).Observe(duration)
|
||||
}
|
||||
|
||||
// IncrementSQLiteLockError increments the counter for lock-related errors
|
||||
func IncrementSQLiteLockError(operation, errType string) {
|
||||
sqliteLockErrors.WithLabelValues(operation, errType).Inc()
|
||||
}
|
||||
|
||||
// IncrementSQLiteRetry increments the counter for operation retries
|
||||
func IncrementSQLiteRetry(operation string) {
|
||||
sqliteRetries.WithLabelValues(operation).Inc()
|
||||
}
|
||||
27
db/db.go
27
db/db.go
@ -41,7 +41,32 @@ func Db() *sql.DB {
|
||||
}
|
||||
log.Debug("Opening DataBase", "dbPath", Path, "driver", Driver)
|
||||
db, err := sql.Open(Driver, Path)
|
||||
db.SetMaxOpenConns(max(4, runtime.NumCPU()))
|
||||
|
||||
maxConns := max(4, runtime.NumCPU())
|
||||
if conf.Server.SQLite.MaxConnections > 0 {
|
||||
maxConns = conf.Server.SQLite.MaxConnections
|
||||
}
|
||||
db.SetMaxOpenConns(maxConns)
|
||||
|
||||
// Configure SQLite PRAGMAs to improve concurrency and reduce "database is locked" errors
|
||||
// WAL allows concurrent readers while a writer is active
|
||||
// busy_timeout tells SQLite how long to wait for a lock before error
|
||||
// Note: some network filesystems (NFS/CIFS) may not fully support WAL
|
||||
if conf.Server.SQLite.JournalMode != "" {
|
||||
if _, err := db.Exec("PRAGMA journal_mode=" + conf.Server.SQLite.JournalMode + ";"); err != nil {
|
||||
log.Error("Error setting PRAGMA journal_mode", "mode", conf.Server.SQLite.JournalMode, err)
|
||||
}
|
||||
}
|
||||
if conf.Server.SQLite.BusyTimeout > 0 {
|
||||
if _, err := db.Exec(fmt.Sprintf("PRAGMA busy_timeout=%d;", conf.Server.SQLite.BusyTimeout)); err != nil {
|
||||
log.Error("Error setting PRAGMA busy_timeout", "timeout", conf.Server.SQLite.BusyTimeout, err)
|
||||
}
|
||||
}
|
||||
if conf.Server.SQLite.SyncMode != "" {
|
||||
if _, err := db.Exec("PRAGMA synchronous=" + conf.Server.SQLite.SyncMode + ";"); err != nil {
|
||||
log.Error("Error setting PRAGMA synchronous", "mode", conf.Server.SQLite.SyncMode, err)
|
||||
}
|
||||
}
|
||||
if err != nil {
|
||||
log.Fatal("Error opening database", err)
|
||||
}
|
||||
|
||||
25
db/migrations/20251026_add_sqlite_settings.go
Normal file
25
db/migrations/20251026_add_sqlite_settings.go
Normal file
@ -0,0 +1,25 @@
|
||||
-- +migrate Up
-- Seed the SQLite tuning properties, leaving any value that already exists
-- untouched. Each statement inserts its row only when no row with that name
-- is present.
INSERT INTO user_property (name, value)
SELECT 'sqlite_journal_mode', 'WAL'
WHERE NOT EXISTS (
    SELECT 1 FROM user_property WHERE name = 'sqlite_journal_mode'
);

INSERT INTO user_property (name, value)
SELECT 'sqlite_busy_timeout', '5000'
WHERE NOT EXISTS (
    SELECT 1 FROM user_property WHERE name = 'sqlite_busy_timeout'
);

INSERT INTO user_property (name, value)
SELECT 'sqlite_sync_mode', 'NORMAL'
WHERE NOT EXISTS (
    SELECT 1 FROM user_property WHERE name = 'sqlite_sync_mode'
);

-- +migrate Down
-- Remove the three properties seeded by the Up migration.
DELETE FROM user_property WHERE name IN (
    'sqlite_journal_mode',
    'sqlite_busy_timeout',
    'sqlite_sync_mode'
);
|
||||
47
docs/config/sqlite.md
Normal file
47
docs/config/sqlite.md
Normal file
@ -0,0 +1,47 @@
|
||||
# SQLite Configuration Options
|
||||
|
||||
The following SQLite-specific configuration options are available under the `SQLite` section:
|
||||
|
||||
## Basic Options
|
||||
|
||||
### JournalMode
|
||||
|
||||
- **Default:** `"WAL"`
|
||||
- **Options:** `"DELETE"`, `"TRUNCATE"`, `"PERSIST"`, `"MEMORY"`, `"WAL"`, `"OFF"`
|
||||
- **Description:** Controls how SQLite manages its journal file. WAL (Write-Ahead Logging) mode generally provides better concurrency and performance but may not work on some network filesystems.
|
||||
|
||||
### BusyTimeout
|
||||
|
||||
- **Default:** `5000` (milliseconds)
|
||||
- **Description:** How long SQLite waits for a lock to clear before returning a "database is locked" error. Higher values reduce lock errors under contention, but a blocked request may take longer to respond.
|
||||
|
||||
### SyncMode
|
||||
|
||||
- **Default:** `"NORMAL"`
|
||||
- **Options:** `"OFF"`, `"NORMAL"`, `"FULL"`, `"EXTRA"`
|
||||
- **Description:** Controls how aggressively SQLite writes data to disk. NORMAL provides a good balance between safety and performance.
|
||||
|
||||
### MaxConnections
|
||||
|
||||
- **Default:** `0` (uses max(4, number of CPU cores))
|
||||
- **Description:** Maximum number of concurrent database connections. Lower this if you experience "database is locked" errors, especially on network filesystems.
|
||||
|
||||
## Example Configuration
|
||||
|
||||
```toml
|
||||
[SQLite]
|
||||
JournalMode = "WAL" # Enable Write-Ahead Logging for better concurrency
|
||||
BusyTimeout = 5000 # Wait up to 5 seconds for locks to clear
|
||||
SyncMode = "NORMAL" # Good balance of durability and performance
|
||||
MaxConnections = 4 # Limit concurrent connections if needed
|
||||
```
|
||||
|
||||
## Network Filesystem Considerations
|
||||
|
||||
If your database is on a network filesystem (NFS, CIFS, etc.):
|
||||
|
||||
1. Consider moving the database to local storage
|
||||
2. If using network storage is required:
|
||||
- Set `JournalMode = "DELETE"`
|
||||
- Lower `MaxConnections` to reduce contention
|
||||
- Increase `BusyTimeout` for better reliability
|
||||
92
docs/config/sqlite_monitoring.md
Normal file
92
docs/config/sqlite_monitoring.md
Normal file
@ -0,0 +1,92 @@
|
||||
# SQLite Monitoring
|
||||
|
||||
Navidrome provides several Prometheus metrics to monitor SQLite database performance and lock contention:
|
||||
|
||||
## Available Metrics
|
||||
|
||||
### Lock Wait Duration
|
||||
|
||||
- **Metric**: `sqlite_lock_wait_duration_seconds`
|
||||
- **Type**: Histogram
|
||||
- **Labels**: `operation`
|
||||
- **Description**: Time spent waiting for SQLite locks to be released
|
||||
- **Use Case**: Identify operations that are frequently blocked by locks
|
||||
|
||||
### Lock Errors
|
||||
|
||||
- **Metric**: `sqlite_lock_errors_total`
|
||||
- **Type**: Counter
|
||||
- **Labels**: `operation`, `type`
|
||||
- **Description**: Number of SQLite lock-related errors
|
||||
- **Types**:
|
||||
- `retryable`: Temporary lock errors that can be retried
|
||||
- `non_retryable`: Errors not recognized as transient lock/busy conditions; they are returned immediately without a retry
|
||||
|
||||
### Operation Retries
|
||||
|
||||
- **Metric**: `sqlite_operation_retries_total`
|
||||
- **Type**: Counter
|
||||
- **Labels**: `operation`
|
||||
- **Description**: Number of SQLite operations that succeeded after at least one retry
|
||||
- **Use Case**: Track which operations require frequent retries
|
||||
|
||||
## Grafana Dashboard
|
||||
|
||||
A pre-configured Grafana dashboard is available at `contrib/grafana/sqlite-dashboard.json`.
|
||||
This dashboard provides visualizations for:
|
||||
|
||||
- Lock wait duration trends
|
||||
- Lock error rates by operation
|
||||
- Retry rates by operation
|
||||
|
||||
## Alerting Recommendations
|
||||
|
||||
Consider setting up alerts for:
|
||||
|
||||
1. High lock wait durations (> 1s)
|
||||
2. Increasing error rates
|
||||
3. Frequent retries on specific operations
|
||||
|
||||
Example Prometheus alert rules:
|
||||
|
||||
```yaml
|
||||
groups:
|
||||
- name: SQLiteAlerts
|
||||
rules:
|
||||
- alert: SQLiteLongLockWaits
|
||||
expr: rate(sqlite_lock_wait_duration_seconds_sum[5m]) > 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: "SQLite operations are waiting long for locks"
|
||||
|
||||
- alert: SQLiteHighErrorRate
|
||||
expr: rate(sqlite_lock_errors_total[5m]) > 0.1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: "High rate of SQLite lock errors"
|
||||
```
|
||||
|
||||
## Troubleshooting with Metrics
|
||||
|
||||
If you see:
|
||||
|
||||
1. High lock wait durations:
|
||||
|
||||
- Consider adjusting `SQLite.BusyTimeout`
|
||||
- Review concurrent operations
|
||||
- Check if database is on network storage
|
||||
|
||||
2. Many retryable errors:
|
||||
|
||||
- Increase `SQLite.BusyTimeout`
|
||||
- Consider reducing `SQLite.MaxConnections`
|
||||
- Enable WAL mode if not using network storage
|
||||
|
||||
3. High retry rates:
|
||||
- Review operations causing contention
|
||||
- Consider batching updates
|
||||
- Check for long-running transactions
|
||||
@ -111,7 +111,13 @@ func (r *playlistRepository) Put(p *model.Playlist, cols ...string) error {
|
||||
}
|
||||
pls.UpdatedAt = time.Now()
|
||||
|
||||
id, err := r.put(pls.ID, pls)
|
||||
var id string
|
||||
err := RetryWithBackoff(r.ctx, "playlist_put", func() error {
|
||||
var putErr error
|
||||
id, putErr = r.put(pls.ID, pls)
|
||||
return putErr
|
||||
}, 3, 100*time.Millisecond, 2*time.Second)
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
81
persistence/retry.go
Normal file
81
persistence/retry.go
Normal file
@ -0,0 +1,81 @@
|
||||
package persistence
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"errors"
|
||||
"math/rand"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/navidrome/navidrome/core/metrics"
|
||||
|
||||
"github.com/navidrome/navidrome/log"
|
||||
)
|
||||
|
||||
// RetryWithBackoff attempts an operation with exponential backoff
|
||||
// maxAttempts: maximum number of attempts (minimum 1)
|
||||
// initialDelay: delay before first retry
|
||||
// maxDelay: maximum delay between retries
|
||||
func RetryWithBackoff(ctx context.Context, operation string, op func() error, maxAttempts int, initialDelay, maxDelay time.Duration) error {
|
||||
var lastErr error
|
||||
delay := initialDelay
|
||||
startTime := time.Now()
|
||||
|
||||
for attempt := 1; attempt <= maxAttempts; attempt++ {
|
||||
err := op()
|
||||
if err == nil {
|
||||
if attempt > 1 {
|
||||
// Record successful retry
|
||||
metrics.IncrementSQLiteRetry(operation)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
lastErr = err
|
||||
if !isRetryableError(err) {
|
||||
// Record non-retryable error
|
||||
metrics.IncrementSQLiteLockError(operation, "non_retryable")
|
||||
return err
|
||||
}
|
||||
|
||||
metrics.IncrementSQLiteLockError(operation, "retryable")
|
||||
if attempt == maxAttempts {
|
||||
break
|
||||
}
|
||||
|
||||
// Use exponential backoff with jitter
|
||||
jitter := time.Duration(float64(delay) * (0.5 + rand.Float64())) // 50-150% of base delay
|
||||
if jitter > maxDelay {
|
||||
jitter = maxDelay
|
||||
}
|
||||
|
||||
log.Debug(ctx, "Retrying operation after error",
|
||||
"operation", operation,
|
||||
"attempt", attempt,
|
||||
"maxAttempts", maxAttempts,
|
||||
"delay", jitter,
|
||||
"error", err)
|
||||
|
||||
select {
|
||||
case <-time.After(jitter):
|
||||
metrics.ObserveSQLiteLockWait(operation, jitter.Seconds())
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
}
|
||||
|
||||
delay *= 2
|
||||
}
|
||||
|
||||
return lastErr
|
||||
}
|
||||
|
||||
// isRetryableError reports whether err should be retried: its message
// contains "database is locked" or "busy", or it wraps sql.ErrConnDone.
// A nil error is never retryable.
func isRetryableError(err error) bool {
	if err == nil {
		return false
	}

	msg := err.Error()
	switch {
	case strings.Contains(msg, "database is locked"):
		return true
	case strings.Contains(msg, "busy"):
		return true
	default:
		return errors.Is(err, sql.ErrConnDone)
	}
}
|
||||
Loading…
x
Reference in New Issue
Block a user