Description
Component(s)
extension/storage/filestorage
Is your feature request related to a problem? Please describe.
When the filestorage extension is used as a persistent queue for an exporter, the underlying bbolt database file may become corrupted if the host machine experiences a power loss. This appears to be tied to the NoSync bbolt option being set to true by default.
Here is an example panic from v0.72.0 when the collector is restarted after a power loss (compaction is enabled on start):
2023-03-01T03:40:49.513Z info extensions/extensions.go:41 Starting extensions...
2023-03-01T03:40:49.513Z info extensions/extensions.go:44 Extension is starting... {"kind": "extension", "name": "memory_ballast"}
2023-03-01T03:40:49.520Z info ballastextension@v0.72.0/memory_ballast.go:52 Setting memory ballast {"kind": "extension", "name": "memory_ballast", "MiBs": 512}
2023-03-01T03:40:49.520Z info extensions/extensions.go:48 Extension started. {"kind": "extension", "name": "memory_ballast"}
2023-03-01T03:40:49.520Z info extensions/extensions.go:44 Extension is starting... {"kind": "extension", "name": "file_storage"}
2023-03-01T03:40:49.520Z info extensions/extensions.go:48 Extension started. {"kind": "extension", "name": "file_storage"}
2023-03-01T03:40:50.903Z info filestorage/client.go:244 finished compaction {"kind": "extension", "name": "file_storage", "directory": "/var/lib/otelcol/file_storage/exporter_otlp__logs", "elapsed": 1.339773672}
2023-03-01T03:40:50.903Z info internal/persistent_storage.go:301 Fetching items left for dispatch by consumers {"kind": "exporter", "data_type": "logs", "name": "otlp", "queueName": "otlp-logs", "numberOfItems": 2}
2023-03-01T03:40:50.908Z info internal/persistent_storage.go:170 Moved items for dispatching back to queue {"kind": "exporter", "data_type": "logs", "name": "otlp", "queueName": "otlp-logs", "numberOfItems": 2}
panic: assertion failed: Page expected to be: 5381, but self identifies as 0
goroutine 1 [running]:
go.etcd.io/bbolt._assert(...)
go.etcd.io/bbolt@v1.3.7/db.go:1359
go.etcd.io/bbolt.(*page).fastCheck(0x7f855a736000, 0x1505)
go.etcd.io/bbolt@v1.3.7/page.go:57 +0x1df
go.etcd.io/bbolt.(*Tx).page(0x7f855a72c000?, 0x56fa140?)
go.etcd.io/bbolt@v1.3.7/tx.go:534 +0x8a
go.etcd.io/bbolt.(*Tx).forEachPageInternal(0x7f855951b000?, {0xc034f8e140?, 0x3, 0xa}, 0xc038ed47c8)
go.etcd.io/bbolt@v1.3.7/tx.go:546 +0x65
go.etcd.io/bbolt.(*Tx).forEachPageInternal(0x7f855927b000?, {0xc034f8e140?, 0x2, 0xa}, 0xc038ed47c8)
go.etcd.io/bbolt@v1.3.7/tx.go:555 +0xd1
go.etcd.io/bbolt.(*Tx).forEachPageInternal(0xc0000da6c0?, {0xc034f8e140?, 0x1, 0xa}, 0xc038ed47c8)
go.etcd.io/bbolt@v1.3.7/tx.go:555 +0xd1
go.etcd.io/bbolt.(*Tx).forEachPage(...)
go.etcd.io/bbolt@v1.3.7/tx.go:542
go.etcd.io/bbolt.(*Tx).checkBucket(0xc034fb0000?, 0xc034f8a1c0?, 0xc038ed4b68?, 0xc038ed4b38?, {0x76f4cf0?, 0xb64b530?}, 0xc0000da6c0?)
go.etcd.io/bbolt@v1.3.7/tx_check.go:83 +0x126
go.etcd.io/bbolt.(*Tx).checkBucket.func2({0x7f855a6bc020?, 0xc038ed4988?, 0xc034fb0000?})
go.etcd.io/bbolt@v1.3.7/tx_check.go:110 +0x93
go.etcd.io/bbolt.(*Bucket).ForEachBucket(0xc034fb0018, 0xc038ed49d0)
go.etcd.io/bbolt@v1.3.7/bucket.go:403 +0xb0
go.etcd.io/bbolt.(*Tx).checkBucket(0xc034fb0000, 0xc034fb0018, 0xc038ed4b68, 0xc038ed4b38, {0x76f4cf0?, 0xb64b530}, 0xc0000da6c0)
go.etcd.io/bbolt@v1.3.7/tx_check.go:108 +0x265
go.etcd.io/bbolt.(*DB).freepages(0x699ae4b?)
go.etcd.io/bbolt@v1.3.7/db.go:1181 +0x229
go.etcd.io/bbolt.(*DB).loadFreelist.func1()
go.etcd.io/bbolt@v1.3.7/db.go:412 +0xc5
sync.(*Once).doSlow(0xc000a0a1c8?, 0x10?)
sync/once.go:74 +0xc2
sync.(*Once).Do(...)
sync/once.go:65
go.etcd.io/bbolt.(*DB).loadFreelist(0xc000a0a000?)
go.etcd.io/bbolt@v1.3.7/db.go:408 +0x47
go.etcd.io/bbolt.Open({0xc034f901c0, 0x34}, 0x1?, 0xc038ed4e10)
go.etcd.io/bbolt@v1.3.7/db.go:290 +0x40c
github.com/open-telemetry/opentelemetry-collector-contrib/extension/storage/filestorage.newClient(0xc000c1b9d0, {0xc034f901c0?, 0xc038ed4fa8?}, 0x2540be400, 0xc0004747c0)
github.com/open-telemetry/opentelemetry-collector-contrib/extension/storage@v0.72.0/filestorage/client.go:62 +0xaf
github.com/open-telemetry/opentelemetry-collector-contrib/extension/storage/filestorage.(*localFileStorage).GetClient(0xc0005eeaf0, {0x0?, 0x0?}, 0x0?, {{0xc0010df408?, 0x0?}, {0x0?, 0x0?}}, {0x699b31b, 0x7})
github.com/open-telemetry/opentelemetry-collector-contrib/extension/storage@v0.72.0/filestorage/extension.go:65 +0x335
go.opentelemetry.io/collector/exporter/exporterhelper.toStorageClient({0x7714640, 0xc000078028}, {{0xc0010deba0?, 0x0?}, {0x0?, 0x0?}}, {0x77188f8?, 0xc000415200?}, {{0xc0010df408, 0x4}, ...}, ...)
go.opentelemetry.io/collector@v0.72.0/exporter/exporterhelper/queued_retry.go:146 +0x168
go.opentelemetry.io/collector/exporter/exporterhelper.(*queuedRetrySender).initializePersistentQueue(0xc000ae3dc0, {0x7714640, 0xc000078028}, {0x77188f8?, 0xc000415200?})
go.opentelemetry.io/collector@v0.72.0/exporter/exporterhelper/queued_retry.go:160 +0xe6
go.opentelemetry.io/collector/exporter/exporterhelper.(*queuedRetrySender).start(0xc000ae3dc0, {0x7714640?, 0xc000078028?}, {0x77188f8?, 0xc000415200?})
go.opentelemetry.io/collector@v0.72.0/exporter/exporterhelper/queued_retry.go:200 +0x45
go.opentelemetry.io/collector/exporter/exporterhelper.newBaseExporter.func1({0x7714640?, 0xc000078028?}, {0x77188f8?, 0xc000415200?})
go.opentelemetry.io/collector@v0.72.0/exporter/exporterhelper/common.go:177 +0xae
go.opentelemetry.io/collector/component.StartFunc.Start(...)
go.opentelemetry.io/collector/component@v0.72.0/component.go:84
go.opentelemetry.io/collector/service.(*pipelinesGraph).StartAll(0xc000475400?, {0x7714640, 0xc000078028}, {0x77188f8, 0xc000415200})
go.opentelemetry.io/collector@v0.72.0/service/graph.go:272 +0xb3
go.opentelemetry.io/collector/service.(*Service).Start(0xc000415180, {0x7714640, 0xc000078028})
go.opentelemetry.io/collector@v0.72.0/service/service.go:149 +0x2ac
go.opentelemetry.io/collector/otelcol.(*Collector).setupConfigurationComponents(0xc0008b9680, {0x7714640, 0xc000078028})
go.opentelemetry.io/collector@v0.72.0/otelcol/collector.go:182 +0x5eb
go.opentelemetry.io/collector/otelcol.(*Collector).Run(0xc0008b9680, {0x7714640, 0xc000078028})
go.opentelemetry.io/collector@v0.72.0/otelcol/collector.go:207 +0x65
go.opentelemetry.io/collector/otelcol.NewCommand.func1(0xc000ad0c00, {0x699c088?, 0x1?, 0x1?})
go.opentelemetry.io/collector@v0.72.0/otelcol/command.go:38 +0x96
github.com/spf13/cobra.(*Command).execute(0xc000ad0c00, {0xc000072030, 0x1, 0x1})
github.com/spf13/cobra@v1.6.1/command.go:916 +0x862
github.com/spf13/cobra.(*Command).ExecuteC(0xc000ad0c00)
github.com/spf13/cobra@v1.6.1/command.go:1044 +0x3bd
github.com/spf13/cobra.(*Command).Execute(...)
github.com/spf13/cobra@v1.6.1/command.go:968
main.runInteractive({{0xc000bd81e0, 0xc000bd9440, 0xc000bd8600, 0xc000b8fdd0, 0xc000bd9470}, {{0x69c424b, 0xf}, {0x6a4324b, 0x1f}, {0x6995b93, ...}}, ...})
github.com/open-telemetry/opentelemetry-collector-releases/contrib/main.go:31 +0x5d
main.run(...)
github.com/open-telemetry/opentelemetry-collector-releases/contrib/main_others.go:11
main.main()
github.com/open-telemetry/opentelemetry-collector-releases/contrib/main.go:24 +0x1d9
Describe the solution you'd like
The proposed solution is to expose the NoSync parameter from bbolt options as a parameter in the filestorage extension config so that users can enable fsync in the collector configuration file.
Something like:
file_storage:
  enable_fsync: true
Describe alternatives you've considered
I've built a customized version of the filestorage extension with one change to set NoSync to false and included this in a custom collector build. With this change I have not seen the bbolt panic again.
Additional context
No response