Skip to content

Commit a091f47

Browse files
authored
CBG-3947 Do not remove loaded database on transient fetch error (#7035)
1 parent b4e7b60 commit a091f47

File tree

6 files changed

+158
-54
lines changed

6 files changed

+158
-54
lines changed

base/error.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -244,8 +244,12 @@ func IsTemporaryKvError(err error) bool {
244244
return false
245245
}
246246
// define list of temporary errors
247-
temporaryKVError := []error{ErrTimeout, gocb.ErrAmbiguousTimeout, gocb.ErrUnambiguousTimeout,
248-
gocb.ErrOverload, gocb.ErrTemporaryFailure, gocb.ErrCircuitBreakerOpen}
247+
temporaryKVError := []error{
248+
ErrTimeout, // Sync Gateway client-side timeout
249+
gocb.ErrTimeout, // SDK op timeout. Wrapped by gocb.ErrAmbiguousTimeout, gocb.ErrUnambiguousTimeout,
250+
gocb.ErrOverload, // SDK client-side pipeline queue full, request was not submitted to server
251+
gocb.ErrTemporaryFailure, // Couchbase Server returned temporary failure error
252+
gocb.ErrCircuitBreakerOpen} // SDK client-side circuit breaker blocked request
249253

250254
// iterate through to check incoming error is one of them
251255
for _, tempKVErr := range temporaryKVError {

rest/adminapitest/admin_api_test.go

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1777,6 +1777,91 @@ func TestMultipleBucketWithBadDbConfigScenario3(t *testing.T) {
17771777

17781778
}
17791779

1780+
// TestConfigPollingRemoveDatabase:
1781+
//
1782+
// Validates db is removed when polling detects that the config is not found
1783+
func TestConfigPollingRemoveDatabase(t *testing.T) {
1784+
1785+
base.SetUpTestLogging(t, base.LevelInfo, base.KeyConfig)
1786+
testCases := []struct {
1787+
useXattrConfig bool
1788+
}{
1789+
{
1790+
useXattrConfig: false,
1791+
},
1792+
{
1793+
useXattrConfig: true,
1794+
},
1795+
}
1796+
for _, testCase := range testCases {
1797+
t.Run(fmt.Sprintf("xattrConfig_%v", testCase.useXattrConfig), func(t *testing.T) {
1798+
1799+
rt := rest.NewRestTester(t, &rest.RestTesterConfig{
1800+
CustomTestBucket: base.GetTestBucket(t),
1801+
PersistentConfig: true,
1802+
MutateStartupConfig: func(config *rest.StartupConfig) {
1803+
// configure the interval time to pick up new configs from the bucket to every 50 milliseconds
1804+
config.Bootstrap.ConfigUpdateFrequency = base.NewConfigDuration(50 * time.Millisecond)
1805+
},
1806+
DatabaseConfig: nil,
1807+
UseXattrConfig: testCase.useXattrConfig,
1808+
})
1809+
defer rt.Close()
1810+
1811+
ctx := base.TestCtx(t)
1812+
// create a new db
1813+
dbName := "db1"
1814+
dbConfig := rt.NewDbConfig()
1815+
dbConfig.Name = dbName
1816+
dbConfig.BucketConfig.Bucket = base.StringPtr(rt.CustomTestBucket.GetName())
1817+
resp := rt.CreateDatabase(dbName, dbConfig)
1818+
rest.RequireStatus(t, resp, http.StatusCreated)
1819+
1820+
// Validate that db is loaded
1821+
_, err := rt.ServerContext().GetDatabase(ctx, dbName)
1822+
require.NoError(t, err)
1823+
1824+
// Force timeouts - dev-time only test enhancement to validate CBG-3947, requires manual "leaky bootstrap" handling
1825+
// To enable:
1826+
// - Add "var ForceTimeouts bool" to bootstrap.go
1827+
// - In CouchbaseCluster.GetMetadataDocument, add the following after loadConfig:
1828+
// if ForceTimeouts {
1829+
// return 0, gocb.ErrTimeout
1830+
// }
1831+
// - enable the code block below
1832+
/*
1833+
base.ForceTimeouts = true
1834+
1835+
// Wait to ensure database doesn't disappear
1836+
err = rt.WaitForConditionWithOptions(func() bool {
1837+
_, err := rt.ServerContext().GetActiveDatabase(dbName)
1838+
return errors.Is(err, base.ErrNotFound)
1839+
1840+
}, 200, 50)
1841+
require.Error(t, err)
1842+
1843+
base.ForceTimeouts = false
1844+
*/
1845+
1846+
// Delete the config directly
1847+
rt.RemoveDbConfigFromBucket("db1", rt.CustomTestBucket.GetName())
1848+
1849+
// assert that the database is unloaded
1850+
err = rt.WaitForConditionWithOptions(func() bool {
1851+
_, err := rt.ServerContext().GetActiveDatabase(dbName)
1852+
return errors.Is(err, base.ErrNotFound)
1853+
1854+
}, 200, 1000)
1855+
require.NoError(t, err)
1856+
1857+
// assert that a request to the database fails with correct error message
1858+
resp = rt.SendAdminRequest(http.MethodGet, "/db1/_config", "")
1859+
rest.RequireStatus(t, resp, http.StatusNotFound)
1860+
assert.Contains(t, resp.Body.String(), "no such database")
1861+
})
1862+
}
1863+
}
1864+
17801865
func TestResyncStopUsingDCPStream(t *testing.T) {
17811866
if base.UnitTestUrlIsWalrus() {
17821867
// This test requires a gocb bucket

rest/config.go

Lines changed: 62 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1629,17 +1629,26 @@ func (sc *ServerContext) fetchAndLoadConfigs(ctx context.Context, isInitialStart
16291629
sc.lock.Lock()
16301630
defer sc.lock.Unlock()
16311631
for _, dbName := range deletedDatabases {
1632+
dbc, ok := sc.databases_[dbName]
1633+
if !ok {
1634+
base.DebugfCtx(ctx, base.KeyConfig, "Database %q already removed from server context after acquiring write lock - do not need to remove not removing database", base.MD(dbName))
1635+
continue
1636+
}
16321637
// It's possible that the "deleted" database was not written to the server until after sc.FetchConfigs had returned...
16331638
// we'll need to pay for the cost of getting the config again now that we've got the write lock to double-check this db is definitely ok to remove...
1634-
found, _, err := sc._fetchDatabase(ctx, dbName)
1635-
if err != nil {
1636-
base.InfofCtx(ctx, base.KeyConfig, "Error fetching config for database %q to check whether we need to remove it: %v", dbName, err)
1639+
found, _, getConfigErr := sc._fetchDatabaseFromBucket(ctx, dbc.Bucket.GetName(), dbName)
1640+
if found && getConfigErr == nil {
1641+
base.DebugfCtx(ctx, base.KeyConfig, "Found config for database %q after acquiring write lock - not removing database", base.MD(dbName))
1642+
continue
1643+
}
1644+
if base.IsTemporaryKvError(getConfigErr) {
1645+
base.InfofCtx(ctx, base.KeyConfig, "Transient error fetching config for database %q to check whether we need to remove it, will not be removed: %v", base.MD(dbName), getConfigErr)
1646+
continue
16371647
}
1648+
16381649
if !found {
1639-
base.InfofCtx(ctx, base.KeyConfig, "Database %q was running on this node, but config was not found on the server - removing database", base.MD(dbName))
1650+
base.InfofCtx(ctx, base.KeyConfig, "Database %q was running on this node, but config was not found on the server - removing database (%v)", base.MD(dbName), getConfigErr)
16401651
sc._removeDatabase(ctx, dbName)
1641-
} else {
1642-
base.DebugfCtx(ctx, base.KeyConfig, "Found config for database %q after acquiring write lock - not removing database", base.MD(dbName))
16431652
}
16441653
}
16451654

@@ -1747,56 +1756,60 @@ func (sc *ServerContext) fetchDatabase(ctx context.Context, dbName string) (foun
17471756
return sc._fetchDatabase(ctx, dbName)
17481757
}
17491758

1750-
func (sc *ServerContext) _fetchDatabase(ctx context.Context, dbName string) (found bool, dbConfig *DatabaseConfig, err error) {
1751-
// loop code moved to foreachDbConfig
1752-
var cnf DatabaseConfig
1753-
callback := func(bucket string) (exit bool, err error) {
1754-
cas, err := sc.BootstrapContext.GetConfig(ctx, bucket, sc.Config.Bootstrap.ConfigGroupID, dbName, &cnf)
1755-
if err == base.ErrNotFound {
1756-
base.DebugfCtx(ctx, base.KeyConfig, "%q did not contain config in group %q", bucket, sc.Config.Bootstrap.ConfigGroupID)
1757-
return false, err
1758-
}
1759-
if err != nil {
1760-
base.DebugfCtx(ctx, base.KeyConfig, "unable to fetch config in group %q from bucket %q: %v", sc.Config.Bootstrap.ConfigGroupID, bucket, err)
1761-
return false, err
1762-
}
1759+
func (sc *ServerContext) _fetchDatabaseFromBucket(ctx context.Context, bucket string, dbName string) (found bool, cnf DatabaseConfig, err error) {
17631760

1764-
if cnf.Name == "" {
1765-
cnf.Name = bucket
1766-
}
1761+
cas, err := sc.BootstrapContext.GetConfig(ctx, bucket, sc.Config.Bootstrap.ConfigGroupID, dbName, &cnf)
1762+
if errors.Is(err, base.ErrNotFound) {
1763+
base.DebugfCtx(ctx, base.KeyConfig, "%q did not contain config in group %q", bucket, sc.Config.Bootstrap.ConfigGroupID)
1764+
return false, cnf, err
1765+
}
1766+
if err != nil {
1767+
base.DebugfCtx(ctx, base.KeyConfig, "unable to fetch config in group %q from bucket %q: %v", sc.Config.Bootstrap.ConfigGroupID, bucket, err)
1768+
return false, cnf, err
1769+
}
17671770

1768-
if cnf.Name != dbName {
1769-
base.TracefCtx(ctx, base.KeyConfig, "%q did not contain config in group %q for db %q", bucket, sc.Config.Bootstrap.ConfigGroupID, dbName)
1770-
return false, err
1771-
}
1771+
if cnf.Name == "" {
1772+
cnf.Name = bucket
1773+
}
17721774

1773-
cnf.cfgCas = cas
1775+
if cnf.Name != dbName {
1776+
base.TracefCtx(ctx, base.KeyConfig, "%q did not contain config in group %q for db %q", bucket, sc.Config.Bootstrap.ConfigGroupID, dbName)
1777+
return false, cnf, err
1778+
}
17741779

1775-
// TODO: This code is mostly copied from FetchConfigs, move into shared function with DbConfig REST API work?
1780+
cnf.cfgCas = cas
17761781

1777-
// inherit properties the bootstrap config
1778-
cnf.CACertPath = sc.Config.Bootstrap.CACertPath
1782+
// inherit properties the bootstrap config
1783+
cnf.CACertPath = sc.Config.Bootstrap.CACertPath
17791784

1780-
// We need to check for corruption in the database config (CC. CBG-3292). If the fetched config doesn't match the
1781-
// bucket name we got the config from we need to maker this db context as corrupt. Then remove the context and
1782-
// in memory representation on the server context.
1783-
if bucket != cnf.GetBucketName() {
1784-
sc._handleInvalidDatabaseConfig(ctx, bucket, cnf)
1785-
return true, fmt.Errorf("mismatch in persisted database bucket name %q vs the actual bucket name %q. Please correct db %q's config, groupID %q.", base.MD(cnf.Bucket), base.MD(bucket), base.MD(cnf.Name), base.MD(sc.Config.Bootstrap.ConfigGroupID))
1786-
}
1787-
bucketCopy := bucket
1788-
// no corruption detected carry on as usual
1789-
cnf.Bucket = &bucketCopy
1785+
// We need to check for corruption in the database config (CC. CBG-3292). If the fetched config doesn't match the
1786+
// bucket name we got the config from we need to maker this db context as corrupt. Then remove the context and
1787+
// in memory representation on the server context.
1788+
if bucket != cnf.GetBucketName() {
1789+
sc._handleInvalidDatabaseConfig(ctx, bucket, cnf)
1790+
return true, cnf, fmt.Errorf("mismatch in persisted database bucket name %q vs the actual bucket name %q. Please correct db %q's config, groupID %q.", base.MD(cnf.Bucket), base.MD(bucket), base.MD(cnf.Name), base.MD(sc.Config.Bootstrap.ConfigGroupID))
1791+
}
1792+
bucketCopy := bucket
1793+
// no corruption detected carry on as usual
1794+
cnf.Bucket = &bucketCopy
17901795

1791-
// any authentication fields defined on the dbconfig take precedence over any in the bootstrap config
1792-
if cnf.Username == "" && cnf.Password == "" && cnf.CertPath == "" && cnf.KeyPath == "" {
1793-
cnf.Username = sc.Config.Bootstrap.Username
1794-
cnf.Password = sc.Config.Bootstrap.Password
1795-
cnf.CertPath = sc.Config.Bootstrap.X509CertPath
1796-
cnf.KeyPath = sc.Config.Bootstrap.X509KeyPath
1797-
}
1798-
base.TracefCtx(ctx, base.KeyConfig, "Got database config %s for bucket %q with cas %d and groupID %q", base.MD(dbName), base.MD(bucket), cas, base.MD(sc.Config.Bootstrap.ConfigGroupID))
1799-
return true, nil
1796+
// any authentication fields defined on the dbconfig take precedence over any in the bootstrap config
1797+
if cnf.Username == "" && cnf.Password == "" && cnf.CertPath == "" && cnf.KeyPath == "" {
1798+
cnf.Username = sc.Config.Bootstrap.Username
1799+
cnf.Password = sc.Config.Bootstrap.Password
1800+
cnf.CertPath = sc.Config.Bootstrap.X509CertPath
1801+
cnf.KeyPath = sc.Config.Bootstrap.X509KeyPath
1802+
}
1803+
base.TracefCtx(ctx, base.KeyConfig, "Got database config %s for bucket %q with cas %d and groupID %q", base.MD(dbName), base.MD(bucket), cas, base.MD(sc.Config.Bootstrap.ConfigGroupID))
1804+
return true, cnf, nil
1805+
}
1806+
1807+
func (sc *ServerContext) _fetchDatabase(ctx context.Context, dbName string) (found bool, dbConfig *DatabaseConfig, err error) {
1808+
var cnf DatabaseConfig
1809+
callback := func(bucket string) (exit bool, callbackErr error) {
1810+
var foundInBucket bool
1811+
foundInBucket, cnf, callbackErr = sc._fetchDatabaseFromBucket(ctx, bucket, dbName)
1812+
return foundInBucket, callbackErr
18001813
}
18011814

18021815
err = sc.findBucketWithCallback(callback)

rest/config_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2982,7 +2982,7 @@ func TestInvalidDbConfigNoLongerPresentInBucket(t *testing.T) {
29822982
}, time.Second*10, time.Millisecond*100)
29832983

29842984
// remove the invalid config from the bucket
2985-
rt.DeleteDbConfigInBucket(dbName, realBucketName)
2985+
rt.RemoveDbConfigFromBucket(dbName, realBucketName)
29862986

29872987
// force reload of configs from bucket
29882988
rt.ServerContext().ForceDbConfigsReload(t, ctx)
@@ -3054,7 +3054,7 @@ func TestNotFoundOnInvalidDatabase(t *testing.T) {
30543054
assert.Contains(t, resp.Body.String(), "You must update database config immediately")
30553055

30563056
// delete the invalid db config to force the not found error
3057-
rt.DeleteDbConfigInBucket(dbConfig.Name, realBucketName)
3057+
rt.RemoveDbConfigFromBucket(dbConfig.Name, realBucketName)
30583058

30593059
// fix the bucket name and try fix corrupt db through create db endpoint
30603060
dbConfig.Bucket = &realBucketName

rest/utilities_testing.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ type RestTesterConfig struct {
7676
syncGatewayVersion *base.ComparableBuildVersion // alternate version of Sync Gateway to use on startup
7777
allowDbConfigEnvVars *bool
7878
maxConcurrentRevs *int
79+
UseXattrConfig bool
7980
}
8081

8182
type collectionConfiguration uint8
@@ -234,6 +235,7 @@ func (rt *RestTester) Bucket() base.Bucket {
234235
sc.Bootstrap.ServerTLSSkipVerify = base.BoolPtr(base.TestTLSSkipVerify())
235236
sc.Unsupported.Serverless.Enabled = &rt.serverless
236237
sc.Unsupported.AllowDbConfigEnvVars = rt.RestTesterConfig.allowDbConfigEnvVars
238+
sc.Unsupported.UseXattrConfig = &rt.UseXattrConfig
237239
sc.Replicator.MaxConcurrentRevs = rt.RestTesterConfig.maxConcurrentRevs
238240
if rt.serverless {
239241
if !rt.PersistentConfig {

rest/utilities_testing_resttester.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -323,7 +323,7 @@ func (rt *RestTester) InsertDbConfigToBucket(config *DatabaseConfig, bucketName
323323
require.NoError(rt.TB(), insertErr)
324324
}
325325

326-
func (rt *RestTester) DeleteDbConfigInBucket(dbName, bucketName string) {
326+
func (rt *RestTester) RemoveDbConfigFromBucket(dbName string, bucketName string) {
327327
deleteErr := rt.ServerContext().BootstrapContext.DeleteConfig(base.TestCtx(rt.TB()), bucketName, rt.ServerContext().Config.Bootstrap.ConfigGroupID, dbName)
328328
require.NoError(rt.TB(), deleteErr)
329329
}

0 commit comments

Comments
 (0)