Skip to content

Commit 41b6064

Browse files
Startup retry (#51)
* start up retry * update * update test * update error message * add retriable errors
1 parent 2198acc commit 41b6064

File tree

5 files changed

+298
-8
lines changed

5 files changed

+298
-8
lines changed

azureappconfiguration/azureappconfiguration.go

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"strings"
2727
"sync"
2828
"sync/atomic"
29+
"time"
2930

3031
"github.com/Azure/AppConfiguration-GoProvider/azureappconfiguration/internal/jsonc"
3132
"github.com/Azure/AppConfiguration-GoProvider/azureappconfiguration/internal/refresh"
@@ -140,7 +141,7 @@ func Load(ctx context.Context, authentication AuthenticationOptions, options *Op
140141
}
141142
}
142143

143-
if err := azappcfg.load(ctx); err != nil {
144+
if err := azappcfg.startupWithRetry(ctx, options.StartupOptions.Timeout, azappcfg.load); err != nil {
144145
return nil, err
145146
}
146147
// Set the initial load finished flag
@@ -681,6 +682,59 @@ func (azappcfg *AzureAppConfiguration) executeFailoverPolicy(ctx context.Context
681682
return fmt.Errorf("failed to get settings from all clients: %v", errors)
682683
}
683684

685+
// startupWithRetry implements retry logic for startup loading with timeout and exponential backoff
686+
func (azappcfg *AzureAppConfiguration) startupWithRetry(ctx context.Context, timeout time.Duration, operation func(context.Context) error) error {
687+
// If no timeout is specified, use the default startup timeout
688+
if timeout <= 0 {
689+
timeout = defaultStartupTimeout
690+
}
691+
692+
// Create a context with timeout for the entire startup process
693+
startupCtx, cancel := context.WithTimeout(ctx, timeout)
694+
defer cancel()
695+
696+
attempt := 0
697+
startTime := time.Now()
698+
699+
for {
700+
attempt++
701+
702+
// Try to load with the current context
703+
err := operation(startupCtx)
704+
if err == nil {
705+
return nil
706+
}
707+
708+
// Check if the error is retriable
709+
if !(isFailoverable(err) ||
710+
strings.Contains(err.Error(), "no client is available") ||
711+
strings.Contains(err.Error(), "failed to get settings from all clients")) {
712+
return fmt.Errorf("load from Azure App Configuration failed with non-retriable error: %w", err)
713+
}
714+
715+
// Calculate backoff duration
716+
timeElapsed := time.Since(startTime)
717+
backoffDuration := getFixedBackoffDuration(timeElapsed)
718+
if backoffDuration == 0 {
719+
backoffDuration = calculateBackoffDuration(attempt)
720+
}
721+
722+
// Check if we have enough time left to wait and retry
723+
timeRemaining := timeout - timeElapsed
724+
if timeRemaining <= backoffDuration {
725+
return fmt.Errorf("load from Azure App Configuration failed after %d attempts within timeout %v: %w", attempt, timeout, err)
726+
}
727+
728+
// Wait for the backoff duration before retrying
729+
select {
730+
case <-startupCtx.Done():
731+
return fmt.Errorf("load from Azure App Configuration timed out: %w", startupCtx.Err())
732+
case <-time.After(backoffDuration):
733+
// Continue to next retry attempt
734+
}
735+
}
736+
}
737+
684738
func (azappcfg *AzureAppConfiguration) trimPrefix(key string) string {
685739
result := key
686740
for _, prefix := range azappcfg.trimPrefixes {

azureappconfiguration/client_manager.go

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -335,17 +335,17 @@ func (client *configurationClientWrapper) updateBackoffStatus(success bool) {
335335
client.backOffEndTime = time.Time{}
336336
} else {
337337
client.failedAttempts++
338-
client.backOffEndTime = time.Now().Add(client.getBackoffDuration())
338+
client.backOffEndTime = time.Now().Add(calculateBackoffDuration(client.failedAttempts))
339339
}
340340
}
341341

342-
func (client *configurationClientWrapper) getBackoffDuration() time.Duration {
343-
if client.failedAttempts <= 1 {
342+
func calculateBackoffDuration(failedAttempts int) time.Duration {
343+
if failedAttempts <= 1 {
344344
return minBackoffDuration
345345
}
346346

347347
// Cap the exponent to prevent overflow
348-
exponent := math.Min(float64(client.failedAttempts-1), float64(safeShiftLimit))
348+
exponent := math.Min(float64(failedAttempts-1), float64(safeShiftLimit))
349349
calculatedMilliseconds := float64(minBackoffDuration.Milliseconds()) * math.Pow(2, exponent)
350350
if calculatedMilliseconds > float64(maxBackoffDuration.Milliseconds()) || calculatedMilliseconds <= 0 {
351351
calculatedMilliseconds = float64(maxBackoffDuration.Milliseconds())
@@ -355,6 +355,21 @@ func (client *configurationClientWrapper) getBackoffDuration() time.Duration {
355355
return jitter(calculatedDuration)
356356
}
357357

358+
func getFixedBackoffDuration(timeElapsed time.Duration) time.Duration {
359+
if timeElapsed < time.Second*100 {
360+
return time.Second * 5
361+
}
362+
if timeElapsed < time.Second*200 {
363+
return time.Second * 10
364+
}
365+
366+
if timeElapsed < time.Second*600 {
367+
return minBackoffDuration
368+
}
369+
370+
return 0
371+
}
372+
358373
func jitter(duration time.Duration) time.Duration {
359374
// Calculate the amount of jitter to add to the duration
360375
jitter := float64(duration) * jitterRatio

azureappconfiguration/constants.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,3 +69,8 @@ const (
6969
jitterRatio float64 = 0.25
7070
safeShiftLimit int = 63
7171
)
72+
73+
// Startup constants
74+
const (
75+
defaultStartupTimeout time.Duration = 100 * time.Second
76+
)

azureappconfiguration/failover_test.go

Lines changed: 210 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"fmt"
1010
"net"
1111
"net/http"
12+
"strings"
1213
"testing"
1314
"time"
1415

@@ -664,17 +665,223 @@ func TestClientWrapper_GetBackoffDuration(t *testing.T) {
664665

665666
// First failure should return minimum backoff duration
666667
client.failedAttempts = 1
667-
duration := client.getBackoffDuration()
668+
duration := calculateBackoffDuration(client.failedAttempts)
668669
assert.True(t, duration >= minBackoffDuration)
669670
assert.True(t, duration <= minBackoffDuration*2) // Account for jitter
670671

671672
// Multiple failures should increase duration exponentially
672673
client.failedAttempts = 3
673-
duration3 := client.getBackoffDuration()
674+
duration3 := calculateBackoffDuration(client.failedAttempts)
674675
assert.True(t, duration3 > duration)
675676

676677
// Very high failure count should be capped at max duration
677678
client.failedAttempts = 100
678-
durationMax := client.getBackoffDuration()
679+
durationMax := calculateBackoffDuration(client.failedAttempts)
679680
assert.True(t, durationMax <= maxBackoffDuration*2) // Account for jitter
680681
}
682+
683+
// Test startupWithRetry with successful operation on first attempt
684+
func TestStartupWithRetry_Success_FirstAttempt(t *testing.T) {
685+
mockClientManager := new(mockClientManager)
686+
687+
azappcfg := &AzureAppConfiguration{
688+
clientManager: mockClientManager,
689+
keyValues: make(map[string]any),
690+
featureFlags: make(map[string]any),
691+
kvSelectors: []Selector{{KeyFilter: "*", LabelFilter: "\x00"}},
692+
}
693+
694+
// Create a successful operation that simply returns nil
695+
operation := func(ctx context.Context) error {
696+
return nil // Success on first attempt
697+
}
698+
699+
ctx := context.Background()
700+
err := azappcfg.startupWithRetry(ctx, 10*time.Second, operation)
701+
702+
assert.NoError(t, err)
703+
}
704+
705+
// Test startupWithRetry with retry after retriable error
706+
func TestStartupWithRetry_Success_AfterRetriableError(t *testing.T) {
707+
mockClientManager := new(mockClientManager)
708+
709+
azappcfg := &AzureAppConfiguration{
710+
clientManager: mockClientManager,
711+
keyValues: make(map[string]any),
712+
featureFlags: make(map[string]any),
713+
kvSelectors: []Selector{{KeyFilter: "*", LabelFilter: "\x00"}},
714+
}
715+
716+
callCount := 0
717+
retriableError := &azcore.ResponseError{StatusCode: http.StatusServiceUnavailable}
718+
719+
// Create an operation that fails first, then succeeds
720+
operation := func(ctx context.Context) error {
721+
callCount++
722+
if callCount == 1 {
723+
return retriableError // Fail on first attempt
724+
}
725+
return nil // Success on second attempt
726+
}
727+
728+
ctx := context.Background()
729+
err := azappcfg.startupWithRetry(ctx, 10*time.Second, operation)
730+
731+
assert.NoError(t, err)
732+
assert.Equal(t, 2, callCount, "Operation should be called twice")
733+
}
734+
735+
// Test startupWithRetry with non-retriable error
736+
func TestStartupWithRetry_NonRetriableError(t *testing.T) {
737+
mockClientManager := new(mockClientManager)
738+
739+
azappcfg := &AzureAppConfiguration{
740+
clientManager: mockClientManager,
741+
keyValues: make(map[string]any),
742+
featureFlags: make(map[string]any),
743+
kvSelectors: []Selector{{KeyFilter: "*", LabelFilter: "\x00"}},
744+
}
745+
746+
callCount := 0
747+
nonRetriableError := &azcore.ResponseError{StatusCode: http.StatusBadRequest}
748+
749+
// Create an operation that fails with non-retriable error
750+
operation := func(ctx context.Context) error {
751+
callCount++
752+
return nonRetriableError // Non-retriable error
753+
}
754+
755+
ctx := context.Background()
756+
err := azappcfg.startupWithRetry(ctx, 10*time.Second, operation)
757+
758+
assert.Error(t, err)
759+
assert.Contains(t, err.Error(), "load from Azure App Configuration failed with non-retriable error")
760+
assert.Equal(t, 1, callCount, "Operation should be called only once for non-retriable error")
761+
}
762+
763+
// Test startupWithRetry with timeout
764+
func TestStartupWithRetry_Timeout(t *testing.T) {
765+
mockClientManager := new(mockClientManager)
766+
767+
azappcfg := &AzureAppConfiguration{
768+
clientManager: mockClientManager,
769+
keyValues: make(map[string]any),
770+
featureFlags: make(map[string]any),
771+
kvSelectors: []Selector{{KeyFilter: "*", LabelFilter: "\x00"}},
772+
}
773+
774+
callCount := 0
775+
retriableError := &azcore.ResponseError{StatusCode: http.StatusServiceUnavailable}
776+
777+
// Create an operation that always fails with retriable error
778+
operation := func(ctx context.Context) error {
779+
callCount++
780+
return retriableError // Always fail
781+
}
782+
783+
ctx := context.Background()
784+
// Use a very short timeout to trigger timeout quickly
785+
err := azappcfg.startupWithRetry(ctx, 100*time.Millisecond, operation)
786+
787+
assert.Error(t, err)
788+
assert.True(t,
789+
err.Error() == "startup timeout reached after 100ms" ||
790+
err.Error() == fmt.Sprintf("load from Azure App Configuration failed after %d attempts within timeout 100ms: %v", callCount, retriableError),
791+
"Error should indicate timeout or max attempts reached: %v", err)
792+
assert.True(t, callCount >= 1, "Operation should be called at least once")
793+
}
794+
795+
// Test startupWithRetry with context cancellation during backoff
796+
func TestStartupWithRetry_ContextCancelledDuringBackoff(t *testing.T) {
797+
mockClientManager := new(mockClientManager)
798+
799+
azappcfg := &AzureAppConfiguration{
800+
clientManager: mockClientManager,
801+
keyValues: make(map[string]any),
802+
featureFlags: make(map[string]any),
803+
kvSelectors: []Selector{{KeyFilter: "*", LabelFilter: "\x00"}},
804+
}
805+
806+
callCount := 0
807+
retriableError := &azcore.ResponseError{StatusCode: http.StatusServiceUnavailable}
808+
809+
// Create an operation that fails with retriable error
810+
operation := func(ctx context.Context) error {
811+
callCount++
812+
return retriableError
813+
}
814+
815+
ctx, cancel := context.WithCancel(context.Background())
816+
817+
// Cancel the context after a short delay to simulate cancellation during backoff
818+
go func() {
819+
time.Sleep(50 * time.Millisecond)
820+
cancel()
821+
}()
822+
823+
err := azappcfg.startupWithRetry(ctx, 10*time.Second, operation)
824+
825+
assert.Error(t, err)
826+
assert.Contains(t, err.Error(), "load from Azure App Configuration timed out: context canceled")
827+
}
828+
829+
// Test startupWithRetry with default timeout when zero timeout provided
830+
func TestStartupWithRetry_DefaultTimeout(t *testing.T) {
831+
mockClientManager := new(mockClientManager)
832+
833+
azappcfg := &AzureAppConfiguration{
834+
clientManager: mockClientManager,
835+
keyValues: make(map[string]any),
836+
featureFlags: make(map[string]any),
837+
kvSelectors: []Selector{{KeyFilter: "*", LabelFilter: "\x00"}},
838+
}
839+
840+
callCount := 0
841+
// Create an operation that succeeds
842+
operation := func(ctx context.Context) error {
843+
callCount++
844+
return nil
845+
}
846+
847+
ctx := context.Background()
848+
// Pass zero timeout to test default timeout usage
849+
err := azappcfg.startupWithRetry(ctx, 0, operation)
850+
851+
assert.NoError(t, err)
852+
assert.Equal(t, 1, callCount, "Operation should be called once")
853+
}
854+
855+
// Test startupWithRetry with insufficient time remaining for retry
856+
func TestStartupWithRetry_InsufficientTimeForRetry(t *testing.T) {
857+
mockClientManager := new(mockClientManager)
858+
859+
azappcfg := &AzureAppConfiguration{
860+
clientManager: mockClientManager,
861+
keyValues: make(map[string]any),
862+
featureFlags: make(map[string]any),
863+
kvSelectors: []Selector{{KeyFilter: "*", LabelFilter: "\x00"}},
864+
}
865+
866+
callCount := 0
867+
retriableError := &azcore.ResponseError{StatusCode: http.StatusServiceUnavailable}
868+
869+
// Create an operation that always fails
870+
operation := func(ctx context.Context) error {
871+
callCount++
872+
// Add some delay to consume time
873+
time.Sleep(50 * time.Millisecond)
874+
return retriableError
875+
}
876+
877+
ctx := context.Background()
878+
// Use a short timeout that will be consumed by the first failure and not allow retry
879+
err := azappcfg.startupWithRetry(ctx, 80*time.Millisecond, operation)
880+
881+
assert.Error(t, err)
882+
assert.True(t,
883+
err.Error() == "startup timeout reached after 80ms" ||
884+
strings.Contains(err.Error(), "load from Azure App Configuration failed after") && strings.Contains(err.Error(), "attempts within timeout"),
885+
"Error should indicate timeout or insufficient time: %v", err)
886+
assert.True(t, callCount >= 1, "Operation should be called at least once")
887+
}

azureappconfiguration/options.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@ type Options struct {
4343
// LoadBalancingEnabled specifies whether to enable load balancing across multiple replicas of the Azure App Configuration service.
4444
// It defaults to false.
4545
LoadBalancingEnabled bool
46+
47+
// StartupOptions is used when initially loading data into the configuration provider.
48+
StartupOptions StartupOptions
4649
}
4750

4851
// AuthenticationOptions contains parameters for authenticating with the Azure App Configuration service.
@@ -159,3 +162,9 @@ type ConstructionOptions struct {
159162
// If not provided, the default separator "." will be used.
160163
Separator string
161164
}
165+
166+
// StartupOptions is used when initially loading data into the configuration provider.
167+
type StartupOptions struct {
168+
// Timeout specifies the amount of time allowed to load data from Azure App Configuration on startup.
169+
Timeout time.Duration
170+
}

0 commit comments

Comments
 (0)