From 74d1d550514446312e86e85a057391c415e332be Mon Sep 17 00:00:00 2001 From: Daniel Adams Date: Thu, 5 Nov 2020 10:14:04 -0500 Subject: [PATCH] Feature: Exponential Backoff in Retry Middleware --- docs/content/middlewares/retry.md | 30 +++-- .../dynamic-configuration/docker-labels.yml | 1 + .../reference/dynamic-configuration/file.toml | 1 + .../reference/dynamic-configuration/file.yaml | 1 + .../reference/dynamic-configuration/kv-ref.md | 1 + .../marathon-labels.json | 1 + integration/fixtures/retry/backoff.toml | 45 ++++++++ integration/retry_test.go | 31 ++++++ pkg/config/dynamic/middlewares.go | 3 +- pkg/config/label/label_test.go | 8 +- pkg/middlewares/retry/retry.go | 103 ++++++++++++------ pkg/middlewares/retry/retry_test.go | 37 +++++++ 12 files changed, 218 insertions(+), 44 deletions(-) create mode 100644 integration/fixtures/retry/backoff.toml diff --git a/docs/content/middlewares/retry.md b/docs/content/middlewares/retry.md index 31cc9728e..4766c2ac0 100644 --- a/docs/content/middlewares/retry.md +++ b/docs/content/middlewares/retry.md @@ -9,17 +9,19 @@ TODO: add schema The Retry middleware is in charge of reissuing a request a given number of times to a backend server if that server does not reply. To be clear, as soon as the server answers, the middleware stops retrying, regardless of the response status. +The Retry middleware has an optional configuration for exponential backoff. ## Configuration Examples ```yaml tab="Docker" -# Retry to send request 4 times +# Retry to send request 4 times with exponential backoff labels: - "traefik.http.middlewares.test-retry.retry.attempts=4" + - "traefik.http.middlewares.test-retry.retry.initialinterval=100ms" ``` ```yaml tab="Kubernetes" -# Retry to send request 4 times +# Retry to send request 4 times with exponential backoff apiVersion: traefik.containo.us/v1alpha1 kind: Middleware metadata: @@ -27,45 +29,55 @@ metadata: spec: retry: attempts: 4 + initialInterval: 100ms ``` ```yaml tab="Consul Catalog" -# Retry to send request 4 times +# Retry to send request 4 times with exponential backoff - "traefik.http.middlewares.test-retry.retry.attempts=4" +- "traefik.http.middlewares.test-retry.retry.initialinterval=100ms" ``` ```json tab="Marathon" "labels": { - "traefik.http.middlewares.test-retry.retry.attempts": "4" + "traefik.http.middlewares.test-retry.retry.attempts": "4", + "traefik.http.middlewares.test-retry.retry.initialinterval": "100ms", } ``` ```yaml tab="Rancher" -# Retry to send request 4 times +# Retry to send request 4 times with exponential backoff labels: - "traefik.http.middlewares.test-retry.retry.attempts=4" + - "traefik.http.middlewares.test-retry.retry.initialinterval=100ms" ``` ```toml tab="File (TOML)" # Retry to send request 4 times [http.middlewares] [http.middlewares.test-retry.retry] - attempts = 4 + attempts = 4 + initialInterval = "100ms" ``` ```yaml tab="File (YAML)" -# Retry to send request 4 times +# Retry to send request 4 times with exponential backoff http: middlewares: test-retry: retry: - attempts: 4 + attempts: 4 + initialInterval: 100ms ``` ## Configuration Options -### `attempts` +### `attempts` _mandatory_ The `attempts` option defines how many times the request should be retried. + +### `initialInterval` + +The `initialInterval` option defines the first wait time in the exponential backoff series (provided in seconds or as a valid duration format, see [time.ParseDuration](https://golang.org/pkg/time/#ParseDuration)). The maximum interval is calculated as twice the `initialInterval`. If unspecified, requests will be retried immediately. diff --git a/docs/content/reference/dynamic-configuration/docker-labels.yml b/docs/content/reference/dynamic-configuration/docker-labels.yml index 0ff31abd7..3f5c05189 100644 --- a/docs/content/reference/dynamic-configuration/docker-labels.yml +++ b/docs/content/reference/dynamic-configuration/docker-labels.yml @@ -112,6 +112,7 @@ - "traefik.http.middlewares.middleware19.replacepathregex.regex=foobar" - "traefik.http.middlewares.middleware19.replacepathregex.replacement=foobar" - "traefik.http.middlewares.middleware20.retry.attempts=42" +- "traefik.http.middlewares.middleware20.retry.initialinterval=42" - "traefik.http.middlewares.middleware21.stripprefix.forceslash=true" - "traefik.http.middlewares.middleware21.stripprefix.prefixes=foobar, foobar" - "traefik.http.middlewares.middleware22.stripprefixregex.regex=foobar, foobar" diff --git a/docs/content/reference/dynamic-configuration/file.toml b/docs/content/reference/dynamic-configuration/file.toml index 37c7daafe..353cb7285 100644 --- a/docs/content/reference/dynamic-configuration/file.toml +++ b/docs/content/reference/dynamic-configuration/file.toml @@ -261,6 +261,7 @@ [http.middlewares.Middleware20] [http.middlewares.Middleware20.retry] attempts = 42 + initialInterval = 42 [http.middlewares.Middleware21] [http.middlewares.Middleware21.stripPrefix] prefixes = ["foobar", "foobar"] diff --git a/docs/content/reference/dynamic-configuration/file.yaml b/docs/content/reference/dynamic-configuration/file.yaml index cd8359311..456272ec1 100644 --- a/docs/content/reference/dynamic-configuration/file.yaml +++ b/docs/content/reference/dynamic-configuration/file.yaml @@ -298,6 +298,7 @@ http: Middleware20: retry: attempts: 42 + initialInterval: 42 Middleware21: stripPrefix: prefixes: diff --git a/docs/content/reference/dynamic-configuration/kv-ref.md b/docs/content/reference/dynamic-configuration/kv-ref.md index f7db9f449..64199fc3b 100644 --- a/docs/content/reference/dynamic-configuration/kv-ref.md +++ b/docs/content/reference/dynamic-configuration/kv-ref.md @@ -129,6 +129,7 @@ | `traefik/http/middlewares/Middleware19/replacePathRegex/regex` | `foobar` | | `traefik/http/middlewares/Middleware19/replacePathRegex/replacement` | `foobar` | | `traefik/http/middlewares/Middleware20/retry/attempts` | `42` | +| `traefik/http/middlewares/Middleware20/retry/initialInterval` | `42` | | `traefik/http/middlewares/Middleware21/stripPrefix/forceSlash` | `true` | | `traefik/http/middlewares/Middleware21/stripPrefix/prefixes/0` | `foobar` | | `traefik/http/middlewares/Middleware21/stripPrefix/prefixes/1` | `foobar` | diff --git a/docs/content/reference/dynamic-configuration/marathon-labels.json b/docs/content/reference/dynamic-configuration/marathon-labels.json index 99e111b50..1698cef1a 100644 --- a/docs/content/reference/dynamic-configuration/marathon-labels.json +++ b/docs/content/reference/dynamic-configuration/marathon-labels.json @@ -111,6 +111,7 @@ "traefik.http.middlewares.middleware19.replacepathregex.regex": "foobar", "traefik.http.middlewares.middleware19.replacepathregex.replacement": "foobar", "traefik.http.middlewares.middleware20.retry.attempts": "42", +"traefik.http.middlewares.middleware20.retry.initialinterval": "42", "traefik.http.middlewares.middleware21.stripprefix.forceslash": "true", "traefik.http.middlewares.middleware21.stripprefix.prefixes": "foobar, foobar", "traefik.http.middlewares.middleware22.stripprefixregex.regex": "foobar, foobar", diff --git a/integration/fixtures/retry/backoff.toml b/integration/fixtures/retry/backoff.toml new file mode 100644 index 000000000..3ef5c02b1 --- /dev/null +++ b/integration/fixtures/retry/backoff.toml @@ -0,0 +1,45 @@ +[global] + checkNewVersion = false + sendAnonymousUsage = false + +[log] + level = "DEBUG" + +[entryPoints] + [entryPoints.web] + address = ":8000" + +[api] + insecure = true + +[providers.file] + filename = "{{ .SelfFilename }}" + +## dynamic configuration ## + +[http.routers] + [http.routers.router1] + service = "service1" + middlewares = [ "retry" ] + rule = "PathPrefix(`/`)" + +[http.middlewares.retry.retry] + attempts = 4 + initialInterval = "500ms" + + +[http.services] + [http.services.service1] + [http.services.service1.loadBalancer] + + [[http.services.service1.loadBalancer.servers]] + url = "http://{{.WhoamiEndpoint}}:8080" + + [[http.services.service1.loadBalancer.servers]] + url = "http://{{.WhoamiEndpoint}}:8081" + + [[http.services.service1.loadBalancer.servers]] + url = "http://{{.WhoamiEndpoint}}:8082" + + [[http.services.service1.loadBalancer.servers]] + url = "http://{{.WhoamiEndpoint}}:80" diff --git a/integration/retry_test.go b/integration/retry_test.go index c270bfa7e..e55dc10c1 100644 --- a/integration/retry_test.go +++ b/integration/retry_test.go @@ -34,10 +34,41 @@ func (s *RetrySuite) TestRetry(c *check.C) { err = try.GetRequest("http://127.0.0.1:8080/api/rawdata", 60*time.Second, try.BodyContains("PathPrefix(`/`)")) c.Assert(err, checker.IsNil) + start := time.Now() // This simulates a DialTimeout when connecting to the backend server. response, err := http.Get("http://127.0.0.1:8000/") + duration, allowed := time.Since(start), time.Millisecond*250 c.Assert(err, checker.IsNil) c.Assert(response.StatusCode, checker.Equals, http.StatusOK) + c.Assert(int64(duration), checker.LessThan, int64(allowed)) +} + +func (s *RetrySuite) TestRetryBackoff(c *check.C) { + whoamiEndpoint := s.composeProject.Container(c, "whoami").NetworkSettings.IPAddress + file := s.adaptFile(c, "fixtures/retry/backoff.toml", struct { + WhoamiEndpoint string + }{whoamiEndpoint}) + defer os.Remove(file) + + cmd, display := s.traefikCmd(withConfigFile(file)) + defer display(c) + err := cmd.Start() + c.Assert(err, checker.IsNil) + defer s.killCmd(cmd) + + err = try.GetRequest("http://127.0.0.1:8080/api/rawdata", 60*time.Second, try.BodyContains("PathPrefix(`/`)")) + c.Assert(err, checker.IsNil) + + start := time.Now() + // This simulates a DialTimeout when connecting to the backend server. + response, err := http.Get("http://127.0.0.1:8000/") + duration := time.Since(start) + // test case delays: 500 + 700 + 1000ms with randomization. It should be safely > 1500ms + minAllowed := time.Millisecond * 1500 + + c.Assert(err, checker.IsNil) + c.Assert(response.StatusCode, checker.Equals, http.StatusOK) + c.Assert(int64(duration), checker.GreaterThan, int64(minAllowed)) } func (s *RetrySuite) TestRetryWebsocket(c *check.C) { diff --git a/pkg/config/dynamic/middlewares.go b/pkg/config/dynamic/middlewares.go index 3f522907c..17f949c50 100644 --- a/pkg/config/dynamic/middlewares.go +++ b/pkg/config/dynamic/middlewares.go @@ -376,7 +376,8 @@ type ReplacePathRegex struct { // Retry holds the retry configuration. type Retry struct { - Attempts int `json:"attempts,omitempty" toml:"attempts,omitempty" yaml:"attempts,omitempty" export:"true"` + Attempts int `json:"attempts,omitempty" toml:"attempts,omitempty" yaml:"attempts,omitempty"` + InitialInterval ptypes.Duration `json:"initialInterval,omitempty" toml:"initialInterval,omitempty" yaml:"initialInterval,omitempty"` } // +k8s:deepcopy-gen=true diff --git a/pkg/config/label/label_test.go b/pkg/config/label/label_test.go index 319af7d8c..5182d53ce 100644 --- a/pkg/config/label/label_test.go +++ b/pkg/config/label/label_test.go @@ -122,6 +122,7 @@ func TestDecodeConfiguration(t *testing.T) { "traefik.http.middlewares.Middleware15.replacepathregex.regex": "foobar", "traefik.http.middlewares.Middleware15.replacepathregex.replacement": "foobar", "traefik.http.middlewares.Middleware16.retry.attempts": "42", + "traefik.http.middlewares.Middleware16.retry.initialinterval": "1s", "traefik.http.middlewares.Middleware17.stripprefix.prefixes": "foobar, fiibar", "traefik.http.middlewares.Middleware18.stripprefixregex.regex": "foobar, fiibar", "traefik.http.middlewares.Middleware19.compress": "true", @@ -416,7 +417,8 @@ func TestDecodeConfiguration(t *testing.T) { }, "Middleware16": { Retry: &dynamic.Retry{ - Attempts: 42, + Attempts: 42, + InitialInterval: ptypes.Duration(time.Second), }, }, "Middleware17": { @@ -884,7 +886,8 @@ func TestEncodeConfiguration(t *testing.T) { }, "Middleware16": { Retry: &dynamic.Retry{ - Attempts: 42, + Attempts: 42, + InitialInterval: ptypes.Duration(time.Second), }, }, "Middleware17": { @@ -1238,6 +1241,7 @@ func TestEncodeConfiguration(t *testing.T) { "traefik.HTTP.Middlewares.Middleware15.ReplacePathRegex.Regex": "foobar", "traefik.HTTP.Middlewares.Middleware15.ReplacePathRegex.Replacement": "foobar", "traefik.HTTP.Middlewares.Middleware16.Retry.Attempts": "42", + "traefik.HTTP.Middlewares.Middleware16.Retry.InitialInterval": "1000000000", "traefik.HTTP.Middlewares.Middleware17.StripPrefix.Prefixes": "foobar, fiibar", "traefik.HTTP.Middlewares.Middleware17.StripPrefix.ForceSlash": "true", "traefik.HTTP.Middlewares.Middleware18.StripPrefixRegex.Regex": "foobar, fiibar", diff --git a/pkg/middlewares/retry/retry.go b/pkg/middlewares/retry/retry.go index 31b2b0812..57333d9d3 100644 --- a/pkg/middlewares/retry/retry.go +++ b/pkg/middlewares/retry/retry.go @@ -5,10 +5,13 @@ import ( "context" "fmt" "io/ioutil" + "math" "net" "net/http" "net/http/httptrace" + "time" + "github.com/cenkalti/backoff/v4" "github.com/opentracing/opentracing-go/ext" "github.com/traefik/traefik/v2/pkg/config/dynamic" "github.com/traefik/traefik/v2/pkg/log" @@ -34,12 +37,18 @@ type Listener interface { // each of them about a retry attempt. type Listeners []Listener +// nexter returns the duration to wait before retrying the operation. +type nexter interface { + NextBackOff() time.Duration +} + // retry is a middleware that retries requests. type retry struct { - attempts int - next http.Handler - listener Listener - name string + attempts int + initialInterval time.Duration + next http.Handler + listener Listener + name string } // New returns a new retry middleware. @@ -51,10 +60,11 @@ func New(ctx context.Context, next http.Handler, config dynamic.Retry, listener } return &retry{ - attempts: config.Attempts, - next: next, - listener: listener, - name: name, + attempts: config.Attempts, + initialInterval: time.Duration(config.InitialInterval), + next: next, + listener: listener, + name: name, }, nil } @@ -72,36 +82,65 @@ func (r *retry) ServeHTTP(rw http.ResponseWriter, req *http.Request) { } attempts := 1 + backOff := r.newBackOff() + currentInterval := 0 * time.Millisecond for { - shouldRetry := attempts < r.attempts - retryResponseWriter := newResponseWriter(rw, shouldRetry) + select { + case <-time.After(currentInterval): - // Disable retries when the backend already received request data - trace := &httptrace.ClientTrace{ - WroteHeaders: func() { - retryResponseWriter.DisableRetries() - }, - WroteRequest: func(httptrace.WroteRequestInfo) { - retryResponseWriter.DisableRetries() - }, + shouldRetry := attempts < r.attempts + retryResponseWriter := newResponseWriter(rw, shouldRetry) + + // Disable retries when the backend already received request data + trace := &httptrace.ClientTrace{ + WroteHeaders: func() { + retryResponseWriter.DisableRetries() + }, + WroteRequest: func(httptrace.WroteRequestInfo) { + retryResponseWriter.DisableRetries() + }, + } + newCtx := httptrace.WithClientTrace(req.Context(), trace) + + r.next.ServeHTTP(retryResponseWriter, req.WithContext(newCtx)) + + if !retryResponseWriter.ShouldRetry() { + return + } + + currentInterval = backOff.NextBackOff() + + attempts++ + + log.FromContext(middlewares.GetLoggerCtx(req.Context(), r.name, typeName)). + Debugf("New attempt %d for request: %v", attempts, req.URL) + + r.listener.Retried(req, attempts) + + case <-req.Context().Done(): + return } - newCtx := httptrace.WithClientTrace(req.Context(), trace) - - r.next.ServeHTTP(retryResponseWriter, req.WithContext(newCtx)) - - if !retryResponseWriter.ShouldRetry() { - break - } - - attempts++ - - log.FromContext(middlewares.GetLoggerCtx(req.Context(), r.name, typeName)). - Debugf("New attempt %d for request: %v", attempts, req.URL) - - r.listener.Retried(req, attempts) } } +func (r *retry) newBackOff() nexter { + if r.attempts < 2 || r.initialInterval <= 0 { + return &backoff.ZeroBackOff{} + } + + b := backoff.NewExponentialBackOff() + b.InitialInterval = r.initialInterval + + // calculate the multiplier for the given number of attempts + // so that applying the multiplier for the given number of attempts will not exceed 2 times the initial interval + // it allows to control the progression along the attempts + b.Multiplier = math.Pow(2, 1/float64(r.attempts-1)) + + // according to docs, b.Reset() must be called before using + b.Reset() + return b +} + // Retried exists to implement the Listener interface. It calls Retried on each of its slice entries. func (l Listeners) Retried(req *http.Request, attempt int) { for _, listener := range l { diff --git a/pkg/middlewares/retry/retry_test.go b/pkg/middlewares/retry/retry_test.go index 88ae06c75..47e97efc6 100644 --- a/pkg/middlewares/retry/retry_test.go +++ b/pkg/middlewares/retry/retry_test.go @@ -9,10 +9,12 @@ import ( "strconv" "strings" "testing" + "time" "github.com/gorilla/websocket" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + ptypes "github.com/traefik/paerser/types" "github.com/traefik/traefik/v2/pkg/config/dynamic" "github.com/traefik/traefik/v2/pkg/middlewares/emptybackendhandler" "github.com/traefik/traefik/v2/pkg/testhelpers" @@ -35,6 +37,13 @@ func TestRetry(t *testing.T) { wantResponseStatus: http.StatusOK, amountFaultyEndpoints: 0, }, + { + desc: "no retry on success with backoff", + config: dynamic.Retry{Attempts: 1, InitialInterval: ptypes.Duration(time.Microsecond * 50)}, + wantRetryAttempts: 0, + wantResponseStatus: http.StatusOK, + amountFaultyEndpoints: 0, + }, { desc: "no retry when max request attempts is one", config: dynamic.Retry{Attempts: 1}, @@ -42,6 +51,13 @@ func TestRetry(t *testing.T) { wantResponseStatus: http.StatusBadGateway, amountFaultyEndpoints: 1, }, + { + desc: "no retry when max request attempts is one with backoff", + config: dynamic.Retry{Attempts: 1, InitialInterval: ptypes.Duration(time.Microsecond * 50)}, + wantRetryAttempts: 0, + wantResponseStatus: http.StatusBadGateway, + amountFaultyEndpoints: 1, + }, { desc: "one retry when one server is faulty", config: dynamic.Retry{Attempts: 2}, @@ -49,6 +65,13 @@ func TestRetry(t *testing.T) { wantResponseStatus: http.StatusOK, amountFaultyEndpoints: 1, }, + { + desc: "one retry when one server is faulty with backoff", + config: dynamic.Retry{Attempts: 2, InitialInterval: ptypes.Duration(time.Microsecond * 50)}, + wantRetryAttempts: 1, + wantResponseStatus: http.StatusOK, + amountFaultyEndpoints: 1, + }, { desc: "two retries when two servers are faulty", config: dynamic.Retry{Attempts: 3}, @@ -56,6 +79,13 @@ func TestRetry(t *testing.T) { wantResponseStatus: http.StatusOK, amountFaultyEndpoints: 2, }, + { + desc: "two retries when two servers are faulty with backoff", + config: dynamic.Retry{Attempts: 3, InitialInterval: ptypes.Duration(time.Microsecond * 50)}, + wantRetryAttempts: 2, + wantResponseStatus: http.StatusOK, + amountFaultyEndpoints: 2, + }, { desc: "max attempts exhausted delivers the 5xx response", config: dynamic.Retry{Attempts: 3}, @@ -63,6 +93,13 @@ func TestRetry(t *testing.T) { wantResponseStatus: http.StatusBadGateway, amountFaultyEndpoints: 3, }, + { + desc: "max attempts exhausted delivers the 5xx response with backoff", + config: dynamic.Retry{Attempts: 3, InitialInterval: ptypes.Duration(time.Microsecond * 50)}, + wantRetryAttempts: 2, + wantResponseStatus: http.StatusBadGateway, + amountFaultyEndpoints: 3, + }, } backendServer := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) {