diff --git a/shared/management/client/grpc.go b/shared/management/client/grpc.go index 520a83e3604..278b874ca32 100644 --- a/shared/management/client/grpc.go +++ b/shared/management/client/grpc.go @@ -25,7 +25,17 @@ import ( "github.com/netbirdio/netbird/util/wsproxy" ) -const ConnectTimeout = 10 * time.Second +const ( + ConnectTimeout = 10 * time.Second + + RegistrationRetryTimeout = 180 * time.Second + RegistrationRetryInterval = 5 * time.Second + RegistrationRetryMaxDelay = 30 * time.Second + + LoginRetryTimeout = 45 * time.Second + LoginRetryInterval = 3 * time.Second + LoginRetryMaxDelay = 15 * time.Second +) const ( errMsgMgmtPublicKey = "failed getting Management Service public key: %s" @@ -312,6 +322,38 @@ func (c *GrpcClient) IsHealthy() bool { return true } +// newRegistrationBackoff creates a backoff strategy for peer registration operations +func newRegistrationBackoff(ctx context.Context) backoff.BackOff { + return backoff.WithContext(&backoff.ExponentialBackOff{ + InitialInterval: RegistrationRetryInterval, + RandomizationFactor: backoff.DefaultRandomizationFactor, + Multiplier: backoff.DefaultMultiplier, + MaxInterval: RegistrationRetryMaxDelay, + MaxElapsedTime: RegistrationRetryTimeout, + Stop: backoff.Stop, + Clock: backoff.SystemClock, + }, ctx) +} + +// newLoginBackoff creates a backoff strategy for peer login operations +func newLoginBackoff(ctx context.Context) backoff.BackOff { + return backoff.WithContext(&backoff.ExponentialBackOff{ + InitialInterval: LoginRetryInterval, + RandomizationFactor: backoff.DefaultRandomizationFactor, + Multiplier: backoff.DefaultMultiplier, + MaxInterval: LoginRetryMaxDelay, + MaxElapsedTime: LoginRetryTimeout, + Stop: backoff.Stop, + Clock: backoff.SystemClock, + }, ctx) +} + +// isRegistrationRequest determines if a login request is actually a registration. +// Returns true if either the setup key or the JWT token is present in the request. +func isRegistrationRequest(req *proto.LoginRequest) bool { + return req.SetupKey != "" || req.JwtToken != "" +} + func (c *GrpcClient) login(serverKey wgtypes.Key, req *proto.LoginRequest) (*proto.LoginResponse, error) { if !c.ready() { return nil, errors.New(errMsgNoMgmtConnection) @@ -334,9 +376,15 @@ func (c *GrpcClient) login(serverKey wgtypes.Key, req *proto.LoginRequest) (*pro Body: loginReq, }) if err != nil { - // retry only on context canceled - if s, ok := gstatus.FromError(err); ok && s.Code() == codes.Canceled { - return err + if s, ok := gstatus.FromError(err); ok { + if s.Code() == codes.Canceled { + log.Debugf("login canceled") + return backoff.Permanent(err) + } + if s.Code() == codes.DeadlineExceeded { + log.Debugf("login timeout, retrying...") + return err + } } return backoff.Permanent(err) } @@ -344,7 +392,15 @@ func (c *GrpcClient) login(serverKey wgtypes.Key, req *proto.LoginRequest) (*pro return nil } - err = backoff.Retry(operation, nbgrpc.Backoff(c.ctx)) + isRegistration := isRegistrationRequest(req) + var backoffStrategy backoff.BackOff + if isRegistration { + backoffStrategy = newRegistrationBackoff(c.ctx) + } else { + backoffStrategy = newLoginBackoff(c.ctx) + } + + err = backoff.Retry(operation, backoffStrategy) if err != nil { log.Errorf("failed to login to Management Service: %v", err) return nil, err