fix(tailnet): retry after transport dial timeouts (#22977) (cherry-pi… · coder/coder@1a774ab

@@ -1075,6 +1075,84 @@ func TestController_Disconnects(t *testing.T) {

10751075

_ = testutil.TryReceive(testCtx, t, uut.Closed())

10761076

}

107710771078+

func TestController_RetriesWrappedDeadlineExceeded(t *testing.T) {

1079+

t.Parallel()

1080+

testCtx := testutil.Context(t, testutil.WaitShort)

1081+

ctx, cancel := context.WithCancel(testCtx)

1082+

defer cancel()

1083+1084+

logger := testutil.Logger(t)

1085+

dialer := &scriptedDialer{

1086+

attempts: make(chan int, 10),

1087+

dialFn: func(ctx context.Context, attempt int) (tailnet.ControlProtocolClients, error) {

1088+

if attempt == 1 {

1089+

return tailnet.ControlProtocolClients{}, &net.OpError{

1090+

Op: "dial",

1091+

Net: "tcp",

1092+

Err: context.DeadlineExceeded,

1093+

}

1094+

}

1095+1096+

<-ctx.Done()

1097+

return tailnet.ControlProtocolClients{}, ctx.Err()

1098+

},

1099+

}

1100+1101+

uut := tailnet.NewController(logger.Named("ctrl"), dialer)

1102+

uut.Run(ctx)

1103+1104+

require.Equal(t, 1, testutil.TryReceive(testCtx, t, dialer.attempts))

1105+

require.Equal(t, 2, testutil.TryReceive(testCtx, t, dialer.attempts))

1106+1107+

select {

1108+

case <-uut.Closed():

1109+

t.Fatal("controller exited after wrapped deadline exceeded")

1110+

default:

1111+

}

1112+1113+

cancel()

1114+

_ = testutil.TryReceive(testCtx, t, uut.Closed())

1115+

}

1116+1117+

func TestController_DoesNotRedialAfterCancel(t *testing.T) {

1118+

t.Parallel()

1119+

testCtx := testutil.Context(t, testutil.WaitShort)

1120+

ctx, cancel := context.WithCancel(testCtx)

1121+

logger := testutil.Logger(t)

1122+1123+

fClient := newFakeWorkspaceUpdateClient(testCtx, t)

1124+

dialer := &scriptedDialer{

1125+

attempts: make(chan int, 10),

1126+

dialFn: func(_ context.Context, _ int) (tailnet.ControlProtocolClients, error) {

1127+

return tailnet.ControlProtocolClients{

1128+

WorkspaceUpdates: fClient,

1129+

Closer: fakeCloser{},

1130+

}, nil

1131+

},

1132+

}

1133+

fCtrl := newFakeUpdatesController(testCtx, t)

1134+1135+

uut := tailnet.NewController(logger.Named("ctrl"), dialer)

1136+

uut.WorkspaceUpdatesCtrl = fCtrl

1137+

uut.Run(ctx)

1138+1139+

require.Equal(t, 1, testutil.TryReceive(testCtx, t, dialer.attempts))

1140+

call := testutil.TryReceive(testCtx, t, fCtrl.calls)

1141+

require.Equal(t, fClient, call.client)

1142+

testutil.RequireSend[tailnet.CloserWaiter](testCtx, t, call.resp, newFakeCloserWaiter())

1143+1144+

cancel()

1145+

closeCall := testutil.TryReceive(testCtx, t, fClient.close)

1146+

testutil.RequireSend(testCtx, t, closeCall, nil)

1147+

_ = testutil.TryReceive(testCtx, t, uut.Closed())

1148+1149+

select {

1150+

case attempt := <-dialer.attempts:

1151+

t.Fatalf("unexpected redial attempt after cancel: %d", attempt)

1152+

default:

1153+

}

1154+

}

1155+10781156

func TestController_TelemetrySuccess(t *testing.T) {

10791157

t.Parallel()

10801158

ctx := testutil.Context(t, testutil.WaitShort)

@@ -2070,6 +2148,31 @@ func newFakeCloserWaiter() *fakeCloserWaiter {

20702148

}

20712149

}

207221502151+

type scriptedDialer struct {

2152+

attempts chan int

2153+

dialFn func(context.Context, int) (tailnet.ControlProtocolClients, error)

2154+2155+

mu sync.Mutex

2156+

attemptN int

2157+

}

2158+2159+

func (d *scriptedDialer) Dial(ctx context.Context, _ tailnet.ResumeTokenController) (tailnet.ControlProtocolClients, error) {

2160+

d.mu.Lock()

2161+

d.attemptN++

2162+

attempt := d.attemptN

2163+

d.mu.Unlock()

2164+2165+

if d.attempts != nil {

2166+

select {

2167+

case d.attempts <- attempt:

2168+

case <-ctx.Done():

2169+

return tailnet.ControlProtocolClients{}, ctx.Err()

2170+

}

2171+

}

2172+2173+

return d.dialFn(ctx, attempt)

2174+

}

2175+20732176

type fakeWorkspaceUpdatesDialer struct {

20742177

client tailnet.WorkspaceUpdatesClient

20752178

}