fix(tailnet): retry after transport dial timeouts (#22977) (cherry-pi… · coder/coder@1a774ab
@@ -1075,6 +1075,84 @@ func TestController_Disconnects(t *testing.T) {
10751075_ = testutil.TryReceive(testCtx, t, uut.Closed())
10761076}
107710771078+func TestController_RetriesWrappedDeadlineExceeded(t *testing.T) {
1079+t.Parallel()
1080+testCtx := testutil.Context(t, testutil.WaitShort)
1081+ctx, cancel := context.WithCancel(testCtx)
1082+defer cancel()
1083+1084+logger := testutil.Logger(t)
1085+dialer := &scriptedDialer{
1086+attempts: make(chan int, 10),
1087+dialFn: func(ctx context.Context, attempt int) (tailnet.ControlProtocolClients, error) {
1088+if attempt == 1 {
1089+return tailnet.ControlProtocolClients{}, &net.OpError{
1090+Op: "dial",
1091+Net: "tcp",
1092+Err: context.DeadlineExceeded,
1093+ }
1094+ }
1095+1096+<-ctx.Done()
1097+return tailnet.ControlProtocolClients{}, ctx.Err()
1098+ },
1099+ }
1100+1101+uut := tailnet.NewController(logger.Named("ctrl"), dialer)
1102+uut.Run(ctx)
1103+1104+require.Equal(t, 1, testutil.TryReceive(testCtx, t, dialer.attempts))
1105+require.Equal(t, 2, testutil.TryReceive(testCtx, t, dialer.attempts))
1106+1107+select {
1108+case <-uut.Closed():
1109+t.Fatal("controller exited after wrapped deadline exceeded")
1110+default:
1111+ }
1112+1113+cancel()
1114+_ = testutil.TryReceive(testCtx, t, uut.Closed())
1115+}
1116+1117+func TestController_DoesNotRedialAfterCancel(t *testing.T) {
1118+t.Parallel()
1119+testCtx := testutil.Context(t, testutil.WaitShort)
1120+ctx, cancel := context.WithCancel(testCtx)
1121+logger := testutil.Logger(t)
1122+1123+fClient := newFakeWorkspaceUpdateClient(testCtx, t)
1124+dialer := &scriptedDialer{
1125+attempts: make(chan int, 10),
1126+dialFn: func(_ context.Context, _ int) (tailnet.ControlProtocolClients, error) {
1127+return tailnet.ControlProtocolClients{
1128+WorkspaceUpdates: fClient,
1129+Closer: fakeCloser{},
1130+ }, nil
1131+ },
1132+ }
1133+fCtrl := newFakeUpdatesController(testCtx, t)
1134+1135+uut := tailnet.NewController(logger.Named("ctrl"), dialer)
1136+uut.WorkspaceUpdatesCtrl = fCtrl
1137+uut.Run(ctx)
1138+1139+require.Equal(t, 1, testutil.TryReceive(testCtx, t, dialer.attempts))
1140+call := testutil.TryReceive(testCtx, t, fCtrl.calls)
1141+require.Equal(t, fClient, call.client)
1142+testutil.RequireSend[tailnet.CloserWaiter](testCtx, t, call.resp, newFakeCloserWaiter())
1143+1144+cancel()
1145+closeCall := testutil.TryReceive(testCtx, t, fClient.close)
1146+testutil.RequireSend(testCtx, t, closeCall, nil)
1147+_ = testutil.TryReceive(testCtx, t, uut.Closed())
1148+1149+select {
1150+case attempt := <-dialer.attempts:
1151+t.Fatalf("unexpected redial attempt after cancel: %d", attempt)
1152+default:
1153+ }
1154+}
1155+10781156func TestController_TelemetrySuccess(t *testing.T) {
10791157t.Parallel()
10801158ctx := testutil.Context(t, testutil.WaitShort)
@@ -2070,6 +2148,31 @@ func newFakeCloserWaiter() *fakeCloserWaiter {
20702148 }
20712149}
207221502151+type scriptedDialer struct {
2152+attempts chan int
2153+dialFn func(context.Context, int) (tailnet.ControlProtocolClients, error)
2154+2155+mu sync.Mutex
2156+attemptN int
2157+}
2158+2159+func (d *scriptedDialer) Dial(ctx context.Context, _ tailnet.ResumeTokenController) (tailnet.ControlProtocolClients, error) {
2160+d.mu.Lock()
2161+d.attemptN++
2162+attempt := d.attemptN
2163+d.mu.Unlock()
2164+2165+if d.attempts != nil {
2166+select {
2167+case d.attempts <- attempt:
2168+case <-ctx.Done():
2169+return tailnet.ControlProtocolClients{}, ctx.Err()
2170+ }
2171+ }
2172+2173+return d.dialFn(ctx, attempt)
2174+}
2175+20732176type fakeWorkspaceUpdatesDialer struct {
20742177client tailnet.WorkspaceUpdatesClient
20752178}