Skip to content

Commit b73f9d8

Browse files
authored
feat: add computed workspace and agent health fields to the api (coder#8280)
1 parent eabf929 commit b73f9d8

16 files changed

+509
-26
lines changed

cli/testdata/coder_list_--output_json.golden

+5 -1
Original file line number | Diff line number | Diff line change
@@ -52,6 +52,10 @@
5252
"ttl_ms": 28800000,
5353
"last_used_at": "[timestamp]",
5454
"deleting_at": null,
55-
"locked_at": null
55+
"locked_at": null,
56+
"health": {
57+
"healthy": true,
58+
"failing_agents": []
59+
}
5660
}
5761
]

coderd/apidoc/docs.go

+49
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/apidoc/swagger.json

+49
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/workspaceagents.go

+18
Original file line number | Diff line number | Diff line change
@@ -1262,6 +1262,24 @@ func convertWorkspaceAgent(derpMap *tailcfg.DERPMap, coordinator tailnet.Coordin
12621262
workspaceAgent.ReadyAt = &dbAgent.ReadyAt.Time
12631263
}
12641264

1265+
switch {
1266+
case workspaceAgent.Status != codersdk.WorkspaceAgentConnected && workspaceAgent.LifecycleState == codersdk.WorkspaceAgentLifecycleOff:
1267+
workspaceAgent.Health.Reason = "agent is not running"
1268+
case workspaceAgent.Status == codersdk.WorkspaceAgentTimeout:
1269+
workspaceAgent.Health.Reason = "agent is taking too long to connect"
1270+
case workspaceAgent.Status == codersdk.WorkspaceAgentDisconnected:
1271+
workspaceAgent.Health.Reason = "agent has lost connection"
1272+
// Note: We could also handle codersdk.WorkspaceAgentLifecycleStartTimeout
1273+
// here, but it's more of a soft issue, so we don't want to mark the agent
1274+
// as unhealthy.
1275+
case workspaceAgent.LifecycleState == codersdk.WorkspaceAgentLifecycleStartError:
1276+
workspaceAgent.Health.Reason = "agent startup script exited with an error"
1277+
case workspaceAgent.LifecycleState.ShuttingDown():
1278+
workspaceAgent.Health.Reason = "agent is shutting down"
1279+
default:
1280+
workspaceAgent.Health.Healthy = true
1281+
}
1282+
12651283
return workspaceAgent, nil
12661284
}
12671285

coderd/workspaceagents_test.go

+3
Original file line number | Diff line number | Diff line change
@@ -72,6 +72,7 @@ func TestWorkspaceAgent(t *testing.T) {
7272
require.Equal(t, tmpDir, workspace.LatestBuild.Resources[0].Agents[0].Directory)
7373
_, err = client.WorkspaceAgent(ctx, workspace.LatestBuild.Resources[0].Agents[0].ID)
7474
require.NoError(t, err)
75+
require.True(t, workspace.LatestBuild.Resources[0].Agents[0].Health.Healthy)
7576
})
7677
t.Run("HasFallbackTroubleshootingURL", func(t *testing.T) {
7778
t.Parallel()
@@ -167,6 +168,8 @@ func TestWorkspaceAgent(t *testing.T) {
167168
}, testutil.IntervalMedium, "agent status timeout")
168169

169170
require.Equal(t, wantTroubleshootingURL, workspace.LatestBuild.Resources[0].Agents[0].TroubleshootingURL)
171+
require.False(t, workspace.LatestBuild.Resources[0].Agents[0].Health.Healthy)
172+
require.NotEmpty(t, workspace.LatestBuild.Resources[0].Agents[0].Health.Reason)
170173
})
171174
}
172175

coderd/workspaces.go

+13
Original file line number | Diff line number | Diff line change
@@ -1110,6 +1110,15 @@ func convertWorkspace(
11101110
lockedAt = &workspace.LockedAt.Time
11111111
}
11121112

1113+
failingAgents := []uuid.UUID{}
1114+
for _, resource := range workspaceBuild.Resources {
1115+
for _, agent := range resource.Agents {
1116+
if !agent.Health.Healthy {
1117+
failingAgents = append(failingAgents, agent.ID)
1118+
}
1119+
}
1120+
}
1121+
11131122
var (
11141123
ttlMillis = convertWorkspaceTTLMillis(workspace.Ttl)
11151124
deletingAt = calculateDeletingAt(workspace, template, workspaceBuild)
@@ -1135,6 +1144,10 @@ func convertWorkspace(
11351144
LastUsedAt: workspace.LastUsedAt,
11361145
DeletingAt: deletingAt,
11371146
LockedAt: lockedAt,
1147+
Health: codersdk.WorkspaceHealth{
1148+
Healthy: len(failingAgents) == 0,
1149+
FailingAgents: failingAgents,
1150+
},
11381151
}
11391152
}
11401153

coderd/workspaces_test.go

+142
Original file line number | Diff line number | Diff line change
@@ -164,6 +164,148 @@ func TestWorkspace(t *testing.T) {
164164
assert.Equal(t, templateDisplayName, ws.TemplateDisplayName)
165165
assert.Equal(t, templateAllowUserCancelWorkspaceJobs, ws.TemplateAllowUserCancelWorkspaceJobs)
166166
})
167+
168+
t.Run("Health", func(t *testing.T) {
169+
t.Parallel()
170+
171+
t.Run("Healthy", func(t *testing.T) {
172+
t.Parallel()
173+
client := coderdtest.New(t, &coderdtest.Options{IncludeProvisionerDaemon: true})
174+
user := coderdtest.CreateFirstUser(t, client)
175+
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
176+
Parse: echo.ParseComplete,
177+
ProvisionApply: []*proto.Provision_Response{{
178+
Type: &proto.Provision_Response_Complete{
179+
Complete: &proto.Provision_Complete{
180+
Resources: []*proto.Resource{{
181+
Name: "some",
182+
Type: "example",
183+
Agents: []*proto.Agent{{
184+
Id: uuid.NewString(),
185+
Auth: &proto.Agent_Token{},
186+
}},
187+
}},
188+
},
189+
},
190+
}},
191+
})
192+
coderdtest.AwaitTemplateVersionJob(t, client, version.ID)
193+
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID)
194+
workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID)
195+
coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID)
196+
197+
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
198+
defer cancel()
199+
200+
workspace, err := client.Workspace(ctx, workspace.ID)
201+
require.NoError(t, err)
202+
203+
agent := workspace.LatestBuild.Resources[0].Agents[0]
204+
205+
assert.True(t, workspace.Health.Healthy)
206+
assert.Equal(t, []uuid.UUID{}, workspace.Health.FailingAgents)
207+
assert.True(t, agent.Health.Healthy)
208+
assert.Empty(t, agent.Health.Reason)
209+
})
210+
211+
t.Run("Unhealthy", func(t *testing.T) {
212+
t.Parallel()
213+
client := coderdtest.New(t, &coderdtest.Options{IncludeProvisionerDaemon: true})
214+
user := coderdtest.CreateFirstUser(t, client)
215+
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
216+
Parse: echo.ParseComplete,
217+
ProvisionApply: []*proto.Provision_Response{{
218+
Type: &proto.Provision_Response_Complete{
219+
Complete: &proto.Provision_Complete{
220+
Resources: []*proto.Resource{{
221+
Name: "some",
222+
Type: "example",
223+
Agents: []*proto.Agent{{
224+
Id: uuid.NewString(),
225+
Auth: &proto.Agent_Token{},
226+
ConnectionTimeoutSeconds: 1,
227+
}},
228+
}},
229+
},
230+
},
231+
}},
232+
})
233+
coderdtest.AwaitTemplateVersionJob(t, client, version.ID)
234+
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID)
235+
workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID)
236+
coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID)
237+
238+
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
239+
defer cancel()
240+
241+
var err error
242+
testutil.Eventually(ctx, t, func(ctx context.Context) bool {
243+
workspace, err = client.Workspace(ctx, workspace.ID)
244+
return assert.NoError(t, err) && !workspace.Health.Healthy
245+
}, testutil.IntervalMedium)
246+
247+
agent := workspace.LatestBuild.Resources[0].Agents[0]
248+
249+
assert.False(t, workspace.Health.Healthy)
250+
assert.Equal(t, []uuid.UUID{agent.ID}, workspace.Health.FailingAgents)
251+
assert.False(t, agent.Health.Healthy)
252+
assert.NotEmpty(t, agent.Health.Reason)
253+
})
254+
255+
t.Run("Mixed health", func(t *testing.T) {
256+
t.Parallel()
257+
client := coderdtest.New(t, &coderdtest.Options{IncludeProvisionerDaemon: true})
258+
user := coderdtest.CreateFirstUser(t, client)
259+
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
260+
Parse: echo.ParseComplete,
261+
ProvisionApply: []*proto.Provision_Response{{
262+
Type: &proto.Provision_Response_Complete{
263+
Complete: &proto.Provision_Complete{
264+
Resources: []*proto.Resource{{
265+
Name: "some",
266+
Type: "example",
267+
Agents: []*proto.Agent{{
268+
Id: uuid.NewString(),
269+
Name: "a1",
270+
Auth: &proto.Agent_Token{},
271+
}, {
272+
Id: uuid.NewString(),
273+
Name: "a2",
274+
Auth: &proto.Agent_Token{},
275+
ConnectionTimeoutSeconds: 1,
276+
}},
277+
}},
278+
},
279+
},
280+
}},
281+
})
282+
coderdtest.AwaitTemplateVersionJob(t, client, version.ID)
283+
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID)
284+
workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID)
285+
coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID)
286+
287+
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
288+
defer cancel()
289+
290+
var err error
291+
testutil.Eventually(ctx, t, func(ctx context.Context) bool {
292+
workspace, err = client.Workspace(ctx, workspace.ID)
293+
return assert.NoError(t, err) && !workspace.Health.Healthy
294+
}, testutil.IntervalMedium)
295+
296+
assert.False(t, workspace.Health.Healthy)
297+
assert.Len(t, workspace.Health.FailingAgents, 1)
298+
299+
agent1 := workspace.LatestBuild.Resources[0].Agents[0]
300+
agent2 := workspace.LatestBuild.Resources[0].Agents[1]
301+
302+
assert.Equal(t, []uuid.UUID{agent2.ID}, workspace.Health.FailingAgents)
303+
assert.True(t, agent1.Health.Healthy)
304+
assert.Empty(t, agent1.Health.Reason)
305+
assert.False(t, agent2.Health.Healthy)
306+
assert.NotEmpty(t, agent2.Health.Reason)
307+
})
308+
})
167309
}
168310

169311
func TestAdminViewAllWorkspaces(t *testing.T) {

0 commit comments

Comments
 (0)