Skip to content

Commit

Permalink
[server] Improve behavior and reporting around "refreshToken" (gitpod…
Browse files Browse the repository at this point in the history
…-io#19482)

* [server] Check token even in case of timeout

* [server] Metrics: Request counter and histogram for token refreshes

* review comments
  • Loading branch information
geropl authored Feb 28, 2024
1 parent 730eed3 commit a05f84e
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 7 deletions.
24 changes: 24 additions & 0 deletions components/server/src/prometheus-metrics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -381,3 +381,27 @@ type AuthorizerSubjectIdMatch = "ctx-user-id-missing" | "passed-subject-id-missi
export function reportAuthorizerSubjectId(match: AuthorizerSubjectIdMatch) {
authorizerSubjectId.labels(match).inc();
}

export const scmTokenRefreshRequestsTotal = new prometheusClient.Counter({
name: "gitpod_scm_token_refresh_requests_total",
help: "Counter for the number of token refresh requests we issue against SCM systems",
labelNames: ["host", "result"],
});
export type ScmTokenRefreshResult =
| "success"
| "timeout"
| "error"
| "still_valid"
| "no_token"
| "not_refreshable"
| "success_after_timeout";
export function reportScmTokenRefreshRequest(host: string, result: ScmTokenRefreshResult) {
scmTokenRefreshRequestsTotal.labels(host, result).inc();
}

export const scmTokenRefreshLatencyHistogram = new prometheusClient.Histogram({
name: "gitpod_scm_token_refresh_latency_seconds",
help: "SCM token refresh latency in seconds",
labelNames: ["host"],
buckets: [0.01, 0.1, 0.2, 0.5, 1, 2, 5, 10],
});
49 changes: 42 additions & 7 deletions components/server/src/user/token-service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import { GarbageCollectedCache } from "@gitpod/gitpod-protocol/lib/util/garbage-
import { log } from "@gitpod/gitpod-protocol/lib/util/logging";
import { getExperimentsClientForBackend } from "@gitpod/gitpod-protocol/lib/experiments/configcat-server";
import { RedisMutex } from "../redis/mutex";
import { reportScmTokenRefreshRequest, scmTokenRefreshLatencyHistogram } from "../prometheus-metrics";

@injectable()
export class TokenService implements TokenProvider {
Expand Down Expand Up @@ -63,42 +64,76 @@ export class TokenService implements TokenProvider {
}
const identity = this.getIdentityForHost(user, host);

function isValid(t: Token): boolean {
if (!t.expiryDate) {
return true;
}

const aboutToExpireTime = new Date();
aboutToExpireTime.setTime(aboutToExpireTime.getTime() + 5 * 60 * 1000);
if (t.expiryDate >= aboutToExpireTime.toISOString()) {
return true;
}
return false;
}

const doRefreshToken = async () => {
// Check: Current token so we can actually refresh?
const token = await this.userDB.findTokenForIdentity(identity);
if (!token) {
reportScmTokenRefreshRequest(host, "no_token");
return undefined;
}

const aboutToExpireTime = new Date();
aboutToExpireTime.setTime(aboutToExpireTime.getTime() + 5 * 60 * 1000);
if (!token.expiryDate || token.expiryDate >= aboutToExpireTime.toISOString()) {
if (isValid(token)) {
reportScmTokenRefreshRequest(host, "still_valid");
return token;
}

// Can we refresh these kind of tokens?
const { authProvider } = this.hostContextProvider.get(host)!;
if (!authProvider.refreshToken) {
reportScmTokenRefreshRequest(host, "not_refreshable");
return undefined;
}

await authProvider.refreshToken(user);
return await this.userDB.findTokenForIdentity(identity);
// Perform actual refresh
const stopTimer = scmTokenRefreshLatencyHistogram.startTimer({ host });
try {
await authProvider.refreshToken(user);
} finally {
stopTimer({ host });
}

const freshToken = await this.userDB.findTokenForIdentity(identity);
reportScmTokenRefreshRequest(host, "success");
return freshToken;
};

try {
const refreshedToken = await this.redisMutex.using(
[`token-refresh-${host}-${userId}`],
2000, // After 2s without extension the lock is released
3000, // After 3s without extension the lock is released
doRefreshToken,
{ retryCount: 10, retryDelay: 500 }, // We wait at most 10s until we give up, and conclude that we can't refresh the token now.
{ retryCount: 20, retryDelay: 500 }, // We wait at most 10s until we give up, and conclude that we can't refresh the token now.
);
return refreshedToken;
} catch (err) {
if (RedisMutex.isLockedError(err)) {
// In this case we already timed-out. BUT there is a high chance we are waiting on somebody else, who might already done the work for us.
// So just checking again here
const token = await this.userDB.findTokenForIdentity(identity);
if (token && isValid(token)) {
log.debug({ userId }, `Token refresh timed out, but still successful`, { host });
reportScmTokenRefreshRequest(host, "success_after_timeout");
return token;
}

log.error({ userId }, `Failed to refresh token (timeout waiting on lock)`, err, { host });
reportScmTokenRefreshRequest(host, "timeout");
throw new Error(`Failed to refresh token (timeout waiting on lock)`);
}
reportScmTokenRefreshRequest(host, "error");
throw err;
}
}
Expand Down

0 comments on commit a05f84e

Please sign in to comment.