diff --git a/components/server/src/prometheus-metrics.ts b/components/server/src/prometheus-metrics.ts index 0d24ae605ed760..e5191d432f7aee 100644 --- a/components/server/src/prometheus-metrics.ts +++ b/components/server/src/prometheus-metrics.ts @@ -381,3 +381,27 @@ type AuthorizerSubjectIdMatch = "ctx-user-id-missing" | "passed-subject-id-missi export function reportAuthorizerSubjectId(match: AuthorizerSubjectIdMatch) { authorizerSubjectId.labels(match).inc(); } + +export const scmTokenRefreshRequestsTotal = new prometheusClient.Counter({ + name: "gitpod_scm_token_refresh_requests_total", + help: "Counter for the number of token refresh requests we issue against SCM systems", + labelNames: ["host", "result"], +}); +export type ScmTokenRefreshResult = + | "success" + | "timeout" + | "error" + | "still_valid" + | "no_token" + | "not_refreshable" + | "success_after_timeout"; +export function reportScmTokenRefreshRequest(host: string, result: ScmTokenRefreshResult) { + scmTokenRefreshRequestsTotal.labels(host, result).inc(); +} + +export const scmTokenRefreshLatencyHistogram = new prometheusClient.Histogram({ + name: "gitpod_scm_token_refresh_latency_seconds", + help: "SCM token refresh latency in seconds", + labelNames: ["host"], + buckets: [0.01, 0.1, 0.2, 0.5, 1, 2, 5, 10], +}); diff --git a/components/server/src/user/token-service.ts b/components/server/src/user/token-service.ts index f2ccbdfec9bd9a..0840830bab4352 100644 --- a/components/server/src/user/token-service.ts +++ b/components/server/src/user/token-service.ts @@ -15,6 +15,7 @@ import { GarbageCollectedCache } from "@gitpod/gitpod-protocol/lib/util/garbage- import { log } from "@gitpod/gitpod-protocol/lib/util/logging"; import { getExperimentsClientForBackend } from "@gitpod/gitpod-protocol/lib/experiments/configcat-server"; import { RedisMutex } from "../redis/mutex"; +import { reportScmTokenRefreshRequest, scmTokenRefreshLatencyHistogram } from "../prometheus-metrics"; @injectable() export class TokenService implements TokenProvider { @@ -63,42 +64,76 @@ export class TokenService implements TokenProvider { } const identity = this.getIdentityForHost(user, host); + function isValid(t: Token): boolean { + if (!t.expiryDate) { + return true; + } + + const aboutToExpireTime = new Date(); + aboutToExpireTime.setTime(aboutToExpireTime.getTime() + 5 * 60 * 1000); + if (t.expiryDate >= aboutToExpireTime.toISOString()) { + return true; + } + return false; + } + const doRefreshToken = async () => { // Check: Current token so we can actually refresh? const token = await this.userDB.findTokenForIdentity(identity); if (!token) { + reportScmTokenRefreshRequest(host, "no_token"); return undefined; } - const aboutToExpireTime = new Date(); - aboutToExpireTime.setTime(aboutToExpireTime.getTime() + 5 * 60 * 1000); - if (!token.expiryDate || token.expiryDate >= aboutToExpireTime.toISOString()) { + if (isValid(token)) { + reportScmTokenRefreshRequest(host, "still_valid"); return token; } // Can we refresh these kind of tokens? const { authProvider } = this.hostContextProvider.get(host)!; if (!authProvider.refreshToken) { + reportScmTokenRefreshRequest(host, "not_refreshable"); return undefined; } - await authProvider.refreshToken(user); - return await this.userDB.findTokenForIdentity(identity); + // Perform actual refresh + const stopTimer = scmTokenRefreshLatencyHistogram.startTimer({ host }); + try { + await authProvider.refreshToken(user); + } finally { + stopTimer({ host }); + } + + const freshToken = await this.userDB.findTokenForIdentity(identity); + reportScmTokenRefreshRequest(host, "success"); + return freshToken; }; try { const refreshedToken = await this.redisMutex.using( [`token-refresh-${host}-${userId}`], - 2000, // After 2s without extension the lock is released + 3000, // After 3s without extension the lock is released doRefreshToken, - { retryCount: 10, retryDelay: 500 }, // We wait at most 10s until we give up, and conclude that we can't refresh the token now. + { retryCount: 20, retryDelay: 500 }, // We wait at most 10s until we give up, and conclude that we can't refresh the token now. ); return refreshedToken; } catch (err) { if (RedisMutex.isLockedError(err)) { + // In this case we already timed-out. BUT there is a high chance we are waiting on somebody else, who might already done the work for us. + // So just checking again here + const token = await this.userDB.findTokenForIdentity(identity); + if (token && isValid(token)) { + log.debug({ userId }, `Token refresh timed out, but still successful`, { host }); + reportScmTokenRefreshRequest(host, "success_after_timeout"); + return token; + } + log.error({ userId }, `Failed to refresh token (timeout waiting on lock)`, err, { host }); + reportScmTokenRefreshRequest(host, "timeout"); throw new Error(`Failed to refresh token (timeout waiting on lock)`); } + reportScmTokenRefreshRequest(host, "error"); throw err; } }