serve robots.txt from yari (mdn#4186)
* serve robots.txt from yari

* serve robots.txt from yari

* feedbacked

* eslint fix
peterbe authored Jul 14, 2021
1 parent 2076d91 commit 51336ce
Showing 11 changed files with 94 additions and 18 deletions.
4 changes: 0 additions & 4 deletions .github/workflows/dev-build.yml
@@ -136,10 +136,6 @@ jobs:
# Same with the Speedcurve LUX
BUILD_SPEEDCURVE_LUX_ID: 000000000

# Make sure every built page always has
# '<meta name="robots" content="noindex, nofollow">' nomatter what
# kind of document it is.
BUILD_ALWAYS_NO_ROBOTS: true
run: |
if [ ${{ github.event.inputs.archived_content }} == "true" ]; then
echo "Will build mdn/archived-content too"
4 changes: 4 additions & 0 deletions .github/workflows/prod-build.yml
@@ -166,6 +166,10 @@ jobs:
# This enables the Plus call-to-action banner and the Plus landing page
REACT_APP_ENABLE_PLUS: true

# The default is to always set no to robots. This deployment is the only
# exception in the world where we actually want to welcome robots.
BUILD_ALWAYS_ALLOW_ROBOTS: true

run: |
if [ ${{ env.BUILD_ARCHIVED_CONTENT }} == "true" ]; then
echo "Will build mdn/archived-content too"
5 changes: 0 additions & 5 deletions .github/workflows/stage-build.yml
@@ -163,11 +163,6 @@ jobs:
# https://speedcurve.com/mozilla-add-ons/mdn/settings/updated/#lux
BUILD_SPEEDCURVE_LUX_ID: 108906238

# Make sure every built page always has
# '<meta name="robots" content="noindex, nofollow">' nomatter what
# kind of document it is.
BUILD_ALWAYS_NO_ROBOTS: true

# This enables the Plus call-to-action banner and the Plus landing page
REACT_APP_ENABLE_PLUS: true

6 changes: 3 additions & 3 deletions build/constants.js
@@ -60,8 +60,8 @@ const FIX_FLAWS_VERBOSE = JSON.parse(
);

// See explanation in docs/envvars.md
-const ALWAYS_NO_ROBOTS = JSON.parse(
-process.env.BUILD_ALWAYS_NO_ROBOTS || "false"
+const ALWAYS_ALLOW_ROBOTS = JSON.parse(
+process.env.BUILD_ALWAYS_ALLOW_ROBOTS || "false"
);

const HOMEPAGE_FEED_URL =
Expand Down Expand Up @@ -89,7 +89,7 @@ module.exports = {
FIX_FLAWS,
FIX_FLAWS_DRY_RUN,
FIX_FLAWS_VERBOSE,
-ALWAYS_NO_ROBOTS,
+ALWAYS_ALLOW_ROBOTS,
HOMEPAGE_FEED_URL,
HOMEPAGE_FEED_DISPLAY_MAX,
BUILD_SUBSCRIPTION_CONFIG_URL,
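Because the flag is read with `JSON.parse`, only valid JSON literals are accepted: an unset variable falls back to the string `"false"`, the string `"true"` enables it, and anything else (for example `yes`) throws at startup. A standalone sketch of the same pattern, outside yari:

```js
// Same env-parsing pattern as build/constants.js, shown in isolation.
// JSON.parse("true") -> true, JSON.parse("false") -> false;
// an invalid value such as "yes" raises a SyntaxError at build time.
const ALWAYS_ALLOW_ROBOTS = JSON.parse(
  process.env.BUILD_ALWAYS_ALLOW_ROBOTS || "false"
);
console.log(typeof ALWAYS_ALLOW_ROBOTS, ALWAYS_ALLOW_ROBOTS); // "boolean" false
```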
8 changes: 5 additions & 3 deletions docs/envvars.md
@@ -165,14 +165,16 @@ You can get it here on [this settings page](https://speedcurve.com/mozilla-add-o
which will give you the ID in the snippet shown there. Also, try to match
this with the domains in those settings to match where we deploy it.

-### `BUILD_ALWAYS_NO_ROBOTS`
+### `BUILD_ALWAYS_ALLOW_ROBOTS`

**Default: `false`**

This exists so we can forcibly always include
`<meta name="robots" content="noindex, nofollow">` into the HTML no matter what.
-For example, on our stage or dev builds, none of the documents should be indexed,
-so we'll set `BUILD_ALWAYS_NO_ROBOTS` to `true`.
+For example, on our stage or dev builds, we never want robots.
+
+The only place where we want robots is in prod. That's explicitly always
+set in `prod-build.yml`.

We use this to make absolutely sure that no dev or stage build ever gets into
the Google index. Thankfully we _always_ used a canonical URL
2 changes: 1 addition & 1 deletion package.json
@@ -14,7 +14,7 @@
"fiori:build": "cd client && build-storybook",
"fiori:start": "cd client && start-storybook -p 6006",
"md": "ts-node markdown/cli.ts",
"prepare-build": "yarn build:client && yarn build:ssr && yarn tool optimize-client-build && yarn tool google-analytics-code && yarn tool spas && yarn tool gather-git-history",
"prepare-build": "yarn build:client && yarn build:ssr && yarn tool optimize-client-build && yarn tool google-analytics-code && yarn tool spas && yarn tool gather-git-history && yarn tool build-robots-txt",
"prettier-check": "prettier --check .",
"prettier-format": "prettier --write .",
"start": "(test -f client/build/index.html || yarn build:client) && (test -f ssr/dist/main.js || yarn build:ssr) && (test -d client/build/en-us/_spas || yarn tool spas) && nf -j Procfile.start start",
7 changes: 5 additions & 2 deletions ssr/render.js
@@ -4,7 +4,7 @@ import { renderToString } from "react-dom/server";
import cheerio from "cheerio";

import {
-ALWAYS_NO_ROBOTS,
+ALWAYS_ALLOW_ROBOTS,
BUILD_OUT_ROOT,
SPEEDCURVE_LUX_ID,
} from "../build/constants";
@@ -219,7 +219,10 @@ export default function render(
}

const robotsContent =
-ALWAYS_NO_ROBOTS || (doc && doc.noIndexing) || pageNotFound || noIndexing
+!ALWAYS_ALLOW_ROBOTS ||
+(doc && doc.noIndexing) ||
+pageNotFound ||
+noIndexing
? "noindex, nofollow"
: "index, follow";
$(`<meta name="robots" content="${robotsContent}">`).insertAfter(
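Taken together, the render change makes indexing opt-in: unless the build was made with `BUILD_ALWAYS_ALLOW_ROBOTS=true`, every page gets `noindex, nofollow`, and even an allow-robots build still blocks no-index documents and 404 pages. A minimal sketch of that decision logic, with illustrative names rather than yari's actual API:

```js
// Sketch of the robots-meta decision after this commit (names are illustrative).
function robotsMetaContent({ alwaysAllowRobots, docNoIndexing, pageNotFound, noIndexing }) {
  const blocked =
    !alwaysAllowRobots || docNoIndexing || pageNotFound || noIndexing;
  return blocked ? "noindex, nofollow" : "index, follow";
}

// A stage or dev build (alwaysAllowRobots=false) never allows indexing:
console.log(
  robotsMetaContent({
    alwaysAllowRobots: false,
    docNoIndexing: false,
    pageNotFound: false,
    noIndexing: false,
  })
); // -> "noindex, nofollow"

// A prod build allows indexing for a normal, found document:
console.log(
  robotsMetaContent({
    alwaysAllowRobots: true,
    docNoIndexing: false,
    pageNotFound: false,
    noIndexing: false,
  })
); // -> "index, follow"
```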
4 changes: 4 additions & 0 deletions testing/.env
@@ -27,3 +27,7 @@ REACT_APP_AUTOCOMPLETE_SEARCH_WIDGET=true
# injects the relevant script tags actually run in end-to-end testing.
BUILD_GOOGLE_ANALYTICS_ACCOUNT=UA-00000000-0
BUILD_SPEEDCURVE_LUX_ID=012345

# The functional tests are done in a production'y way as if it had
# to go into full production mode.
BUILD_ALWAYS_ALLOW_ROBOTS=true
11 changes: 11 additions & 0 deletions testing/tests/index.test.js
@@ -1752,3 +1752,14 @@ test("built search-index.json (en-US)", () => {
// an archived page should not be in there.
expect(urlToTitle.has("/en-US/docs/XUL")).toBeFalsy();
});

test("the robots.txt file was created", () => {
const filePath = path.join(buildRoot, "robots.txt");
const text = fs.readFileSync(filePath, "utf-8");
// The content of robots file when it's in production mode is
// to ONLY say `Disallow: /api/`.
// When the robots file is for disallowing everything it
// will ONLY say `Disallow: /`.
expect(text).toContain("Disallow: /api/");
expect(text).not.toContain("Disallow: /\n");
});
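A hypothetical companion test (not part of this commit) could cover the disallow branch by reloading the tool with the flag unset; the relative require path, the extra `os`/`path`/`fs` requires, and the use of `jest.resetModules()` are assumptions:

```js
const os = require("os");
const path = require("path");
const fs = require("fs");

test("robots.txt disallows everything when robots are not allowed", async () => {
  jest.resetModules(); // re-evaluate build/constants with the changed env
  delete process.env.BUILD_ALWAYS_ALLOW_ROBOTS;
  const { runBuildRobotsTxt } = require("../../tool/build-robots-txt");

  const outfile = path.join(os.tmpdir(), "robots.txt");
  await runBuildRobotsTxt(outfile);

  const text = fs.readFileSync(outfile, "utf-8");
  expect(text).toContain("Disallow: /");
  expect(text).not.toContain("Disallow: /api/");
});
```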
37 changes: 37 additions & 0 deletions tool/build-robots-txt.js
@@ -0,0 +1,37 @@
/**
* This script generates a /robots.txt file that depends on
* process.env.BUILD_ALWAYS_ALLOW_ROBOTS.
*
*/
const fs = require("fs");

const { VALID_LOCALES } = require("../libs/constants");
const { ALWAYS_ALLOW_ROBOTS } = require("../build/constants");

const ALLOW_TEXT = `
User-agent: *
Sitemap: https://developer.mozilla.org/sitemap.xml
Disallow: /api/
Disallow: /*/files/
Disallow: /media
`;

const DISALLOW_TEXT = `
User-Agent: *
Disallow: /
`;

async function runBuildRobotsTxt(outfile) {
let content = ALWAYS_ALLOW_ROBOTS ? ALLOW_TEXT : DISALLOW_TEXT;
if (ALWAYS_ALLOW_ROBOTS) {
// Append extra lines specifically when we do allow robots.
for (const locale of VALID_LOCALES.values()) {
content += `Disallow: /${locale}/search\n`;
}
}
fs.writeFileSync(outfile, `${content.trim()}\n`, "utf-8");
}

module.exports = { runBuildRobotsTxt };
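For illustration only, the new module could be exercised directly like this; the relative require path and the assumption that `BUILD_ALWAYS_ALLOW_ROBOTS` was exported before `build/constants` is first loaded are mine, not part of the commit:

```js
// Sketch: run the robots.txt generator against a temp file and print the result.
const os = require("os");
const path = require("path");
const fs = require("fs");
const { runBuildRobotsTxt } = require("./build-robots-txt");

async function demo() {
  const outfile = path.join(os.tmpdir(), "robots.txt");
  await runBuildRobotsTxt(outfile);
  // With robots allowed, the file contains the sitemap URL plus the
  // "Disallow: /api/", "Disallow: /*/files/" and "Disallow: /media" lines,
  // followed by one "Disallow: /<locale>/search" line per valid locale.
  // With robots disallowed it is just "User-Agent: *" and "Disallow: /".
  console.log(fs.readFileSync(outfile, "utf-8"));
}

demo().catch((err) => {
  console.error(err);
  process.exit(1);
});
```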
24 changes: 24 additions & 0 deletions tool/cli.js
@@ -24,13 +24,15 @@ const {
} = require("../content");
const { buildDocument, gatherGitHistory, buildSPAs } = require("../build");
const {
ALWAYS_ALLOW_ROBOTS,
BUILD_OUT_ROOT,
GOOGLE_ANALYTICS_ACCOUNT,
GOOGLE_ANALYTICS_DEBUG,
} = require("../build/constants");
const { runArchive } = require("./archive");
const { runMakePopularitiesFile } = require("./popularities");
const { runOptimizeClientBuild } = require("./optimize-client-build");
const { runBuildRobotsTxt } = require("./build-robots-txt");
const kumascript = require("../kumascript");

const PORT = parseInt(process.env.SERVER_PORT || "5000");
@@ -775,6 +777,28 @@ if (Mozilla && !Mozilla.dntEnabled()) {
})
)

.command(
"build-robots-txt",
"Generate a robots.txt in the build root depending ALWAYS_ALLOW_ROBOTS"
)
.option("--outfile <path>", "name of the generated file", {
default: path.join(BUILD_OUT_ROOT, "robots.txt"),
})
.action(
tryOrExit(async ({ options, logger }) => {
const { outfile } = options;
await runBuildRobotsTxt(outfile);
logger.info(
chalk.yellow(
`Generated ${path.relative(
".",
outfile
)} based on ALWAYS_ALLOW_ROBOTS=${ALWAYS_ALLOW_ROBOTS}`
)
);
})
)

.command("spas", "Build (SSR) all the skeleton apps for single page apps")
.action(
tryOrExit(async ({ options }) => {
