serve robots.txt from yari (mdn#4186)
* serve robots.txt from yari

* serve robots.txt from yari

* feedbacked

* eslint fix
peterbe authored Jul 14, 2021
1 parent 2076d91 commit 51336ce
Showing 11 changed files with 94 additions and 18 deletions.
4 changes: 0 additions & 4 deletions .github/workflows/dev-build.yml
@@ -136,10 +136,6 @@ jobs:
# Same with the Speedcurve LUX
BUILD_SPEEDCURVE_LUX_ID: 000000000

# Make sure every built page always has
# '<meta name="robots" content="noindex, nofollow">' nomatter what
# kind of document it is.
BUILD_ALWAYS_NO_ROBOTS: true
run: |
if [ ${{ github.event.inputs.archived_content }} == "true" ]; then
echo "Will build mdn/archived-content too"
4 changes: 4 additions & 0 deletions .github/workflows/prod-build.yml
@@ -166,6 +166,10 @@ jobs:
# This enables the Plus call-to-action banner and the Plus landing page
REACT_APP_ENABLE_PLUS: true

# The default is to always set no to robots. This deployment is the only
# exception in the world where we actually want to welcome robots.
BUILD_ALWAYS_ALLOW_ROBOTS: true

run: |
if [ ${{ env.BUILD_ARCHIVED_CONTENT }} == "true" ]; then
echo "Will build mdn/archived-content too"
5 changes: 0 additions & 5 deletions .github/workflows/stage-build.yml
@@ -163,11 +163,6 @@ jobs:
# https://speedcurve.com/mozilla-add-ons/mdn/settings/updated/#lux
BUILD_SPEEDCURVE_LUX_ID: 108906238

# Make sure every built page always has
# '<meta name="robots" content="noindex, nofollow">' nomatter what
# kind of document it is.
BUILD_ALWAYS_NO_ROBOTS: true

# This enables the Plus call-to-action banner and the Plus landing page
REACT_APP_ENABLE_PLUS: true

6 changes: 3 additions & 3 deletions build/constants.js
@@ -60,8 +60,8 @@ const FIX_FLAWS_VERBOSE = JSON.parse(
);

// See explanation in docs/envvars.md
-const ALWAYS_NO_ROBOTS = JSON.parse(
-process.env.BUILD_ALWAYS_NO_ROBOTS || "false"
+const ALWAYS_ALLOW_ROBOTS = JSON.parse(
+process.env.BUILD_ALWAYS_ALLOW_ROBOTS || "false"
);

const HOMEPAGE_FEED_URL =
Expand Down Expand Up @@ -89,7 +89,7 @@ module.exports = {
FIX_FLAWS,
FIX_FLAWS_DRY_RUN,
FIX_FLAWS_VERBOSE,
-ALWAYS_NO_ROBOTS,
+ALWAYS_ALLOW_ROBOTS,
HOMEPAGE_FEED_URL,
HOMEPAGE_FEED_DISPLAY_MAX,
BUILD_SUBSCRIPTION_CONFIG_URL,
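Because the flag is read with `JSON.parse`, only valid JSON literals are accepted: an unset variable falls back to the string `"false"`, the string `"true"` enables it, and anything else (for example `yes`) throws at startup. A standalone sketch of the same pattern, outside yari:

```js
// Same env-parsing pattern as build/constants.js, shown in isolation.
// JSON.parse("true") -> true, JSON.parse("false") -> false;
// an invalid value such as "yes" raises a SyntaxError at build time.
const ALWAYS_ALLOW_ROBOTS = JSON.parse(
  process.env.BUILD_ALWAYS_ALLOW_ROBOTS || "false"
);
console.log(typeof ALWAYS_ALLOW_ROBOTS, ALWAYS_ALLOW_ROBOTS); // "boolean" false
```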
8 changes: 5 additions & 3 deletions docs/envvars.md
@@ -165,14 +165,16 @@ You can get it here on [this settings page](https://speedcurve.com/mozilla-add-o
which will give you the ID in the snippet shown there. Also, try to match
this with the domains in those settings to match where we deploy it.

-### `BUILD_ALWAYS_NO_ROBOTS`
+### `BUILD_ALWAYS_ALLOW_ROBOTS`

**Default: `false`**

This exists so we can forcibly always include
`<meta name="robots" content="noindex, nofollow">` into the HTML no matter what.
-For example, on our stage or dev builds, none of the documents should be indexed,
-so we'll set `BUILD_ALWAYS_NO_ROBOTS` to `true`.
+For example, on our stage or dev builds, we never want robots.
+
+The only place where we want robots is in prod. That's explicitly always
+set in `prod-build.yml`.

We use this to make absolutely sure that no dev or stage build ever gets into
the Google index. Thankfully we _always_ used a canonical URL
2 changes: 1 addition & 1 deletion package.json
@@ -14,7 +14,7 @@
"fiori:build": "cd client && build-storybook",
"fiori:start": "cd client && start-storybook -p 6006",
"md": "ts-node markdown/cli.ts",
"prepare-build": "yarn build:client && yarn build:ssr && yarn tool optimize-client-build && yarn tool google-analytics-code && yarn tool spas && yarn tool gather-git-history",
"prepare-build": "yarn build:client && yarn build:ssr && yarn tool optimize-client-build && yarn tool google-analytics-code && yarn tool spas && yarn tool gather-git-history && yarn tool build-robots-txt",
"prettier-check": "prettier --check .",
"prettier-format": "prettier --write .",
"start": "(test -f client/build/index.html || yarn build:client) && (test -f ssr/dist/main.js || yarn build:ssr) && (test -d client/build/en-us/_spas || yarn tool spas) && nf -j Procfile.start start",
7 changes: 5 additions & 2 deletions ssr/render.js
@@ -4,7 +4,7 @@ import { renderToString } from "react-dom/server";
import cheerio from "cheerio";

import {
-ALWAYS_NO_ROBOTS,
+ALWAYS_ALLOW_ROBOTS,
BUILD_OUT_ROOT,
SPEEDCURVE_LUX_ID,
} from "../build/constants";
@@ -219,7 +219,10 @@ export default function render(
}

const robotsContent =
-ALWAYS_NO_ROBOTS || (doc && doc.noIndexing) || pageNotFound || noIndexing
+!ALWAYS_ALLOW_ROBOTS ||
+(doc && doc.noIndexing) ||
+pageNotFound ||
+noIndexing
? "noindex, nofollow"
: "index, follow";
$(`<meta name="robots" content="${robotsContent}">`).insertAfter(
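Taken together, the render change makes indexing opt-in: unless the build was made with `BUILD_ALWAYS_ALLOW_ROBOTS=true`, every page gets `noindex, nofollow`, and even an allow-robots build still blocks no-index documents and 404 pages. A minimal sketch of that decision logic, with illustrative names rather than yari's actual API:

```js
// Sketch of the robots-meta decision after this commit (names are illustrative).
function robotsMetaContent({ alwaysAllowRobots, docNoIndexing, pageNotFound, noIndexing }) {
  const blocked =
    !alwaysAllowRobots || docNoIndexing || pageNotFound || noIndexing;
  return blocked ? "noindex, nofollow" : "index, follow";
}

// A stage or dev build (alwaysAllowRobots=false) never allows indexing:
console.log(
  robotsMetaContent({
    alwaysAllowRobots: false,
    docNoIndexing: false,
    pageNotFound: false,
    noIndexing: false,
  })
); // -> "noindex, nofollow"

// A prod build allows indexing for a normal, found document:
console.log(
  robotsMetaContent({
    alwaysAllowRobots: true,
    docNoIndexing: false,
    pageNotFound: false,
    noIndexing: false,
  })
); // -> "index, follow"
```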
4 changes: 4 additions & 0 deletions testing/.env
@@ -27,3 +27,7 @@ REACT_APP_AUTOCOMPLETE_SEARCH_WIDGET=true
# injects the relevant script tags actually run in end-to-end testing.
BUILD_GOOGLE_ANALYTICS_ACCOUNT=UA-00000000-0
BUILD_SPEEDCURVE_LUX_ID=012345

# The functional tests are done in a production'y way as if it had
# to go into full production mode.
BUILD_ALWAYS_ALLOW_ROBOTS=true
11 changes: 11 additions & 0 deletions testing/tests/index.test.js
@@ -1752,3 +1752,14 @@ test("built search-index.json (en-US)", () => {
// an archived page should not be in there.
expect(urlToTitle.has("/en-US/docs/XUL")).toBeFalsy();
});

test("the robots.txt file was created", () => {
const filePath = path.join(buildRoot, "robots.txt");
const text = fs.readFileSync(filePath, "utf-8");
// The content of robots file when it's in production mode is
// to ONLY say `Disallow: /api/`.
// When the robots file is for disallowing everything it
// will ONLY say `Disallow: /`.
expect(text).toContain("Disallow: /api/");
expect(text).not.toContain("Disallow: /\n");
});
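A hypothetical companion test (not part of this commit) could cover the disallow branch by reloading the tool with the flag unset; the relative require path, the extra `os`/`path`/`fs` requires, and the use of `jest.resetModules()` are assumptions:

```js
const os = require("os");
const path = require("path");
const fs = require("fs");

test("robots.txt disallows everything when robots are not allowed", async () => {
  jest.resetModules(); // re-evaluate build/constants with the changed env
  delete process.env.BUILD_ALWAYS_ALLOW_ROBOTS;
  const { runBuildRobotsTxt } = require("../../tool/build-robots-txt");

  const outfile = path.join(os.tmpdir(), "robots.txt");
  await runBuildRobotsTxt(outfile);

  const text = fs.readFileSync(outfile, "utf-8");
  expect(text).toContain("Disallow: /");
  expect(text).not.toContain("Disallow: /api/");
});
```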
37 changes: 37 additions & 0 deletions tool/build-robots-txt.js
@@ -0,0 +1,37 @@
/**
* This script generates a /robots.txt file that depends on
* process.env.BUILD_ALWAYS_ALLOW_ROBOTS.
*
*/
const fs = require("fs");

const { VALID_LOCALES } = require("../libs/constants");
const { ALWAYS_ALLOW_ROBOTS } = require("../build/constants");

const ALLOW_TEXT = `
User-agent: *
Sitemap: https://developer.mozilla.org/sitemap.xml
Disallow: /api/
Disallow: /*/files/
Disallow: /media
`;

const DISALLOW_TEXT = `
User-Agent: *
Disallow: /
`;

async function runBuildRobotsTxt(outfile) {
let content = ALWAYS_ALLOW_ROBOTS ? ALLOW_TEXT : DISALLOW_TEXT;
if (ALWAYS_ALLOW_ROBOTS) {
// Append extra lines specifically when we do allow robots.
for (const locale of VALID_LOCALES.values()) {
content += `Disallow: /${locale}/search\n`;
}
}
fs.writeFileSync(outfile, `${content.trim()}\n`, "utf-8");
}

module.exports = { runBuildRobotsTxt };
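For illustration only, the new module could be exercised directly like this; the relative require path and the assumption that `BUILD_ALWAYS_ALLOW_ROBOTS` was exported before `build/constants` is first loaded are mine, not part of the commit:

```js
// Sketch: run the robots.txt generator against a temp file and print the result.
const os = require("os");
const path = require("path");
const fs = require("fs");
const { runBuildRobotsTxt } = require("./build-robots-txt");

async function demo() {
  const outfile = path.join(os.tmpdir(), "robots.txt");
  await runBuildRobotsTxt(outfile);
  // With robots allowed, the file contains the sitemap URL plus the
  // "Disallow: /api/", "Disallow: /*/files/" and "Disallow: /media" lines,
  // followed by one "Disallow: /<locale>/search" line per valid locale.
  // With robots disallowed it is just "User-Agent: *" and "Disallow: /".
  console.log(fs.readFileSync(outfile, "utf-8"));
}

demo().catch((err) => {
  console.error(err);
  process.exit(1);
});
```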
24 changes: 24 additions & 0 deletions tool/cli.js
@@ -24,13 +24,15 @@ const {
} = require("../content");
const { buildDocument, gatherGitHistory, buildSPAs } = require("../build");
const {
ALWAYS_ALLOW_ROBOTS,
BUILD_OUT_ROOT,
GOOGLE_ANALYTICS_ACCOUNT,
GOOGLE_ANALYTICS_DEBUG,
} = require("../build/constants");
const { runArchive } = require("./archive");
const { runMakePopularitiesFile } = require("./popularities");
const { runOptimizeClientBuild } = require("./optimize-client-build");
const { runBuildRobotsTxt } = require("./build-robots-txt");
const kumascript = require("../kumascript");

const PORT = parseInt(process.env.SERVER_PORT || "5000");
@@ -775,6 +777,28 @@ if (Mozilla && !Mozilla.dntEnabled()) {
})
)

.command(
"build-robots-txt",
"Generate a robots.txt in the build root depending ALWAYS_ALLOW_ROBOTS"
)
.option("--outfile <path>", "name of the generated file", {
default: path.join(BUILD_OUT_ROOT, "robots.txt"),
})
.action(
tryOrExit(async ({ options, logger }) => {
const { outfile } = options;
await runBuildRobotsTxt(outfile);
logger.info(
chalk.yellow(
`Generated ${path.relative(
".",
outfile
)} based on ALWAYS_ALLOW_ROBOTS=${ALWAYS_ALLOW_ROBOTS}`
)
);
})
)

.command("spas", "Build (SSR) all the skeleton apps for single page apps")
.action(
tryOrExit(async ({ options }) => {
