Skip to content

Commit

Permalink
doc: Generate a minimal sitemap of HTML files (RobotLocomotion#14778)
Browse files Browse the repository at this point in the history
  • Loading branch information
jamiesnape authored Mar 15, 2021
1 parent 03969a9 commit 693c849
Showing 1 changed file with 54 additions and 0 deletions.
54 changes: 54 additions & 0 deletions doc/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,14 @@

import argparse
import os.path
from pathlib import Path
import shlex
import shutil
import subprocess
import urllib.parse

from bazel_tools.tools.python.runfiles import runfiles
import lxml.etree as ET


def _check_call(args):
Expand All @@ -25,6 +28,7 @@ def main():
parser.add_argument(
"--out_dir", type=str, metavar="DIR", required=True,
help="Output directory. Must be an absolute path and must not exist.")

args = parser.parse_args()
out_dir = args.out_dir
if not os.path.isabs(out_dir):
Expand All @@ -50,6 +54,56 @@ def main():
# TODO(jwnimmer-tri) Incorporate the Drake styleguide publication here,
# instead of having it be a separate pipeline.

_build_sitemap(out_dir)


def _build_sitemap(site_dir: str) -> None:
"""Builds a minimal sitemap.xml for drake.mit.edu.
This helps Google, Bing, and other search engines decide which pages on the
generated drake.mit.edu site should be crawled, and helps determine the
canonical version of each page.
https://developers.google.com/search/docs/advanced/sitemaps/build-sitemap
Args:
site_dir: The absolute path to the root directory of the generated
website and the directory to which the built sitemap.xml will be
written.
Raises:
OSError: If the directory to which site_dir refers is not readable or
writable.
"""

print("Building sitemap.xml...")
root_path = Path(site_dir)
assert (root_path.is_absolute(),
"Path to generated website is not an absolute path")
paths = root_path.glob("**/*.html")

XML_NAMESPACE = "http://www.sitemaps.org/schemas/sitemap/0.9"
ROOT_URL = "https://drake.mit.edu"

urlset = ET.Element("urlset", xmlns=XML_NAMESPACE)
for path in sorted(paths):
relative_path = path.relative_to(root_path)
url = ET.SubElement(urlset, "url")
if relative_path.name == "index.html":
# sitemap.xml should only include canonical urls.
location = relative_path.parent.as_posix() + "/"
else:
location = relative_path.as_posix()
location = urllib.parse.urljoin(ROOT_URL,
urllib.parse.quote(location))
loc = ET.SubElement(url, "loc")
loc.text = location
sitemap = ET.ElementTree(urlset)
sitemap.write(os.path.join(site_dir, "sitemap.xml"),
encoding="utf-8",
pretty_print=True,
xml_declaration=True)


# Run the documentation build only when executed as a script, not on import.
if __name__ == "__main__":
    main()

0 comments on commit 693c849

Please sign in to comment.