doc: Generate a minimal sitemap of HTML files (RobotLocomotion#14778)

mcx · Mar 15, 2021 · 693c849 · 693c849
1 parent 03969a9
commit 693c849
Showing 1 changed file with 54 additions and 0 deletions.
diff --git a/doc/build.py b/doc/build.py
@@ -6,11 +6,14 @@
 
 import argparse
 import os.path
+from pathlib import Path
 import shlex
 import shutil
 import subprocess
+import urllib.parse
 
 from bazel_tools.tools.python.runfiles import runfiles
+import lxml.etree as ET
 
 
 def _check_call(args):
@@ -25,6 +28,7 @@ def main():
     parser.add_argument(
         "--out_dir", type=str, metavar="DIR", required=True,
         help="Output directory. Must be an absolute path and must not exist.")
+
     args = parser.parse_args()
     out_dir = args.out_dir
     if not os.path.isabs(out_dir):
@@ -50,6 +54,56 @@ def main():
     # TODO(jwnimmer-tri) Incorporate the Drake styleguide publication here,
     # instead of having it be a separate pipeline.
 
+    _build_sitemap(out_dir)
+
+
+def _build_sitemap(site_dir: str) -> None:
+    """Builds a minimal sitemap.xml for drake.mit.edu.
+
+    This helps Google, Bing, and other search engines decide which pages on the
+    generated drake.mit.edu site should be crawled, and helps determine the
+    canonical version of each page.
+
+    https://developers.google.com/search/docs/advanced/sitemaps/build-sitemap
+
+    Every ``*.html`` file found (recursively) under site_dir gets one ``<url>``
+    entry with a ``<loc>`` child. Files named ``index.html`` are listed by
+    their directory URL (with a trailing slash) rather than by file name.
+
+    Args:
+        site_dir: The absolute path to the root directory of the generated
+          website and the directory to which the built sitemap.xml will be
+          written.
+
+    Raises:
+        ValueError: If site_dir is not an absolute path.
+        OSError: If the directory to which site_dir refers is not readable or
+          writable.
+    """
+
+    print("Building sitemap.xml...")
+    root_path = Path(site_dir)
+    # An explicit raise (rather than `assert (cond, msg)`, which asserts a
+    # non-empty tuple and therefore never fails) so the check is effective.
+    if not root_path.is_absolute():
+        raise ValueError("Path to generated website is not an absolute path")
+    paths = root_path.glob("**/*.html")
+
+    XML_NAMESPACE = "http://www.sitemaps.org/schemas/sitemap/0.9"
+    ROOT_URL = "https://drake.mit.edu"
+
+    urlset = ET.Element("urlset", xmlns=XML_NAMESPACE)
+    # Sort so that the generated file is stable from one build to the next.
+    for path in sorted(paths):
+        relative_path = path.relative_to(root_path)
+        url = ET.SubElement(urlset, "url")
+        if relative_path.name == "index.html":
+            # sitemap.xml should only include canonical urls.
+            location = relative_path.parent.as_posix() + "/"
+        else:
+            location = relative_path.as_posix()
+        # Both branches need a <loc>; a <url> element without one is invalid.
+        location = urllib.parse.urljoin(ROOT_URL,
+                                        urllib.parse.quote(location))
+        loc = ET.SubElement(url, "loc")
+        loc.text = location
+
+    # Serialize once, after all entries have been collected, so that the file
+    # is also produced when the site contains no HTML pages.
+    sitemap = ET.ElementTree(urlset)
+    sitemap.write(os.path.join(site_dir, "sitemap.xml"),
+                  encoding="utf-8",
+                  pretty_print=True,
+                  xml_declaration=True)
+
 
 if __name__ == '__main__':
     main()