ccan: add UTF-8 module for checking alias fields.

Signed-off-by: Rusty Russell <[email protected]>
owldock · Jul 1, 2018 · 2639b1e · 2639b1e
1 parent fecef61
commit 2639b1e
Show file tree

Hide file tree

Showing 11 changed files with 624 additions and 5 deletions.
diff --git a/Makefile b/Makefile
@@ -87,7 +87,8 @@ CCAN_OBJS :=					\
 	ccan-tal-str.o				\
 	ccan-tal.o				\
 	ccan-time.o				\
-	ccan-timer.o
+	ccan-timer.o				\
+	ccan-utf8.o
 
 CCAN_HEADERS :=						\
 	$(CCANDIR)/config.h				\
@@ -153,7 +154,8 @@ CCAN_HEADERS :=						\
 	$(CCANDIR)/ccan/tcon/tcon.h			\
 	$(CCANDIR)/ccan/time/time.h			\
 	$(CCANDIR)/ccan/timer/timer.h			\
-	$(CCANDIR)/ccan/typesafe_cb/typesafe_cb.h
+	$(CCANDIR)/ccan/typesafe_cb/typesafe_cb.h	\
+	$(CCANDIR)/ccan/utf8/utf8.h
 
 ALL_GEN_HEADERS += gen_version.h
 
@@ -577,3 +579,5 @@ ccan-rbuf.o: $(CCANDIR)/ccan/rbuf/rbuf.c
 	$(CC) $(CFLAGS) -c -o $@ $<
 ccan-str-base32.o: $(CCANDIR)/ccan/str/base32/base32.c
 	$(CC) $(CFLAGS) -c -o $@ $<
+ccan-utf8.o: $(CCANDIR)/ccan/utf8/utf8.c
+	$(CC) $(CFLAGS) -c -o $@ $<
diff --git a/ccan/README b/ccan/README
@@ -1,3 +1,3 @@
 CCAN imported from http://ccodearchive.net.
 
-CCAN version: init-2432-gd830ca0e
+CCAN version: init-2434-gac8694de
diff --git a/ccan/ccan/timer/_info b/ccan/ccan/timer/_info
@@ -70,7 +70,6 @@ int main(int argc, char *argv[])
 	if (strcmp(argv[1], "depends") == 0) {
 		printf("ccan/array_size\n");
 		printf("ccan/ilog\n");
-		printf("ccan/likely\n");
 		printf("ccan/list\n");
 		printf("ccan/time\n");
 		return 0;

diff --git a/ccan/ccan/timer/timer.c b/ccan/ccan/timer/timer.c
@@ -2,7 +2,6 @@
 #include <ccan/timer/timer.h>
 #include <ccan/array_size/array_size.h>
 #include <ccan/ilog/ilog.h>
-#include <ccan/likely/likely.h>
 #include <stdlib.h>
 #include <stdio.h>
 

diff --git a/ccan/ccan/utf8/LICENSE b/ccan/ccan/utf8/LICENSE
@@ -0,0 +1 @@
+../../licenses/BSD-MIT
diff --git a/ccan/ccan/utf8/_info b/ccan/ccan/utf8/_info
@@ -0,0 +1,48 @@
+#include "config.h"
+#include <stdio.h>
+#include <string.h>
+
+/**
+ * utf8 - Simple routines to encode/decode valid UTF-8.
+ *
+ * This code contains routines to encode and decode UTF-8 characters.
+ * Table and test code stolen entirely from:
+ *    Copyright (c) 2017 Christian Hansen <[email protected]>
+ *    <https://github.com/chansen/c-utf8-valid>
+ * 
+ * Example:
+ *	int main(int argc, char *argv[])
+ *	{
+ *		size_t i;
+ *		struct utf8_state utf8_state = UTF8_STATE_INIT;
+ *		bool decoded = true;
+ *
+ *		for (i = 0; i < strlen(argv[1]); i++) {
+ *			decoded = utf8_decode(&utf8_state, argv[1][i]);
+ *			if (decoded) {
+ *				if (errno != 0)
+ *					err(1, "Invalid UTF8 char %zu-%zu",
+ *					    i - utf8_state.used_len, i);
+ *				printf("Character %u\n", utf8_state.c);
+ *			}
+ *		}
+ *
+ *		if (!decoded)
+ *			errx(1, "Incomplete UTF8");
+ *		return 0;
+ *	}
+ *
+ * License: BSD-MIT
+ */
+int main(int argc, char *argv[])
+{
+	/* Expect exactly one argument */
+	if (argc != 2)
+		return 1;
+
+	if (strcmp(argv[1], "depends") == 0) {
+		return 0;
+	}
+
+	return 1;
+}
diff --git a/ccan/ccan/utf8/test/run-decode.c b/ccan/ccan/utf8/test/run-decode.c
@@ -0,0 +1,266 @@
+#include <ccan/utf8/utf8.h>
+/* Include the C files directly. */
+#include <ccan/utf8/utf8.c>
+#include <ccan/tap/tap.h>
+#include <assert.h>
+
+/* Stolen from https://github.com/chansen/c-utf8-valid/blob/master/test.c */
+
+/*
+ *  UTF-8
+ *
+ *     U+0000..U+007F         00..7F
+ *                         n  C0..C1  80..BF
+ *     U+0080..U+07FF         C2..DF  80..BF
+ *                         n  E0      80..9F  80..BF
+ *     U+0800..U+D7FF         E0..ED  A0..9F  80..BF
+ *     U+D800..U+DFFF      s  ED      A0..BF  80..BF
+ *     U+E000..U+FFFF         EE..EF  80..BF  80..BF
+ *                         n  F0      80..8F  80..BF  80..BF
+ *     U+0800..U+FFFF         F0      80..8F  A0..BF  80..BF
+ *    U+10000..U+10FFFF       F0..F4  90..8F  80..BF  80..BF
+ *
+ *   U-110000..U-1FFFFF    x  F4..F7  90..BF  80..BF  80..BF
+ *                         xn F8      80..87  80..BF  80..BF  80..BF
+ *   U-200000..U-3FFFFFF   x  F8..FB  88..BF  80..BF  80..BF  80..BF
+ *                         xn FC      80..83  80..BF  80..BF  80..BF  80..BF
+ *  U-4000000..U-7FFFFFFF  x  FC..FD  84..BF  80..BF  80..BF  80..BF  80..BF
+ * 
+ *  Legend:
+ *    n = Non-shortest form
+ *    s = Surrogates
+ *    x = Codepoints outside Unicode codespace
+ */
+
+/*
+ *  Encodes the given ordinal [0, 7FFFFFFF] using the UTF-8 encoding scheme
+ *  to the given sequence length [1, 6]. This routine can be used to
+ *  produce well-formed and ill-formed UTF-8.
+ *
+ *  To encode a Unicode scalar value to a well-formed representation:
+ *
+ *   [U+0000, U+007F] should be encoded to a sequence length of 1
+ *   [U+0080, U+07FF] should be encoded to a sequence length of 2
+ *   [U+0800, U+D7FF] should be encoded to a sequence length of 3
+ *   [U+E000, U+FFFF] should be encoded to a sequence length of 3
+ *   [U+10000, U+10FFFF] should be encoded to a sequence length of 4
+ *
+ *  To encode a Unicode scalar value to non-shortest form representation:
+ *
+ *   [U+0000, U+007F] can be encoded to a sequence length of [2, 6]
+ *   [U+0080, U+07FF] can be encoded to a sequence length of [3, 6]
+ *   [U+0800, U+FFFF] can be encoded to a sequence length of [4, 6]
+ *
+ *  To encode an ordinal outside of Unicode codespace:
+ *
+ *   [110000, 1FFFFF] can be encoded to a sequence length of 4
+ *   [200000, 3FFFFFF] can be encoded to a sequence length of 5
+ *   [4000000, 7FFFFFFF] can be encoded to a sequence length of 6
+ */
+
+static char *
+encode_ord(uint32_t ord, size_t len, char *dst) {
+  static const uint32_t kMask[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
+  static const uint32_t kMax[6]  = { 1 <<  7, 1 << 11, 1 << 16, 
+                                     1 << 21, 1 << 26, 1 << 31 };
+  size_t i;
+
+  assert(len >= 1);
+  assert(len <= 6);
+  assert(ord < kMax[len - 1]);
+
+  for (i = len - 1; i > 0; i--) {
+    dst[i] = (ord & 0x3F) | 0x80;
+    ord >>= 6;
+  }
+  dst[0] = ord | kMask[len - 1];
+  return dst;
+}
+
+static int utf8_check(const char *src, size_t len)
+{
+	bool decoded = false;
+	struct utf8_state utf8_state = UTF8_STATE_INIT;
+	size_t i;
+
+	for (i = 0; i < len; i++) {
+		decoded = utf8_decode(&utf8_state, src[i]);
+		if (decoded) {
+			if (errno != 0)
+				return errno;
+		}
+	}
+	if (!decoded)
+		return EMLINK;
+	return 0;
+}
+
+static void
+test_utf8(const char *src, size_t len, int exp_err, unsigned line) {
+  int got_err;
+
+  assert(len <= 255);
+
+  got_err = utf8_check(src, len);
+
+  ok(got_err == exp_err, "Got result %i, expected %i at line %u",
+     got_err, exp_err, line);
+}
+
+#define TEST_UTF8(src, len, exp) \
+  test_utf8(src, len, exp, __LINE__)
+
+
+static void
+test_unicode_scalar_value(void) {
+  uint32_t ord;
+  char src[4];
+
+  /* Unicode scalar value [U+0000, U+007F] */
+  for (ord = 0x0000; ord <= 0x007F; ord++) {
+    encode_ord(ord, 1, src);
+    TEST_UTF8(src, 1, ord ? 0 : ERANGE);
+  }
+
+  /*
+   * Unicode scalar value [U+0080, U+07FF]
+   * The maximal subpart is the length of the truncated sequence
+   */
+  for (ord = 0x0080; ord <= 0x07FF; ord++) {
+    encode_ord(ord, 2, src);
+    TEST_UTF8(src, 2, 0);
+  }
+
+  /*
+   * Unicode scalar value [U+0800, U+D7FF] and [U+E000, U+FFFF]
+   * The maximal subpart is the length of the truncated sequence
+   */
+  for (ord = 0x0800; ord <= 0xFFFF && (ord & 0xF800) != 0xD800; ord++) {
+    encode_ord(ord, 3, src);
+
+    TEST_UTF8(src, 3, 0);
+    if ((ord % (1 << 6)) == 0)
+      TEST_UTF8(src, 2, EMLINK);
+  }
+
+  /*
+   * Unicode scalar value [U+10000, U+10FFF]
+   * The maximal subpart is the length of the truncated sequence
+   */
+  for (ord = 0x10000; ord <= 0x10FFFF; ord++) {
+    encode_ord(ord, 4, src);
+
+    TEST_UTF8(src, 4, 0);
+    if ((ord % (1 << 6)) == 0)
+      TEST_UTF8(src, 3, EMLINK);
+    if ((ord % (1 << 12)) == 0)
+      TEST_UTF8(src, 2, EMLINK);
+  }
+}
+
+static void
+test_non_shortest_form(void) {
+  uint32_t ord;
+  char src[4];
+
+  /*
+   * Non-shortest form 2-byte sequence [U+0000, U+007F]
+   * The maximal subpart is 1-byte
+   */
+  for (ord = 0x0001; ord <= 0x007F; ord++) {
+    encode_ord(ord, 2, src);
+    TEST_UTF8(src, 2, EFBIG);
+  }
+
+  /*
+   * Non-shortest form 3-byte sequence [U+0000, U+07FF]
+   * The maximal subpart is 1-byte
+   */
+  for (ord = 0x0001; ord <= 0x07FF; ord++) {
+    encode_ord(ord, 3, src);
+
+    TEST_UTF8(src, 3, EFBIG);
+    if ((ord % (1 << 6)) == 0)
+      TEST_UTF8(src, 2, EMLINK);
+  }
+
+  /*
+   * Non-shortest form 4-byte sequence [U+0000, U+FFFF]
+   * The maximal subpart is 1-byte
+   */
+  for (ord = 0x0001; ord <= 0xFFFF; ord++) {
+    encode_ord(ord, 4, src);
+
+    TEST_UTF8(src, 4, EFBIG);
+    if ((ord % (1 << 6)) == 0)
+      TEST_UTF8(src, 3, EMLINK);
+    if ((ord % (1 << 12)) == 0)
+      TEST_UTF8(src, 2, EMLINK);
+  }
+}
+
+static void
+test_non_unicode(void) {
+  uint32_t ord;
+  char src[4];
+
+  /*
+   * Code point outside Unicode codespace
+   * The maximal subpart is 1-byte
+   */
+  for (ord = 0x110000; ord <= 0x1FFFFF; ord++) {
+    encode_ord(ord, 4, src);
+
+    TEST_UTF8(src, 4, ERANGE);
+    if ((ord % (1 << 6)) == 0)
+      TEST_UTF8(src, 3, EMLINK);
+    if ((ord % (1 << 12)) == 0)
+      TEST_UTF8(src, 2, EMLINK);
+  }
+}
+
+static void
+test_surrogates(void) {
+  uint32_t ord;
+  char src[4];
+
+  /*
+   * Surrogates [U+D800, U+DFFF]
+   * The maximal subpart is 1-byte
+   */
+  for (ord = 0xD800; ord <= 0xDFFF; ord++) {
+    encode_ord(ord, 3, src);
+
+    TEST_UTF8(src, 3, ERANGE);
+    if ((ord % (1 << 6)) == 0)
+      TEST_UTF8(src, 2, EMLINK);
+  }
+}
+
+static void
+test_continuations(void) {
+  uint8_t ord;
+  char src[4];
+
+  /*
+   * Missplaced continuation [\x80, \xBF]
+   * The maximal subpart is 1-byte
+   */
+  for (ord = 0x80; ord <= 0xBF; ord++) {
+    src[0] = ord;
+    TEST_UTF8(src, 1, EINVAL);
+  }
+}
+
+int
+main(int argc, char **argv)
+{
+  plan_tests(2190906);
+  test_unicode_scalar_value();
+  test_surrogates();
+  test_non_shortest_form();
+  test_non_unicode();
+  test_continuations();
+
+  return exit_status();
+}