Skip to content

Commit

Permalink
ccan: add UTF-8 module for checking alias fields.
Browse files Browse the repository at this point in the history
Signed-off-by: Rusty Russell <[email protected]>
  • Loading branch information
rustyrussell authored and cdecker committed Jul 1, 2018
1 parent fecef61 commit 2639b1e
Show file tree
Hide file tree
Showing 11 changed files with 624 additions and 5 deletions.
8 changes: 6 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,8 @@ CCAN_OBJS := \
ccan-tal-str.o \
ccan-tal.o \
ccan-time.o \
ccan-timer.o
ccan-timer.o \
ccan-utf8.o

CCAN_HEADERS := \
$(CCANDIR)/config.h \
Expand Down Expand Up @@ -153,7 +154,8 @@ CCAN_HEADERS := \
$(CCANDIR)/ccan/tcon/tcon.h \
$(CCANDIR)/ccan/time/time.h \
$(CCANDIR)/ccan/timer/timer.h \
$(CCANDIR)/ccan/typesafe_cb/typesafe_cb.h
$(CCANDIR)/ccan/typesafe_cb/typesafe_cb.h \
$(CCANDIR)/ccan/utf8/utf8.h

ALL_GEN_HEADERS += gen_version.h

Expand Down Expand Up @@ -577,3 +579,5 @@ ccan-rbuf.o: $(CCANDIR)/ccan/rbuf/rbuf.c
$(CC) $(CFLAGS) -c -o $@ $<
ccan-str-base32.o: $(CCANDIR)/ccan/str/base32/base32.c
$(CC) $(CFLAGS) -c -o $@ $<
ccan-utf8.o: $(CCANDIR)/ccan/utf8/utf8.c
$(CC) $(CFLAGS) -c -o $@ $<
2 changes: 1 addition & 1 deletion ccan/README
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
CCAN imported from http://ccodearchive.net.

CCAN version: init-2432-gd830ca0e
CCAN version: init-2434-gac8694de
1 change: 0 additions & 1 deletion ccan/ccan/timer/_info
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@ int main(int argc, char *argv[])
if (strcmp(argv[1], "depends") == 0) {
printf("ccan/array_size\n");
printf("ccan/ilog\n");
printf("ccan/likely\n");
printf("ccan/list\n");
printf("ccan/time\n");
return 0;
Expand Down
1 change: 0 additions & 1 deletion ccan/ccan/timer/timer.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
#include <ccan/timer/timer.h>
#include <ccan/array_size/array_size.h>
#include <ccan/ilog/ilog.h>
#include <ccan/likely/likely.h>
#include <stdlib.h>
#include <stdio.h>

Expand Down
1 change: 1 addition & 0 deletions ccan/ccan/utf8/LICENSE
48 changes: 48 additions & 0 deletions ccan/ccan/utf8/_info
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#include "config.h"
#include <stdio.h>
#include <string.h>

/**
* utf8 - Simple routines to encode/decode valid UTF-8.
*
* This code contains routines to encode and decode UTF-8 characters.
* Table and test code stolen entirely from:
* Copyright (c) 2017 Christian Hansen <[email protected]>
* <https://github.com/chansen/c-utf8-valid>
*
* Example:
* int main(int argc, char *argv[])
* {
* size_t i;
* struct utf8_state utf8_state = UTF8_STATE_INIT;
* bool decoded = true;
*
* for (i = 0; i < strlen(argv[1]); i++) {
* decoded = utf8_decode(&utf8_state, argv[1][i]);
* if (decoded) {
* if (errno != 0)
* err(1, "Invalid UTF8 char %zu-%zu",
* i - utf8_state.used_len, i);
* printf("Character %u\n", utf8_state.c);
* }
* }
*
* if (!decoded)
* errx(1, "Incomplete UTF8");
* return 0;
* }
*
* License: BSD-MIT
*/
/*
 * Module metadata entry point.
 *
 * Takes exactly one query argument.  The only query handled here is
 * "depends", which prints nothing (no dependencies are listed) and
 * exits with 0.  Any other invocation exits with 1.
 */
int main(int argc, char *argv[])
{
	/* Succeed only for the single-argument "depends" query. */
	if (argc == 2 && strcmp(argv[1], "depends") == 0)
		return 0;

	/* Wrong argument count or unrecognised query. */
	return 1;
}
266 changes: 266 additions & 0 deletions ccan/ccan/utf8/test/run-decode.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,266 @@
#include <ccan/utf8/utf8.h>
/* Include the C files directly. */
#include <ccan/utf8/utf8.c>
#include <ccan/tap/tap.h>
#include <assert.h>

/* Stolen from https://github.com/chansen/c-utf8-valid/blob/master/test.c */

/*
* UTF-8
*
* U+0000..U+007F 00..7F
* n C0..C1 80..BF
* U+0080..U+07FF C2..DF 80..BF
* n E0 80..9F 80..BF
* U+0800..U+D7FF E0..ED A0..9F 80..BF
* U+D800..U+DFFF s ED A0..BF 80..BF
* U+E000..U+FFFF EE..EF 80..BF 80..BF
* n F0 80..8F 80..BF 80..BF
* U+0800..U+FFFF F0 80..8F A0..BF 80..BF
* U+10000..U+10FFFF F0..F4 90..8F 80..BF 80..BF
*
* U-110000..U-1FFFFF x F4..F7 90..BF 80..BF 80..BF
* xn F8 80..87 80..BF 80..BF 80..BF
* U-200000..U-3FFFFFF x F8..FB 88..BF 80..BF 80..BF 80..BF
* xn FC 80..83 80..BF 80..BF 80..BF 80..BF
* U-4000000..U-7FFFFFFF x FC..FD 84..BF 80..BF 80..BF 80..BF 80..BF
*
* Legend:
* n = Non-shortest form
* s = Surrogates
* x = Codepoints outside Unicode codespace
*/

/*
* Encodes the given ordinal [0, 7FFFFFFF] using the UTF-8 encoding scheme
* to the given sequence length [1, 6]. This routine can be used to
* produce well-formed and ill-formed UTF-8.
*
* To encode a Unicode scalar value to a well-formed representation:
*
* [U+0000, U+007F] should be encoded to a sequence length of 1
* [U+0080, U+07FF] should be encoded to a sequence length of 2
* [U+0800, U+D7FF] should be encoded to a sequence length of 3
* [U+E000, U+FFFF] should be encoded to a sequence length of 3
* [U+10000, U+10FFFF] should be encoded to a sequence length of 4
*
* To encode a Unicode scalar value to non-shortest form representation:
*
* [U+0000, U+007F] can be encoded to a sequence length of [2, 6]
* [U+0080, U+07FF] can be encoded to a sequence length of [3, 6]
* [U+0800, U+FFFF] can be encoded to a sequence length of [4, 6]
*
* To encode an ordinal outside of Unicode codespace:
*
* [110000, 1FFFFF] can be encoded to a sequence length of 4
* [200000, 3FFFFFF] can be encoded to a sequence length of 5
* [4000000, 7FFFFFFF] can be encoded to a sequence length of 6
*/

/*
 * Encode ord into dst using exactly len bytes of the (extended, up to
 * 6-byte) UTF-8 bit layout, without any well-formedness checks, so it can
 * produce overlong, surrogate and out-of-range sequences on purpose.
 *
 * ord: ordinal to encode; must fit in the payload bits of a len-byte
 *      sequence (asserted).
 * len: sequence length, 1..6 (asserted).
 * dst: output buffer with room for at least len bytes; not NUL-terminated.
 *
 * Returns dst.
 */
static char *
encode_ord(uint32_t ord, size_t len, char *dst) {
	/* Lead-byte marker bits, indexed by len - 1. */
	static const uint32_t kMask[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
	/*
	 * Exclusive upper bound on ord, indexed by len - 1.  The shifts are
	 * done on uint32_t operands: a plain `1 << 31` shifts into the sign
	 * bit of int, which is undefined behaviour.
	 */
	static const uint32_t kMax[6] = {
		(uint32_t)1 << 7,  (uint32_t)1 << 11, (uint32_t)1 << 16,
		(uint32_t)1 << 21, (uint32_t)1 << 26, (uint32_t)1 << 31
	};
	size_t i;

	assert(len >= 1);
	assert(len <= 6);
	assert(ord < kMax[len - 1]);

	/* Fill continuation bytes from the tail, six payload bits each. */
	for (i = len - 1; i > 0; i--) {
		dst[i] = (char)((ord & 0x3F) | 0x80);
		ord >>= 6;
	}
	/* Whatever is left of ord goes into the lead byte under its marker. */
	dst[0] = (char)(ord | kMask[len - 1]);
	return dst;
}

static int utf8_check(const char *src, size_t len)
{
bool decoded = false;
struct utf8_state utf8_state = UTF8_STATE_INIT;
size_t i;

for (i = 0; i < len; i++) {
decoded = utf8_decode(&utf8_state, src[i]);
if (decoded) {
if (errno != 0)
return errno;
}
}
if (!decoded)
return EMLINK;
return 0;
}

/*
 * Run utf8_check() over the first len bytes of src and record one test
 * result via ok(): it passes iff the returned code equals exp_err.
 * line is the caller's source line, echoed in the result message.
 */
static void
test_utf8(const char *src, size_t len, int exp_err, unsigned line)
{
	int result;

	/* Sanity bound on the input length. */
	assert(len <= 255);

	result = utf8_check(src, len);
	ok(exp_err == result, "Got result %i, expected %i at line %u",
	   result, exp_err, line);
}

/* Call test_utf8() on (src, len) expecting result exp, tagging it with the call site's line. */
#define TEST_UTF8(src, len, exp) test_utf8((src), (len), (exp), __LINE__)


/*
 * Well-formed encodings: each ordinal is encoded at its shortest length
 * and expected to decode with result 0 (except NUL, expected ERANGE).
 * Truncated prefixes of multi-byte sequences are expected to yield EMLINK.
 */
static void
test_unicode_scalar_value(void)
{
	char buf[4];
	uint32_t cp;

	/* 1-byte sequences, [U+0000, U+007F]; U+0000 is expected to fail. */
	for (cp = 0; cp < 0x80; cp++) {
		encode_ord(cp, 1, buf);
		TEST_UTF8(buf, 1, cp == 0 ? ERANGE : 0);
	}

	/* 2-byte sequences, [U+0080, U+07FF]. */
	for (cp = 0x80; cp < 0x800; cp++) {
		encode_ord(cp, 2, buf);
		TEST_UTF8(buf, 2, 0);
	}

	/*
	 * 3-byte sequences starting at U+0800.
	 *
	 * NOTE(review): the loop condition ends the loop at the first
	 * surrogate (U+D800), so only [U+0800, U+D7FF] is exercised here and
	 * [U+E000, U+FFFF] is never reached.  Covering that range would add
	 * tests, so the plan_tests() count in main() would need raising too.
	 */
	for (cp = 0x0800; cp <= 0xFFFF && (cp & 0xF800) != 0xD800; cp++) {
		encode_ord(cp, 3, buf);

		TEST_UTF8(buf, 3, 0);
		/* Once per distinct 2-byte prefix: truncated input. */
		if ((cp & 0x3F) == 0)
			TEST_UTF8(buf, 2, EMLINK);
	}

	/* 4-byte sequences, [U+10000, U+10FFFF]. */
	for (cp = 0x10000; cp < 0x110000; cp++) {
		encode_ord(cp, 4, buf);

		TEST_UTF8(buf, 4, 0);
		/* Once per distinct 3-byte prefix. */
		if ((cp & 0x3F) == 0)
			TEST_UTF8(buf, 3, EMLINK);
		/* Once per distinct 2-byte prefix. */
		if ((cp & 0xFFF) == 0)
			TEST_UTF8(buf, 2, EMLINK);
	}
}

/*
 * Overlong encodings: ordinals encoded with more bytes than necessary
 * are expected to yield EFBIG; truncated prefixes are expected to yield
 * EMLINK.  Every loop starts at U+0001, so U+0000 is not covered here.
 */
static void
test_non_shortest_form(void)
{
	char buf[4];
	uint32_t cp;

	/* [U+0001, U+007F] forced into 2 bytes. */
	for (cp = 1; cp < 0x80; cp++) {
		encode_ord(cp, 2, buf);
		TEST_UTF8(buf, 2, EFBIG);
	}

	/* [U+0001, U+07FF] forced into 3 bytes. */
	for (cp = 1; cp < 0x800; cp++) {
		encode_ord(cp, 3, buf);

		TEST_UTF8(buf, 3, EFBIG);
		/* Once per distinct 2-byte prefix: truncated input. */
		if ((cp & 0x3F) == 0)
			TEST_UTF8(buf, 2, EMLINK);
	}

	/* [U+0001, U+FFFF] forced into 4 bytes. */
	for (cp = 1; cp < 0x10000; cp++) {
		encode_ord(cp, 4, buf);

		TEST_UTF8(buf, 4, EFBIG);
		/* Once per distinct 3-byte prefix. */
		if ((cp & 0x3F) == 0)
			TEST_UTF8(buf, 3, EMLINK);
		/* Once per distinct 2-byte prefix. */
		if ((cp & 0xFFF) == 0)
			TEST_UTF8(buf, 2, EMLINK);
	}
}

/*
 * Ordinals beyond the Unicode codespace, [0x110000, 0x1FFFFF], encoded
 * as 4-byte sequences: the full sequence is expected to yield ERANGE,
 * truncated prefixes EMLINK.
 */
static void
test_non_unicode(void)
{
	char buf[4];
	uint32_t cp;

	for (cp = 0x110000; cp < 0x200000; cp++) {
		encode_ord(cp, 4, buf);

		TEST_UTF8(buf, 4, ERANGE);
		/* Once per distinct 3-byte prefix: truncated input. */
		if ((cp & 0x3F) == 0)
			TEST_UTF8(buf, 3, EMLINK);
		/* Once per distinct 2-byte prefix. */
		if ((cp & 0xFFF) == 0)
			TEST_UTF8(buf, 2, EMLINK);
	}
}

/*
 * Surrogates [U+D800, U+DFFF] encoded as 3-byte sequences: the full
 * sequence is expected to yield ERANGE, the 2-byte prefix EMLINK.
 */
static void
test_surrogates(void)
{
	char buf[4];
	uint32_t cp;

	for (cp = 0xD800; cp < 0xE000; cp++) {
		encode_ord(cp, 3, buf);

		TEST_UTF8(buf, 3, ERANGE);
		/* Once per distinct 2-byte prefix: truncated input. */
		if ((cp & 0x3F) == 0)
			TEST_UTF8(buf, 2, EMLINK);
	}
}

/*
 * Misplaced continuation bytes [\x80, \xBF]: each one, fed on its own
 * as a 1-byte input, is expected to yield EINVAL.
 */
static void
test_continuations(void)
{
	char buf[4];
	uint8_t byte;

	for (byte = 0x80; byte < 0xC0; byte++) {
		/* Only the first byte of buf is set and examined. */
		buf[0] = byte;
		TEST_UTF8(buf, 1, EINVAL);
	}
}

/*
 * Test driver: declares the planned number of results, runs every test
 * group in a fixed order, and returns the tap exit status.
 */
int
main(int argc, char **argv)
{
	(void)argc;
	(void)argv;

	/*
	 * Planned total (2190906), broken down by the number of results each
	 * group produces, in call order:
	 *   test_unicode_scalar_value  1121344
	 *   test_surrogates               2080
	 *   test_non_shortest_form       68778
	 *   test_non_unicode            998640
	 *   test_continuations              64
	 */
	plan_tests(1121344 + 2080 + 68778 + 998640 + 64);

	test_unicode_scalar_value();
	test_surrogates();
	test_non_shortest_form();
	test_non_unicode();
	test_continuations();

	return exit_status();
}
Loading

0 comments on commit 2639b1e

Please sign in to comment.