Update comments; Fix u8c_decode_utf8_length not validating; Add attribute u8c_DEPRECATED; Deprecate u8c_encode_utf16 and u8c_encode_utf16_length as they're untested (this is not permanent);
bjoernager · Jul 23, 2023 · 68997e2 · 68997e2
1 parent a48610b
commit 68997e2
Show file tree

Hide file tree

Showing 5 changed files with 68 additions and 23 deletions.
diff --git a/CHANGELOG.txt b/CHANGELOG.txt
@@ -1,3 +1,10 @@
+# 26
+
+* Update comments
+* Fix u8c_decode_utf8_length not validating
+* Add attribute u8c_DEPRECATED
+* Deprecate u8c_encode_utf16 and u8c_encode_utf16_length as they're untested (this is not permanent)
+
 # 25
 
 * Rename source directory: src => source

diff --git a/u8c/include/u8c/format.h b/u8c/include/u8c/format.h
@@ -42,8 +42,8 @@ u8c_NO_DISCARD u8c_NO_THROW size_t u8c_decode_utf16_length(uint_least16_t const*
 u8c_NO_THROW size_t u8c_encode_utf8(char*           u8c_RESTRICT _buffer, uint_least32_t const* u8c_RESTRICT _source, size_t _count);
 u8c_NO_THROW size_t u8c_decode_utf8(uint_least32_t* u8c_RESTRICT _buffer, char const*           u8c_RESTRICT _source, size_t _count);
 
-u8c_NO_THROW size_t u8c_encode_utf16(uint_least16_t* u8c_RESTRICT _buffer, uint_least32_t const* u8c_RESTRICT _source, size_t _count);
-u8c_NO_THROW size_t u8c_decode_utf16(uint_least32_t* u8c_RESTRICT _buffer, uint_least16_t const* u8c_RESTRICT _source, size_t _count);
+u8c_DEPRECATED("utf-16 may not be safe") u8c_NO_THROW size_t u8c_encode_utf16(uint_least16_t* u8c_RESTRICT _buffer, uint_least32_t const* u8c_RESTRICT _source, size_t _count);
+u8c_DEPRECATED("utf-16 may not be safe") u8c_NO_THROW size_t u8c_decode_utf16(uint_least32_t* u8c_RESTRICT _buffer, uint_least16_t const* u8c_RESTRICT _source, size_t _count);
 
 #ifdef __cplusplus
 }

diff --git a/u8c/include/u8c/u8c.h b/u8c/include/u8c/u8c.h
@@ -56,35 +56,39 @@
 
 #ifdef __GNUC__
 
-#define u8c_ALWAYS_INLINE __attribute__ ((__always_inline__))
-#define u8c_NO_DISCARD    __attribute__ ((__warn_unused_result__))
-#define u8c_NO_THROW      __attribute__ ((__nothrow__))
-#define u8c_UNSEQUENCED   __attribute__ ((__const__))
+#define u8c_ALWAYS_INLINE  __attribute__ ((__always_inline__))
+#define u8c_DEPRECATED(_m) __attribute__ ((__deprecated__((_m))))
+#define u8c_NO_DISCARD     __attribute__ ((__warn_unused_result__))
+#define u8c_NO_THROW       __attribute__ ((__nothrow__))
+#define u8c_UNSEQUENCED    __attribute__ ((__const__))
 
 #elif __STDC_VERSION__ >= 202311
 
 #define u8c_ALWAYS_INLINE
-#define u8c_NO_DISCARD    [[nodiscard]]
+#define u8c_DEPRECATED(_m) [[deprecated((_m))]]
+#define u8c_NO_DISCARD     [[nodiscard]]
 #define u8c_NO_THROW
-#define u8c_UNSEQUENCED   [[unsequenced]]
+#define u8c_UNSEQUENCED    [[unsequenced]]
 
 #elif __cplusplus >= 201703
 
 #define u8c_ALWAYS_INLINE
-#define u8c_NO_DISCARD    [[nodiscard]]
+#define u8c_DEPRECATED(_m) [[deprecated((_m))]]
+#define u8c_NO_DISCARD     [[nodiscard]]
 #define u8c_NO_THROW
 #define u8c_UNSEQUENCED
 
 #else
 
 #define u8c_ALWAYS_INLINE
+#define u8c_DEPRECATED(_m)
 #define u8c_NO_DISCARD
 #define u8c_NO_THROW
 #define u8c_UNSEQUENCED
 
 #endif
 
-#define u8c_VERSION ((uint_least32_t)+UINT32_C(0x1D))
+#define u8c_VERSION ((uint_least32_t)+UINT32_C(0x1E))
 
 #define u8c_MAXIMUM_CODE_POINT ((uint_least32_t)+UINT32_C(0x0010FFFF))
 

diff --git a/u8c/source/format/decode_utf8.c b/u8c/source/format/decode_utf8.c
@@ -39,7 +39,31 @@ size_t u8c_decode_utf8(uint_least32_t* const restrict buffer, char const* const
 
 		uint_least32_t code_point = UINT32_C(0x0);
 
+		// For each octet in the input, we assert the
+		// following:
+		//
+		// 1. It has an appropriate value for its position.
+		// 2. The number of remaining octets is
+		//    sufficient to fully decode the current
+		//    sequence.
+		//
+		// If these predicates are not true, the octet is
+		// discarded and the replacement character U+FFFD
+		// is written in its place.
+		//
+		// If the decoded code point lies outside the
+		// defined valid range of a UTF-32 value - that is,
+		// it's a surrogate point or larger than
+		// U+0010FFFF - it is likewise replaced.
+		//
+		// If an octet sequence with an otherwise valid
+		// initiating octet contains any number of invalid
+		// values, it is skipped in its entirety and
+		// replaced.
+
 		if ((octet & UINT32_C(0xF8)) == UINT32_C(0xF0)) {
+			// Four octets:
+
 			if (remaining < 0x3u) {
 				code_point = UINT32_C(0xFFFD);
 			} else {
@@ -48,9 +72,9 @@ size_t u8c_decode_utf8(uint_least32_t* const restrict buffer, char const* const
 				uint_least32_t const octet3 = (uint_least32_t)source[index_in + 0x3];
 
 				if (
-					   (octet1 & 0xC0) != 0x80
-					|| (octet2 & 0xC0) != 0x80
-					|| (octet3 & 0xC0) != 0x80
+					   (octet1 & UINT32_C(0xC0)) != UINT32_C(0x80)
+					|| (octet2 & UINT32_C(0xC0)) != UINT32_C(0x80)
+					|| (octet3 & UINT32_C(0xC0)) != UINT32_C(0x80)
 				) {
 					code_point = UINT32_C(0xFFFD);
 				} else {
@@ -63,15 +87,17 @@ size_t u8c_decode_utf8(uint_least32_t* const restrict buffer, char const* const
 
 			index_in += 0x4;
 		} else if ((octet & UINT32_C(0xF0)) == UINT32_C(0xE0)) {
+			// Three octets:
+
 			if (remaining < 0x2u) {
 				code_point = UINT32_C(0xFFFD);
 			} else {
 				uint_least32_t const octet1 = (uint_least32_t)source[index_in + 0x1];
 				uint_least32_t const octet2 = (uint_least32_t)source[index_in + 0x2];
 
 				if (
-					   (octet1 & 0xC0) != 0x80
-					|| (octet2 & 0xC0) != 0x80
+					   (octet1 & UINT32_C(0xC0)) != UINT32_C(0x80)
+					|| (octet2 & UINT32_C(0xC0)) != UINT32_C(0x80)
 				) {
 					code_point = UINT32_C(0xFFFD);
 				} else {
@@ -82,13 +108,15 @@ size_t u8c_decode_utf8(uint_least32_t* const restrict buffer, char const* const
 			}
 
 			index_in += 0x3;
-		} else if ((octet & UINT32_C(0xE0)) == 0xC0) {
+		} else if ((octet & UINT32_C(0xE0)) == UINT32_C(0xC0)) {
+			// Two octets:
+
 			if (remaining < 0x1u) {
 				code_point = UINT32_C(0xFFFD);
 			} else {
 				uint_least32_t const octet1 = (uint_least32_t)source[index_in + 0x1];
 
-				if ((octet1 & 0xC0) != 0x80) {
+				if ((octet1 & UINT32_C(0xC0)) != UINT32_C(0x80)) {
 					code_point = UINT32_C(0xFFFD);
 				} else {
 					code_point |= (octet  ^ UINT32_C(0xC0)) << UINT32_C(0x6);
@@ -98,10 +126,14 @@ size_t u8c_decode_utf8(uint_least32_t* const restrict buffer, char const* const
 
 			index_in += 0x2;
 		} else if ((octet & UINT32_C(0x80)) == UINT32_C(0x0)) {
+			// One octet:
+
 			code_point |= octet;
 
 			++index_in;
 		} else {
+			// Invalid:
+
 			code_point = UINT32_C(0xFFFD);
 
 			++index_in;

diff --git a/u8c/source/format/decode_utf8_length.c b/u8c/source/format/decode_utf8_length.c
@@ -34,13 +34,15 @@ size_t u8c_decode_utf8_length(char const* const restrict _source, size_t const c
 	for (ptrdiff_t index = 0x0; index < (ptrdiff_t)count; ++length) {
 		char unsigned const octet = source[index];
 
-		if (octet >= 0xF0u) {
-			index += 0x4u;
-		} else if (octet >= 0xE0u) {
-			index += 0x3u;
-		} else if (octet >= 0xC0u) {
-			index += 0x2u;
+		if ((octet & UINT32_C(0xF8)) == UINT32_C(0xF0)) {
+			index += 0x4;
+		} else if ((octet & UINT32_C(0xF0)) == UINT32_C(0xE0)) {
+			index += 0x3;
+		} else if ((octet & UINT32_C(0xE0)) == UINT32_C(0xC0)) {
+			index += 0x2;
 		} else {
+			// Valid or not, this is decoded as a single code
+			// point.
 			++index;
 		}
 	}