Skip to content

Commit

Permalink
Update comments; Fix u8c_decode_utf8_length not validating; Add attri…
Browse files Browse the repository at this point in the history
…bute u8c_DEPRECATED; Deprecate u8c_encode_utf16 and u8c_encode_utf16_length as they're untested (this is not permanent);
  • Loading branch information
bjoernager committed Jul 23, 2023
1 parent a48610b commit 68997e2
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 23 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
# 26

* Update comments
* Fix u8c_decode_utf8_length not validating
* Add attribute u8c_DEPRECATED
* Deprecate u8c_encode_utf16 and u8c_encode_utf16_length as they're untested (this is not permanent)

# 25

* Rename source directory: src => source
Expand Down
4 changes: 2 additions & 2 deletions u8c/include/u8c/format.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ u8c_NO_DISCARD u8c_NO_THROW size_t u8c_decode_utf16_length(uint_least16_t const*
u8c_NO_THROW size_t u8c_encode_utf8(char* u8c_RESTRICT _buffer, uint_least32_t const* u8c_RESTRICT _source, size_t _count);
u8c_NO_THROW size_t u8c_decode_utf8(uint_least32_t* u8c_RESTRICT _buffer, char const* u8c_RESTRICT _source, size_t _count);

u8c_NO_THROW size_t u8c_encode_utf16(uint_least16_t* u8c_RESTRICT _buffer, uint_least32_t const* u8c_RESTRICT _source, size_t _count);
u8c_NO_THROW size_t u8c_decode_utf16(uint_least32_t* u8c_RESTRICT _buffer, uint_least16_t const* u8c_RESTRICT _source, size_t _count);
u8c_DEPRECATED("utf-16 may not be safe") u8c_NO_THROW size_t u8c_encode_utf16(uint_least16_t* u8c_RESTRICT _buffer, uint_least32_t const* u8c_RESTRICT _source, size_t _count);
u8c_DEPRECATED("utf-16 may not be safe") u8c_NO_THROW size_t u8c_decode_utf16(uint_least32_t* u8c_RESTRICT _buffer, uint_least16_t const* u8c_RESTRICT _source, size_t _count);

#ifdef __cplusplus
}
Expand Down
20 changes: 12 additions & 8 deletions u8c/include/u8c/u8c.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,35 +56,39 @@

#ifdef __GNUC__

#define u8c_ALWAYS_INLINE __attribute__ ((__always_inline__))
#define u8c_NO_DISCARD __attribute__ ((__warn_unused_result__))
#define u8c_NO_THROW __attribute__ ((__nothrow__))
#define u8c_UNSEQUENCED __attribute__ ((__const__))
#define u8c_ALWAYS_INLINE __attribute__ ((__always_inline__))
#define u8c_DEPRECATED(_m) __attribute__ ((__deprecated__((_m))))
#define u8c_NO_DISCARD __attribute__ ((__warn_unused_result__))
#define u8c_NO_THROW __attribute__ ((__nothrow__))
#define u8c_UNSEQUENCED __attribute__ ((__const__))

#elif __STDC_VERSION__ >= 202311

#define u8c_ALWAYS_INLINE
#define u8c_NO_DISCARD [[nodiscard]]
#define u8c_DEPRECATED(_m) [[deprecated((_m))]]
#define u8c_NO_DISCARD [[nodiscard]]
#define u8c_NO_THROW
#define u8c_UNSEQUENCED [[unsequenced]]
#define u8c_UNSEQUENCED [[unsequenced]]

#elif __cplusplus >= 201703

#define u8c_ALWAYS_INLINE
#define u8c_NO_DISCARD [[nodiscard]]
#define u8c_DEPRECATED(_m) [[deprecated((_m))]]
#define u8c_NO_DISCARD [[nodiscard]]
#define u8c_NO_THROW
#define u8c_UNSEQUENCED

#else

#define u8c_ALWAYS_INLINE
#define u8c_DEPRECATED(_m)
#define u8c_NO_DISCARD
#define u8c_NO_THROW
#define u8c_UNSEQUENCED

#endif

#define u8c_VERSION ((uint_least32_t)+UINT32_C(0x1D))
#define u8c_VERSION ((uint_least32_t)+UINT32_C(0x1E))

#define u8c_MAXIMUM_CODE_POINT ((uint_least32_t)+UINT32_C(0x0010FFFF))

Expand Down
46 changes: 39 additions & 7 deletions u8c/source/format/decode_utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,31 @@ size_t u8c_decode_utf8(uint_least32_t* const restrict buffer, char const* const

uint_least32_t code_point = UINT32_C(0x0);

// For each octet in the input, we assert the
// following:
//
// 1. It has an appropriate value for its position.
// 2. The amount of remaining octets is
// sufficient to fully decode the current
// sequence.
//
// If these predicates are not true, the octet is
// discarded and the replacement character U+FFFD
// is written in its place.
//
// If the decoded code point lies outside the
// defined valid range of a UTF-32 value - that is,
// it's a surrogate point or larger than
// U+0010FFFF - it is likewise replaced.
//
// If an octet sequence with an otherwise valid
// initiating octet contains any amount of invalid
// values, it is skipped in its entirety and
// replaced.

if ((octet & UINT32_C(0xF8)) == UINT32_C(0xF0)) {
// Four octets:

if (remaining < 0x3u) {
code_point = UINT32_C(0xFFFD);
} else {
Expand All @@ -48,9 +72,9 @@ size_t u8c_decode_utf8(uint_least32_t* const restrict buffer, char const* const
uint_least32_t const octet3 = (uint_least32_t)source[index_in + 0x3];

if (
(octet1 & 0xC0) != 0x80
|| (octet2 & 0xC0) != 0x80
|| (octet3 & 0xC0) != 0x80
(octet1 & UINT32_C(0xC0)) != UINT32_C(0x80)
|| (octet2 & UINT32_C(0xC0)) != UINT32_C(0x80)
|| (octet3 & UINT32_C(0xC0)) != UINT32_C(0x80)
) {
code_point = UINT32_C(0xFFFD);
} else {
Expand All @@ -63,15 +87,17 @@ size_t u8c_decode_utf8(uint_least32_t* const restrict buffer, char const* const

index_in += 0x4;
} else if ((octet & UINT32_C(0xF0)) == UINT32_C(0xE0)) {
// Three octets:

if (remaining < 0x2u) {
code_point = UINT32_C(0xFFFD);
} else {
uint_least32_t const octet1 = (uint_least32_t)source[index_in + 0x1];
uint_least32_t const octet2 = (uint_least32_t)source[index_in + 0x2];

if (
(octet1 & 0xC0) != 0x80
|| (octet2 & 0xC0) != 0x80
(octet1 & UINT32_C(0xC0)) != UINT32_C(0x80)
|| (octet2 & UINT32_C(0xC0)) != UINT32_C(0x80)
) {
code_point = UINT32_C(0xFFFD);
} else {
Expand All @@ -82,13 +108,15 @@ size_t u8c_decode_utf8(uint_least32_t* const restrict buffer, char const* const
}

index_in += 0x3;
} else if ((octet & UINT32_C(0xE0)) == 0xC0) {
} else if ((octet & UINT32_C(0xE0)) == UINT32_C(0xC0)) {
// Two octets:

if (remaining < 0x1u) {
code_point = UINT32_C(0xFFFD);
} else {
uint_least32_t const octet1 = (uint_least32_t)source[index_in + 0x1];

if ((octet1 & 0xC0) != 0x80) {
if ((octet1 & UINT32_C(0xC0)) != UINT32_C(0x80)) {
code_point = UINT32_C(0xFFFD);
} else {
code_point |= (octet ^ UINT32_C(0xC0)) << UINT32_C(0x6);
Expand All @@ -98,10 +126,14 @@ size_t u8c_decode_utf8(uint_least32_t* const restrict buffer, char const* const

index_in += 0x2;
} else if ((octet & UINT32_C(0x80)) == UINT32_C(0x0)) {
// One octet:

code_point |= octet;

++index_in;
} else {
// Invalid:

code_point = UINT32_C(0xFFFD);

++index_in;
Expand Down
14 changes: 8 additions & 6 deletions u8c/source/format/decode_utf8_length.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,15 @@ size_t u8c_decode_utf8_length(char const* const restrict _source, size_t const c
for (ptrdiff_t index = 0x0; index < (ptrdiff_t)count; ++length) {
char unsigned const octet = source[index];

if (octet >= 0xF0u) {
index += 0x4u;
} else if (octet >= 0xE0u) {
index += 0x3u;
} else if (octet >= 0xC0u) {
index += 0x2u;
if ((octet & UINT32_C(0xF8)) == UINT32_C(0xF0)) {
index += 0x4;
} else if ((octet & UINT32_C(0xF0)) == UINT32_C(0xE0)) {
index += 0x3;
} else if ((octet & UINT32_C(0xE0)) == UINT32_C(0xC0)) {
index += 0x2;
} else {
// Valid or not, this is decoded as a single code
// point.
++index;
}
}
Expand Down

0 comments on commit 68997e2

Please sign in to comment.