Skip to content

Commit

Permalink
Added exact utf8 length estimator and exposed writeUtf8 with custom s…
Browse files Browse the repository at this point in the history
…pace reservation on destination buffer

Motivation:

To avoid eager allocation of the destination buffer and to allow length-prefixed encoding of UTF-8 strings with a forward-only access pattern.

Modifications:

The original writeUtf8 now allows customizing the number of bytes reserved on the destination buffer, and an exact UTF-8 length estimator is introduced.

Result:

It is now possible to perform length-first encoding of well-formed UTF-8 char sequences following a forward-only write access pattern on the destination buffer.
  • Loading branch information
franz1981 authored and normanmaurer committed Feb 16, 2018
1 parent dc3036a commit bc8e022
Show file tree
Hide file tree
Showing 2 changed files with 129 additions and 5 deletions.
89 changes: 85 additions & 4 deletions buffer/src/main/java/io/netty/buffer/ByteBufUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -475,14 +475,29 @@ public static ByteBuf writeUtf8(ByteBufAllocator alloc, CharSequence seq) {
/**
 * Encode a {@link CharSequence} in <a href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a> and write
 * it to a {@link ByteBuf}.
 * <p>
 * Equivalent to calling {@link #reserveAndWriteUtf8(ByteBuf, CharSequence, int)} with
 * {@code reserveBytes} set to the value of {@link #utf8MaxBytes(CharSequence)}.
 *
 * @param buf the destination buffer
 * @param seq the characters to encode
 * @return the actual number of bytes written
 */
public static int writeUtf8(ByteBuf buf, CharSequence seq) {
    // Reserve for the worst case so the write cannot run out of space.
    final int worstCaseBytes = utf8MaxBytes(seq);
    return reserveAndWriteUtf8(buf, seq, worstCaseBytes);
}

/**
* Encode a {@link CharSequence} in <a href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a> and write
* it into {@code reserveBytes} of a {@link ByteBuf}.
* <p>
* The {@code reserveBytes} must be computed (i.e. eagerly using {@link #utf8MaxBytes(CharSequence)}
* or exactly with {@link #utf8Bytes(CharSequence)}) to ensure this method does not fail: for performance reasons
* the index checks will be performed using just {@code reserveBytes}.<br>
* This method returns the actual number of bytes written.
*/
public static int reserveAndWriteUtf8(ByteBuf buf, CharSequence seq, int reserveBytes) {
for (;;) {
if (buf instanceof AbstractByteBuf) {
AbstractByteBuf byteBuf = (AbstractByteBuf) buf;
byteBuf.ensureWritable0(utf8MaxBytes(seq));
byteBuf.ensureWritable0(reserveBytes);
int written = writeUtf8(byteBuf, byteBuf.writerIndex, seq, seq.length());
byteBuf.writerIndex += written;
return written;
Expand Down Expand Up @@ -521,7 +536,7 @@ static int writeUtf8(AbstractByteBuf buffer, int writerIndex, CharSequence seq,
// duplicate bounds checking with charAt. If an IndexOutOfBoundsException is thrown we will
// re-throw a more informative exception describing the problem.
c2 = seq.charAt(++i);
} catch (IndexOutOfBoundsException e) {
} catch (IndexOutOfBoundsException ignored) {
buffer._setByte(writerIndex++, WRITE_UTF_UNKNOWN);
break;
}
Expand All @@ -545,11 +560,77 @@ static int writeUtf8(AbstractByteBuf buffer, int writerIndex, CharSequence seq,
return writerIndex - oldWriterIndex;
}

/**
 * Computes an upper bound for the UTF-8 encoded size of a character sequence of the given length.
 *
 * @param seqLength the number of {@code char}s in the sequence
 * @return {@code seqLength} multiplied by {@code MAX_BYTES_PER_CHAR_UTF8}
 */
public static int utf8MaxBytes(final int seqLength) {
    final int upperBound = MAX_BYTES_PER_CHAR_UTF8 * seqLength;
    return upperBound;
}

/**
 * Returns max bytes length of UTF8 character sequence.
 * <p>
 * It behaves like {@link #utf8MaxBytes(int)} applied to {@code seq} {@link CharSequence#length()}.
 *
 * @param seq the character sequence whose worst-case encoded size is requested
 * @return the value of {@link #utf8MaxBytes(int)} for {@code seq.length()}
 */
public static int utf8MaxBytes(CharSequence seq) {
    // Single delegating return: the stale inline multiplication that preceded it made this
    // statement unreachable (a compile error) and duplicated the worst-case formula.
    return utf8MaxBytes(seq.length());
}

/**
 * Computes the exact number of bytes the UTF-8 encoding of a character sequence occupies.
 * <p>
 * The result is the exact length according to {@link #writeUtf8(ByteBuf, CharSequence)}.
 *
 * @param seq the character sequence to measure
 * @return the encoded length in bytes
 */
public static int utf8Bytes(final CharSequence seq) {
    if (seq instanceof AsciiString) {
        // An AsciiString is counted as one byte per char.
        return seq.length();
    }
    final int total = seq.length();
    // ASCII fast path: every leading char below 0x80 contributes exactly one byte.
    int asciiPrefix = 0;
    for (; asciiPrefix < total; asciiPrefix++) {
        if (seq.charAt(asciiPrefix) >= 0x80) {
            break;
        }
    }
    if (asciiPrefix >= total) {
        return asciiPrefix;
    }
    // The non-ASCII remainder is measured in a separate method to keep this fast path small.
    return asciiPrefix + utf8Bytes(seq, asciiPrefix, total);
}

/**
 * Counts the UTF-8 encoded bytes of the chars of {@code seq} in the range {@code [start, length)}.
 * <p>
 * Chars below {@code 0x80} count 1 byte, other chars below {@code 0x800} count 2 bytes, a high
 * surrogate followed by a low surrogate counts 4 bytes, and every other non-surrogate char counts
 * 3 bytes. Malformed surrogate usage is counted as the replacement bytes noted inline
 * ({@code WRITE_UTF_UNKNOWN}).
 *
 * @param seq    the character sequence to measure
 * @param start  index of the first char to count
 * @param length exclusive end index; the caller in this class passes {@code seq.length()}
 * @return the number of encoded bytes for the given range
 */
private static int utf8Bytes(final CharSequence seq, final int start, final int length) {
    int encodedLength = 0;
    for (int i = start; i < length; i++) {
        final char c = seq.charAt(i);
        // making it 100% branchless isn't rewarding due to the many bit operations necessary!
        if (c < 0x800) {
            // branchless version of: (c <= 127 ? 0:1) + 1
            encodedLength += ((0x7f - c) >>> 31) + 1;
        } else if (isSurrogate(c)) {
            if (!Character.isHighSurrogate(c)) {
                // Lone low surrogate: WRITE_UTF_UNKNOWN
                encodedLength++;
                continue;
            }
            // A surrogate pair consumes 2 characters. Check the bound explicitly instead of
            // relying on charAt throwing IndexOutOfBoundsException at the end of the input.
            if (++i >= length) {
                // High surrogate at the very end: WRITE_UTF_UNKNOWN
                encodedLength++;
                break;
            }
            if (!Character.isLowSurrogate(seq.charAt(i))) {
                // WRITE_UTF_UNKNOWN + (Character.isHighSurrogate(c2) ? WRITE_UTF_UNKNOWN : c2)
                encodedLength += 2;
                continue;
            }
            // See http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G2630.
            encodedLength += 4;
        } else {
            encodedLength += 3;
        }
    }
    return encodedLength;
}

/**
Expand Down
45 changes: 44 additions & 1 deletion buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,7 @@ public void testWriteUtf8Surrogates() {
ByteBufUtil.writeUtf8(buf2, surrogateString);

assertEquals(buf, buf2);
assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));

buf.release();
buf2.release();
Expand All @@ -285,6 +286,7 @@ public void testWriteUtf8InvalidOnlyTrailingSurrogate() {
ByteBufUtil.writeUtf8(buf2, surrogateString);

assertEquals(buf, buf2);
assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));

buf.release();
buf2.release();
Expand All @@ -303,6 +305,7 @@ public void testWriteUtf8InvalidOnlyLeadingSurrogate() {
ByteBufUtil.writeUtf8(buf2, surrogateString);

assertEquals(buf, buf2);
assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));

buf.release();
buf2.release();
Expand All @@ -322,6 +325,7 @@ public void testWriteUtf8InvalidSurrogatesSwitched() {
ByteBufUtil.writeUtf8(buf2, surrogateString);

assertEquals(buf, buf2);
assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));

buf.release();
buf2.release();
Expand All @@ -341,7 +345,7 @@ public void testWriteUtf8InvalidTwoLeadingSurrogates() {
ByteBufUtil.writeUtf8(buf2, surrogateString);

assertEquals(buf, buf2);

assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));
buf.release();
buf2.release();
}
Expand All @@ -360,6 +364,7 @@ public void testWriteUtf8InvalidTwoTrailingSurrogates() {
ByteBufUtil.writeUtf8(buf2, surrogateString);

assertEquals(buf, buf2);
assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));

buf.release();
buf2.release();
Expand All @@ -376,6 +381,7 @@ public void testWriteUtf8InvalidEndOnLeadingSurrogate() {
ByteBufUtil.writeUtf8(buf2, surrogateString);

assertEquals(buf, buf2);
assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));

buf.release();
buf2.release();
Expand All @@ -392,6 +398,7 @@ public void testWriteUtf8InvalidEndOnTrailingSurrogate() {
ByteBufUtil.writeUtf8(buf2, surrogateString);

assertEquals(buf, buf2);
assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));

buf.release();
buf2.release();
Expand Down Expand Up @@ -546,6 +553,42 @@ public void testIsTextWithInvalidIndexAndLength() {
}
}

@Test
public void testUtf8Bytes() {
    // ASCII text followed by non-ASCII characters.
    checkUtf8Bytes("Some UTF-8 like äÄ∏ŒŒ");
}

@Test
public void testUtf8BytesWithSurrogates() {
    // A high surrogate followed by a low surrogate, enclosed by ASCII chars.
    checkUtf8Bytes("a\uD800\uDC00b");
}

@Test
public void testUtf8BytesWithNonSurrogates3Bytes() {
    // A non-surrogate char above 0x7FF, enclosed by ASCII chars.
    checkUtf8Bytes("a\uE000b");
}

@Test
public void testUtf8BytesWithNonSurrogatesNonAscii() {
    // 0x81 is the second char value outside the ASCII range (0x80 is the first).
    final char aboveAscii = (char) 0x81;
    final String text = "a" + aboveAscii + "b";
    checkUtf8Bytes(text);
}

/**
 * Asserts that {@code ByteBufUtil.utf8Bytes} reports the same number of bytes that
 * {@code ByteBufUtil.writeUtf8} returns when writing the sequence into a buffer.
 */
private static void checkUtf8Bytes(final CharSequence charSequence) {
    final int capacity = ByteBufUtil.utf8MaxBytes(charSequence);
    final ByteBuf buf = Unpooled.buffer(capacity);
    try {
        final int writtenBytes = ByteBufUtil.writeUtf8(buf, charSequence);
        assertEquals(writtenBytes, ByteBufUtil.utf8Bytes(charSequence));
    } finally {
        // Always release the buffer, even when the assertion fails.
        buf.release();
    }
}

private static void assertIsText(byte[] bytes, boolean expected, Charset charset) {
ByteBuf buffer = Unpooled.buffer();
try {
Expand Down

0 comments on commit bc8e022

Please sign in to comment.