Added exact utf8 length estimator and exposed writeUtf8 with custom s…

…pace reservation on destination buffer Motivation: To avoid eager allocation of the destination and to perform length-prefixed encoding of UTF-8 strings with a forward-only access pattern. Modifications: The original writeUtf8 is modified to allow customization of the bytes reserved on the destination buffer, and an exact UTF-8 length estimator is introduced. Result: It is now possible to perform length-first encoding of well-formed UTF-8 char sequences following a forward-only write access pattern on the destination buffer.
jchenga · Feb 16, 2018 · bc8e022 · bc8e022
1 parent dc3036a
commit bc8e022
Show file tree

Hide file tree

Showing 2 changed files with 129 additions and 5 deletions.
diff --git a/buffer/src/main/java/io/netty/buffer/ByteBufUtil.java b/buffer/src/main/java/io/netty/buffer/ByteBufUtil.java
@@ -475,14 +475,29 @@ public static ByteBuf writeUtf8(ByteBufAllocator alloc, CharSequence seq) {
     /**
      * Encode a {@link CharSequence} in <a href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a> and write
      * it to a {@link ByteBuf}.
-     *
+     * <p>
+     * It behaves like {@link #reserveAndWriteUtf8(ByteBuf, CharSequence, int)} with {@code reserveBytes}
+     * computed by {@link #utf8MaxBytes(CharSequence)}.<br>
      * This method returns the actual number of bytes written.
      */
     public static int writeUtf8(ByteBuf buf, CharSequence seq) {
+        return reserveAndWriteUtf8(buf, seq, utf8MaxBytes(seq));
+    }
+
+    /**
+     * Encode a {@link CharSequence} in <a href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a> and write
+     * it into {@code reserveBytes} of a {@link ByteBuf}.
+     * <p>
+     * The {@code reserveBytes} must be computed (i.e. eagerly using {@link #utf8MaxBytes(CharSequence)}
+     * or exactly with {@link #utf8Bytes(CharSequence)}) to ensure this method does not fail: for performance reasons
+     * the index checks will be performed using just {@code reserveBytes}.<br>
+     * This method returns the actual number of bytes written.
+     */
+    public static int reserveAndWriteUtf8(ByteBuf buf, CharSequence seq, int reserveBytes) {
         for (;;) {
             if (buf instanceof AbstractByteBuf) {
                 AbstractByteBuf byteBuf = (AbstractByteBuf) buf;
-                byteBuf.ensureWritable0(utf8MaxBytes(seq));
+                byteBuf.ensureWritable0(reserveBytes);
                 int written = writeUtf8(byteBuf, byteBuf.writerIndex, seq, seq.length());
                 byteBuf.writerIndex += written;
                 return written;
@@ -521,7 +536,7 @@ static int writeUtf8(AbstractByteBuf buffer, int writerIndex, CharSequence seq,
                     // duplicate bounds checking with charAt. If an IndexOutOfBoundsException is thrown,
                     // WRITE_UTF_UNKNOWN is written instead and encoding stops.
                     c2 = seq.charAt(++i);
-                } catch (IndexOutOfBoundsException e) {
+                } catch (IndexOutOfBoundsException ignored) {
                     buffer._setByte(writerIndex++, WRITE_UTF_UNKNOWN);
                     break;
                 }
@@ -545,11 +560,77 @@ static int writeUtf8(AbstractByteBuf buffer, int writerIndex, CharSequence seq,
         return writerIndex - oldWriterIndex;
     }
 
+    /**
+     * Returns max bytes length of UTF8 character sequence of the given length.
+     */
+    public static int utf8MaxBytes(final int seqLength) {
+        return seqLength * MAX_BYTES_PER_CHAR_UTF8;
+    }
+
     /**
      * Returns max bytes length of UTF8 character sequence.
+     * <p>
+     * It behaves like {@link #utf8MaxBytes(int)} applied to {@code seq} {@link CharSequence#length()}.
      */
     public static int utf8MaxBytes(CharSequence seq) {
-        return seq.length() * MAX_BYTES_PER_CHAR_UTF8;
+        return utf8MaxBytes(seq.length());
+    }
+
+    /**
+     * Returns the exact bytes length of UTF8 character sequence.
+     * <p>
+     * This method produces exactly the length that {@link #writeUtf8(ByteBuf, CharSequence)} would write.
+     */
+    public static int utf8Bytes(final CharSequence seq) {
+        if (seq instanceof AsciiString) {
+            return seq.length();
+        }
+        int seqLength = seq.length();
+        int i = 0;
+        // ASCII fast path
+        while (i < seqLength && seq.charAt(i) < 0x80) {
+            ++i;
+        }
+        // The non-ASCII case lives in a separate method to keep the ASCII fast path small
+        return i < seqLength ? i + utf8Bytes(seq, i, seqLength) : i;
+    }
+
+    private static int utf8Bytes(final CharSequence seq, final int start, final int length) {
+        int encodedLength = 0;
+        for (int i = start; i < length; i++) {
+            final char c = seq.charAt(i);
+            // making it 100% branchless isn't rewarding due to the many bit operations necessary!
+            if (c < 0x800) {
+                // branchless version of: (c <= 127 ? 0:1) + 1
+                encodedLength += ((0x7f - c) >>> 31) + 1;
+            } else if (isSurrogate(c)) {
+                if (!Character.isHighSurrogate(c)) {
+                    encodedLength++;
+                    // WRITE_UTF_UNKNOWN
+                    continue;
+                }
+                final char c2;
+                try {
+                    // Surrogate Pair consumes 2 characters. Optimistically try to get the next character to avoid
+                    // duplicate bounds checking with charAt.
+                    c2 = seq.charAt(++i);
+                } catch (IndexOutOfBoundsException ignored) {
+                    encodedLength++;
+                    // WRITE_UTF_UNKNOWN
+                    break;
+                }
+                if (!Character.isLowSurrogate(c2)) {
+                    // WRITE_UTF_UNKNOWN + (Character.isHighSurrogate(c2) ? WRITE_UTF_UNKNOWN : c2)
+                    encodedLength += 2;
+                    continue;
+                }
+                // See http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G2630.
+                encodedLength += 4;
+            } else {
+                encodedLength += 3;
+            }
+        }
+        return encodedLength;
     }
 
     /**

diff --git a/buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java b/buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java
@@ -267,6 +267,7 @@ public void testWriteUtf8Surrogates() {
         ByteBufUtil.writeUtf8(buf2, surrogateString);
 
         assertEquals(buf, buf2);
+        assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));
 
         buf.release();
         buf2.release();
@@ -285,6 +286,7 @@ public void testWriteUtf8InvalidOnlyTrailingSurrogate() {
         ByteBufUtil.writeUtf8(buf2, surrogateString);
 
         assertEquals(buf, buf2);
+        assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));
 
         buf.release();
         buf2.release();
@@ -303,6 +305,7 @@ public void testWriteUtf8InvalidOnlyLeadingSurrogate() {
         ByteBufUtil.writeUtf8(buf2, surrogateString);
 
         assertEquals(buf, buf2);
+        assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));
 
         buf.release();
         buf2.release();
@@ -322,6 +325,7 @@ public void testWriteUtf8InvalidSurrogatesSwitched() {
         ByteBufUtil.writeUtf8(buf2, surrogateString);
 
         assertEquals(buf, buf2);
+        assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));
 
         buf.release();
         buf2.release();
@@ -341,7 +345,7 @@ public void testWriteUtf8InvalidTwoLeadingSurrogates() {
         ByteBufUtil.writeUtf8(buf2, surrogateString);
 
         assertEquals(buf, buf2);
-
+        assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));
         buf.release();
         buf2.release();
     }
@@ -360,6 +364,7 @@ public void testWriteUtf8InvalidTwoTrailingSurrogates() {
         ByteBufUtil.writeUtf8(buf2, surrogateString);
 
         assertEquals(buf, buf2);
+        assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));
 
         buf.release();
         buf2.release();
@@ -376,6 +381,7 @@ public void testWriteUtf8InvalidEndOnLeadingSurrogate() {
         ByteBufUtil.writeUtf8(buf2, surrogateString);
 
         assertEquals(buf, buf2);
+        assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));
 
         buf.release();
         buf2.release();
@@ -392,6 +398,7 @@ public void testWriteUtf8InvalidEndOnTrailingSurrogate() {
         ByteBufUtil.writeUtf8(buf2, surrogateString);
 
         assertEquals(buf, buf2);
+        assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));
 
         buf.release();
         buf2.release();
@@ -546,6 +553,42 @@ public void testIsTextWithInvalidIndexAndLength() {
         }
     }
 
+    @Test
+    public void testUtf8Bytes() {
+        final String s = "Some UTF-8 like äÄ∏ŒŒ";
+        checkUtf8Bytes(s);
+    }
+
+    @Test
+    public void testUtf8BytesWithSurrogates() {
+        final String s = "a\uD800\uDC00b";
+        checkUtf8Bytes(s);
+    }
+
+    @Test
+    public void testUtf8BytesWithNonSurrogates3Bytes() {
+        final String s = "a\uE000b";
+        checkUtf8Bytes(s);
+    }
+
+    @Test
+    public void testUtf8BytesWithNonSurrogatesNonAscii() {
+        final char nonAscii = (char) 0x81;
+        final String s = "a" + nonAscii + "b";
+        checkUtf8Bytes(s);
+    }
+
+    private static void checkUtf8Bytes(final CharSequence charSequence) {
+        final ByteBuf buf = Unpooled.buffer(ByteBufUtil.utf8MaxBytes(charSequence));
+        try {
+            final int writtenBytes = ByteBufUtil.writeUtf8(buf, charSequence);
+            final int utf8Bytes = ByteBufUtil.utf8Bytes(charSequence);
+            assertEquals(writtenBytes, utf8Bytes);
+        } finally {
+            buf.release();
+        }
+    }
+
     private static void assertIsText(byte[] bytes, boolean expected, Charset charset) {
         ByteBuf buffer = Unpooled.buffer();
         try {