Clean up Utf8 class and avoid copying and strlen() for JNI NewStringUTF

orangejyc · Dec 29, 2017 · 9124cb1 · 9124cb1
1 parent 84be517
commit 9124cb1
Show file tree

Hide file tree

Showing 3 changed files with 68 additions and 199 deletions.
diff --git a/substratevm/src/com.oracle.svm.core/src/com/oracle/svm/core/util/Utf8.java b/substratevm/src/com.oracle.svm.core/src/com/oracle/svm/core/util/Utf8.java
@@ -22,13 +22,13 @@
  */
 package com.oracle.svm.core.util;
 
-import java.io.ByteArrayOutputStream;
-import java.io.DataInput;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
+import java.io.CharConversionException;
 import java.nio.ByteBuffer;
 
+/**
+ * Implements UTF-8 encoding and decoding of strings with support for zero-bytes as string
+ * terminators.
+ */
 public final class Utf8 {
 
     private Utf8() {
@@ -46,121 +46,88 @@ public static int utf8Length(String string) {
      * @param endIndex index at the end of the region, exclusive
      * @return the length in bytes of the UTF8 representation of the string region
      */
-    public static int utf8Length(String string, int beginIndex, int endIndex) {
-        if (beginIndex < 0 || endIndex > string.length() || beginIndex > endIndex) {
+    public static int utf8Length(String s, int beginIndex, int endIndex) {
+        if (beginIndex < 0 || endIndex > s.length() || beginIndex > endIndex) {
             throw new StringIndexOutOfBoundsException();
         }
-        int result = 0;
+        int length = 0;
         for (int i = beginIndex; i < endIndex; i++) {
-            final int ch = string.charAt(i);
-            if ((ch >= 0x0001) && (ch <= 0x007F)) {
-                result++;
-            } else if (ch > 0x07FF) {
-                result += 3;
+            final int c = s.charAt(i);
+            if ((c >= 0x0001) && (c <= 0x007F)) {
+                length++;
+            } else if (c > 0x07FF) {
+                length += 3;
             } else {
-                result += 2;
+                length += 2;
             }
         }
-        return result;
+        return length;
     }
 
     /**
      * @return maximum number of bytes needed to represent the specified number of characters.
      */
-    public static int maxByteLength(int charCount, boolean zeroTerminate) {
+    public static int maxUtf8ByteLength(int charCount, boolean zeroTerminate) {
+        assert charCount >= 0;
         return 3 * charCount + (zeroTerminate ? 1 : 0);
     }
 
     /**
      * Writes an UTF8-encoded string region to a given byte buffer.
      * 
-     * @param bb the byte buffer to write to
-     * @param string the String to be written
-     * @param beginIndex first index that is part of the region, inclusive
-     * @param endIndex index at the end of the region, exclusive
+     * @param dest the byte buffer to write to
+     * @param source the String to be written
+     * @param beginIndex first index in {@code source} that is part of the region, inclusive
+     * @param endIndex index in {@code source} at the end of the region, exclusive
      * @param zeroTerminate whether to write a final zero byte
      */
-    public static void writeString(ByteBuffer bb, String string, int beginIndex, int endIndex, boolean zeroTerminate) {
-        if (beginIndex < 0 || endIndex > string.length() || beginIndex > endIndex) {
+    public static void substringToUtf8(ByteBuffer dest, String source, int beginIndex, int endIndex, boolean zeroTerminate) {
+        if (beginIndex < 0 || endIndex > source.length() || beginIndex > endIndex) {
             throw new StringIndexOutOfBoundsException();
         }
         for (int i = beginIndex; i < endIndex; i++) {
-            final char ch = string.charAt(i);
-            if ((ch >= 0x0001) && (ch <= 0x007F)) {
-                bb.put((byte) ch);
-            } else if (ch > 0x07FF) {
-                bb.put((byte) (0xe0 | (byte) (ch >> 12)));
-                bb.put((byte) (0x80 | ((ch & 0xfc0) >> 6)));
-                bb.put((byte) (0x80 | (ch & 0x3f)));
+            final char c = source.charAt(i);
+            if ((c >= 0x0001) && (c <= 0x007F)) {
+                dest.put((byte) c);
+            } else if (c > 0x07FF) {
+                dest.put((byte) (0xe0 | (byte) (c >> 12)));
+                dest.put((byte) (0x80 | ((c & 0xfc0) >> 6)));
+                dest.put((byte) (0x80 | (c & 0x3f)));
             } else {
-                bb.put((byte) (0xc0 | (byte) (ch >> 6)));
-                bb.put((byte) (0x80 | (ch & 0x3f)));
+                dest.put((byte) (0xc0 | (byte) (c >> 6)));
+                dest.put((byte) (0x80 | (c & 0x3f)));
             }
         }
         if (zeroTerminate) {
-            bb.put((byte) 0);
+            dest.put((byte) 0);
         }
     }
 
-    public static byte[] stringToUtf8(String string, boolean zeroTerminate) {
-        int length = utf8Length(string) + (zeroTerminate ? 1 : 0);
+    public static byte[] stringToUtf8(String source, boolean zeroTerminate) {
+        int length = utf8Length(source) + (zeroTerminate ? 1 : 0);
         ByteBuffer buffer = ByteBuffer.allocate(length);
-        writeString(buffer, string, 0, string.length(), zeroTerminate);
+        substringToUtf8(buffer, source, 0, source.length(), zeroTerminate);
         return buffer.array();
     }
 
     /**
-     * Reads a UTF-8 encoded String from {@code in}.
-     *
-     * @param in a data input source
-     * @param zeroIsEncodedIn2Bytes if true, then 0 is decoded from two bytes as opposed to one
-     * @param length the numbers of bytes to be decoded
-     * @return the decoded string
-     */
-    public static String readUtf8(DataInput in, boolean zeroIsEncodedIn2Bytes, int length) throws IOException, Utf8Exception {
-        if (length == 0) {
-            return "";
-        }
-        final byte[] utf8Data = new byte[length];
-
-        boolean sevenBit = true;
-        for (int i = 0; i < length; i++) {
-            final byte ch = in.readByte();
-            utf8Data[i] = ch;
-            if (ch < 0 || (zeroIsEncodedIn2Bytes && ch == 0)) {
-                sevenBit = false;
-            }
-        }
-
-        if (sevenBit) {
-            final char[] charData = new char[length];
-            for (int i = 0; i < length; i++) {
-                charData[i] = (char) (utf8Data[i] & 0xff);
-            }
-            return new String(charData);
-        }
-
-        return utf8ToString(zeroIsEncodedIn2Bytes, utf8Data);
-    }
-
-    /**
-     * Converts an array of UTF-8 data to a String.
+     * Converts a byte buffer of UTF-8 data to a String. The entire buffer until the
+     * {@link ByteBuffer#limit() buffer's limit} is converted unless {@code zeroTerminated} is
+     * {@code true}, in which case conversion stops at the first zero byte.
      *
-     * @param zeroIsEncodedIn2Bytes if true, then 0 is decoded from two bytes as opposed to one
-     * @param utf8Data the data
+     * @param zeroTerminated if true, then a 0 byte marks the end of the string, and character '\0'
+     *            in the input must be encoded as two bytes as opposed to one
+     * @param source the byte buffer to read from
      * @return the decoded string
      */
-    public static String utf8ToString(boolean zeroIsEncodedIn2Bytes, byte[] utf8Data) throws Utf8Exception {
-        final int length = utf8Data.length;
-        int count = 0;
-        final StringBuilder sb = new StringBuilder(length);
-
-        while (count < length) {
-            final int c = utf8Data[count] & 0xff;
-            if (zeroIsEncodedIn2Bytes && c == 0) {
-                throw new Utf8Exception();
+    public static String utf8ToString(boolean zeroTerminated, ByteBuffer source) throws CharConversionException {
+        final StringBuilder sb = new StringBuilder();
+        while (source.hasRemaining()) {
+            final int c0 = source.get() & 0xff;
+            if (zeroTerminated && c0 == 0) {
+                break;
             }
-            switch (c >> 4) {
+            switch (c0 >> 4) {
                 case 0:
                 case 1:
                 case 2:
@@ -170,83 +137,36 @@ public static String utf8ToString(boolean zeroIsEncodedIn2Bytes, byte[] utf8Data
                 case 6:
                 case 7: {
                     /* 0xxxxxxx */
-                    count++;
-                    sb.append((char) c);
+                    sb.append((char) c0);
                     break;
                 }
                 case 12:
                 case 13: {
                     /* 110x xxxx 10xx xxxx */
-                    count += 2;
-                    if (count > length) {
-                        throw new Utf8Exception();
+                    final int c1 = source.get();
+                    if ((c1 & 0xC0) != 0x80) {
+                        throw new CharConversionException();
                     }
-                    final int char2 = utf8Data[count - 1];
-                    if ((char2 & 0xC0) != 0x80) {
-                        throw new Utf8Exception();
-                    }
-                    sb.append((char) (((c & 0x1F) << 6) | (char2 & 0x3F)));
+                    sb.append((char) (((c0 & 0x1F) << 6) | (c1 & 0x3F)));
                     break;
                 }
                 case 14: {
                     /* 1110 xxxx 10xx xxxx 10xx xxxx */
-                    count += 3;
-                    if (count > length) {
-                        throw new Utf8Exception();
-                    }
-                    final int char2 = utf8Data[count - 2];
-                    final int char3 = utf8Data[count - 1];
-                    if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80)) {
-                        throw new Utf8Exception();
+                    final int c1 = source.get();
+                    final int c2 = source.get();
+                    if (((c1 & 0xC0) != 0x80) || ((c2 & 0xC0) != 0x80)) {
+                        throw new CharConversionException();
                     }
-                    sb.append((char) (((c & 0x0F) << 12) | ((char2 & 0x3F) << 6) | ((char3 & 0x3F) << 0)));
+                    sb.append((char) (((c0 & 0x0F) << 12) | ((c1 & 0x3F) << 6) | (c2 & 0x3F)));
                     break;
                 }
                 default: {
                     /* 10xx xxxx, 1111 xxxx */
-                    throw new Utf8Exception();
+                    throw new CharConversionException();
                 }
             }
         }
-        // The number of chars produced may be less than utflen
-        return new String(sb);
-    }
-
-    private static byte[] readZeroTerminatedBytes(InputStream inputStream) throws IOException {
-        final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
-        while (true) {
-            final int ch = inputStream.read();
-            if (ch < 0) {
-                throw new IOException();
-            }
-            if (ch == 0) {
-                return buffer.toByteArray();
-            }
-            buffer.write(ch);
-        }
-    }
-
-    /**
-     * Reads a 0-terminated UTF8 encoded string from a given stream.
-     *
-     * @param inputStream the stream to read from
-     * @return the String constructed from the UTF8 encoded chars read from {@code inputStream},
-     *         omitting the terminating 0
-     */
-    public static String readString(InputStream inputStream) throws IOException, Utf8Exception {
-        final byte[] utf8Data = readZeroTerminatedBytes(inputStream);
-        return Utf8.utf8ToString(false, utf8Data);
-    }
-
-    /**
-     * Writes a 0-terminated UTF8 encoded string to a given stream.
-     *
-     * @param outputStream the stream to write to
-     * @param string the String to be written
-     */
-    public static void writeString(OutputStream outputStream, String string) throws IOException {
-        outputStream.write(stringToUtf8(string, false));
-        outputStream.write((byte) 0);
+        return sb.toString();
     }
 
 }
diff --git a/substratevm/src/com.oracle.svm.core/src/com/oracle/svm/core/util/Utf8Exception.java b/substratevm/src/com.oracle.svm.core/src/com/oracle/svm/core/util/Utf8Exception.java
diff --git a/substratevm/src/com.oracle.svm.jni/src/com/oracle/svm/jni/functions/JNIFunctions.java b/substratevm/src/com.oracle.svm.jni/src/com/oracle/svm/jni/functions/JNIFunctions.java
@@ -24,6 +24,7 @@
 
 // Checkstyle: allow reflection
 
+import java.io.CharConversionException;
 import java.lang.reflect.Array;
 import java.lang.reflect.Constructor;
 import java.lang.reflect.Executable;
@@ -47,7 +48,6 @@
 import org.graalvm.nativeimage.c.type.CTypeConversion;
 import org.graalvm.nativeimage.c.type.WordPointer;
 import org.graalvm.word.Pointer;
-import org.graalvm.word.UnsignedWord;
 import org.graalvm.word.WordBase;
 import org.graalvm.word.WordFactory;
 
@@ -63,7 +63,6 @@
 import com.oracle.svm.core.snippets.KnownIntrinsics;
 import com.oracle.svm.core.thread.VMThreads;
 import com.oracle.svm.core.util.Utf8;
-import com.oracle.svm.core.util.Utf8Exception;
 import com.oracle.svm.core.util.VMError;
 import com.oracle.svm.jni.JNIGlobalHandles;
 import com.oracle.svm.jni.JNIObjectHandles;
@@ -442,14 +441,10 @@ static JNIObjectHandle NewString(JNIEnvironment env, CShortPointer unicode, int
     static JNIObjectHandle NewStringUTF(JNIEnvironment env, CCharPointer bytes) {
         String str = null;
         if (bytes.isNonNull()) {
-            UnsignedWord len = SubstrateUtil.strlen(bytes);
-            byte[] array = new byte[(int) len.rawValue()];
-            for (int i = 0; i < array.length; i++) {
-                array[i] = ((Pointer) bytes).readByte(i);
-            }
+            ByteBuffer buffer = SubstrateUtil.wrapAsByteBuffer(bytes, Integer.MAX_VALUE);
             try {
-                str = Utf8.utf8ToString(true, array);
-            } catch (Utf8Exception ignore) {
+                str = Utf8.utf8ToString(true, buffer);
+            } catch (CharConversionException ignore) {
             }
         }
         return JNIThreadLocalHandles.get().create(str);
@@ -579,9 +574,10 @@ static void GetStringUTFRegion(JNIEnvironment env, JNIObjectHandle hstr, int sta
         if (len < 0) {
             throw new StringIndexOutOfBoundsException(len);
         }
-        int capacity = Utf8.maxByteLength(len, true); // estimate: caller must pre-allocate enough
+        int capacity = Utf8.maxUtf8ByteLength(len, true); // estimate: caller must pre-allocate
+                                                          // enough
         ByteBuffer buffer = SubstrateUtil.wrapAsByteBuffer(buf, capacity);
-        Utf8.writeString(buffer, str, start, start + len, true);
+        Utf8.substringToUtf8(buffer, str, start, start + len, true);
     }
 
     /*