Skip to content

Commit

Permalink
Clean up Utf8 class and avoid copying and strlen() for JNI NewStringUTF
Browse files Browse the repository at this point in the history
  • Loading branch information
peter-hofer committed Dec 29, 2017
1 parent 84be517 commit 9124cb1
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 199 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,13 @@
*/
package com.oracle.svm.core.util;

import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.CharConversionException;
import java.nio.ByteBuffer;

/**
* Implements UTF-8 encoding and decoding of strings with support for zero-bytes as string
* terminators.
*/
public final class Utf8 {

private Utf8() {
Expand All @@ -46,121 +46,88 @@ public static int utf8Length(String string) {
* @param endIndex index at the end of the region, exclusive
* @return the length in bytes of the UTF8 representation of the string region
*/
public static int utf8Length(String string, int beginIndex, int endIndex) {
if (beginIndex < 0 || endIndex > string.length() || beginIndex > endIndex) {
public static int utf8Length(String s, int beginIndex, int endIndex) {
if (beginIndex < 0 || endIndex > s.length() || beginIndex > endIndex) {
throw new StringIndexOutOfBoundsException();
}
int result = 0;
int length = 0;
for (int i = beginIndex; i < endIndex; i++) {
final int ch = string.charAt(i);
if ((ch >= 0x0001) && (ch <= 0x007F)) {
result++;
} else if (ch > 0x07FF) {
result += 3;
final int c = s.charAt(i);
if ((c >= 0x0001) && (c <= 0x007F)) {
length++;
} else if (c > 0x07FF) {
length += 3;
} else {
result += 2;
length += 2;
}
}
return result;
return length;
}

/**
* @return maximum number of bytes needed to represent the specified number of characters.
*/
public static int maxByteLength(int charCount, boolean zeroTerminate) {
public static int maxUtf8ByteLength(int charCount, boolean zeroTerminate) {
assert charCount >= 0;
return 3 * charCount + (zeroTerminate ? 1 : 0);
}

/**
* Writes an UTF8-encoded string region to a given byte buffer.
*
* @param bb the byte buffer to write to
* @param string the String to be written
* @param beginIndex first index that is part of the region, inclusive
* @param endIndex index at the end of the region, exclusive
* @param dest the byte buffer to write to
* @param source the String to be written
* @param beginIndex first index in {@code source} that is part of the region, inclusive
* @param endIndex index in {@code source} at the end of the region, exclusive
* @param zeroTerminate whether to write a final zero byte
*/
public static void writeString(ByteBuffer bb, String string, int beginIndex, int endIndex, boolean zeroTerminate) {
if (beginIndex < 0 || endIndex > string.length() || beginIndex > endIndex) {
public static void substringToUtf8(ByteBuffer dest, String source, int beginIndex, int endIndex, boolean zeroTerminate) {
if (beginIndex < 0 || endIndex > source.length() || beginIndex > endIndex) {
throw new StringIndexOutOfBoundsException();
}
for (int i = beginIndex; i < endIndex; i++) {
final char ch = string.charAt(i);
if ((ch >= 0x0001) && (ch <= 0x007F)) {
bb.put((byte) ch);
} else if (ch > 0x07FF) {
bb.put((byte) (0xe0 | (byte) (ch >> 12)));
bb.put((byte) (0x80 | ((ch & 0xfc0) >> 6)));
bb.put((byte) (0x80 | (ch & 0x3f)));
final char c = source.charAt(i);
if ((c >= 0x0001) && (c <= 0x007F)) {
dest.put((byte) c);
} else if (c > 0x07FF) {
dest.put((byte) (0xe0 | (byte) (c >> 12)));
dest.put((byte) (0x80 | ((c & 0xfc0) >> 6)));
dest.put((byte) (0x80 | (c & 0x3f)));
} else {
bb.put((byte) (0xc0 | (byte) (ch >> 6)));
bb.put((byte) (0x80 | (ch & 0x3f)));
dest.put((byte) (0xc0 | (byte) (c >> 6)));
dest.put((byte) (0x80 | (c & 0x3f)));
}
}
if (zeroTerminate) {
bb.put((byte) 0);
dest.put((byte) 0);
}
}

public static byte[] stringToUtf8(String string, boolean zeroTerminate) {
int length = utf8Length(string) + (zeroTerminate ? 1 : 0);
public static byte[] stringToUtf8(String source, boolean zeroTerminate) {
int length = utf8Length(source) + (zeroTerminate ? 1 : 0);
ByteBuffer buffer = ByteBuffer.allocate(length);
writeString(buffer, string, 0, string.length(), zeroTerminate);
substringToUtf8(buffer, source, 0, source.length(), zeroTerminate);
return buffer.array();
}

/**
* Reads a UTF-8 encoded String from {@code in}.
*
* @param in a data input source
* @param zeroIsEncodedIn2Bytes if true, then 0 is decoded from two bytes as opposed to one
* @param length the numbers of bytes to be decoded
* @return the decoded string
*/
public static String readUtf8(DataInput in, boolean zeroIsEncodedIn2Bytes, int length) throws IOException, Utf8Exception {
if (length == 0) {
return "";
}
final byte[] utf8Data = new byte[length];

boolean sevenBit = true;
for (int i = 0; i < length; i++) {
final byte ch = in.readByte();
utf8Data[i] = ch;
if (ch < 0 || (zeroIsEncodedIn2Bytes && ch == 0)) {
sevenBit = false;
}
}

if (sevenBit) {
final char[] charData = new char[length];
for (int i = 0; i < length; i++) {
charData[i] = (char) (utf8Data[i] & 0xff);
}
return new String(charData);
}

return utf8ToString(zeroIsEncodedIn2Bytes, utf8Data);
}

/**
* Converts an array of UTF-8 data to a String.
* Converts a byte buffer of UTF-8 data to a String. The entire buffer until the
* {@link ByteBuffer#limit() buffer's limit} is converted unless {@code zeroTerminated} is
* {@code true}, in which case conversion stops at the first zero byte.
*
* @param zeroIsEncodedIn2Bytes if true, then 0 is decoded from two bytes as opposed to one
* @param utf8Data the data
* @param zeroTerminated if true, then a 0 byte marks the end of the string, and character '\0'
* in the input must be encoded as two bytes as opposed to one
* @param source the byte buffer to read from
* @return the decoded string
*/
public static String utf8ToString(boolean zeroIsEncodedIn2Bytes, byte[] utf8Data) throws Utf8Exception {
final int length = utf8Data.length;
int count = 0;
final StringBuilder sb = new StringBuilder(length);

while (count < length) {
final int c = utf8Data[count] & 0xff;
if (zeroIsEncodedIn2Bytes && c == 0) {
throw new Utf8Exception();
public static String utf8ToString(boolean zeroTerminated, ByteBuffer source) throws CharConversionException {
final StringBuilder sb = new StringBuilder();
while (source.hasRemaining()) {
final int c0 = source.get() & 0xff;
if (zeroTerminated && c0 == 0) {
break;
}
switch (c >> 4) {
switch (c0 >> 4) {
case 0:
case 1:
case 2:
Expand All @@ -170,83 +137,36 @@ public static String utf8ToString(boolean zeroIsEncodedIn2Bytes, byte[] utf8Data
case 6:
case 7: {
/* 0xxxxxxx */
count++;
sb.append((char) c);
sb.append((char) c0);
break;
}
case 12:
case 13: {
/* 110x xxxx 10xx xxxx */
count += 2;
if (count > length) {
throw new Utf8Exception();
final int c1 = source.get();
if ((c1 & 0xC0) != 0x80) {
throw new CharConversionException();
}
final int char2 = utf8Data[count - 1];
if ((char2 & 0xC0) != 0x80) {
throw new Utf8Exception();
}
sb.append((char) (((c & 0x1F) << 6) | (char2 & 0x3F)));
sb.append((char) (((c0 & 0x1F) << 6) | (c1 & 0x3F)));
break;
}
case 14: {
/* 1110 xxxx 10xx xxxx 10xx xxxx */
count += 3;
if (count > length) {
throw new Utf8Exception();
}
final int char2 = utf8Data[count - 2];
final int char3 = utf8Data[count - 1];
if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80)) {
throw new Utf8Exception();
final int c1 = source.get();
final int c2 = source.get();
if (((c1 & 0xC0) != 0x80) || ((c2 & 0xC0) != 0x80)) {
throw new CharConversionException();
}
sb.append((char) (((c & 0x0F) << 12) | ((char2 & 0x3F) << 6) | ((char3 & 0x3F) << 0)));
sb.append((char) (((c0 & 0x0F) << 12) | ((c1 & 0x3F) << 6) | (c2 & 0x3F)));
break;
}
default: {
/* 10xx xxxx, 1111 xxxx */
throw new Utf8Exception();
throw new CharConversionException();
}
}
}
// The number of chars produced may be less than utflen
return new String(sb);
}

private static byte[] readZeroTerminatedBytes(InputStream inputStream) throws IOException {
final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
while (true) {
final int ch = inputStream.read();
if (ch < 0) {
throw new IOException();
}
if (ch == 0) {
return buffer.toByteArray();
}
buffer.write(ch);
}
}

/**
* Reads a 0-terminated UTF8 encoded string from a given stream.
*
* @param inputStream the stream to read from
* @return the String constructed from the UTF8 encoded chars read from {@code inputStream},
* omitting the terminating 0
*/
public static String readString(InputStream inputStream) throws IOException, Utf8Exception {
final byte[] utf8Data = readZeroTerminatedBytes(inputStream);
return Utf8.utf8ToString(false, utf8Data);
}

/**
* Writes a 0-terminated UTF8 encoded string to a given stream.
*
* @param outputStream the stream to write to
* @param string the String to be written
*/
public static void writeString(OutputStream outputStream, String string) throws IOException {
outputStream.write(stringToUtf8(string, false));
outputStream.write((byte) 0);
return sb.toString();
}

}

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

// Checkstyle: allow reflection

import java.io.CharConversionException;
import java.lang.reflect.Array;
import java.lang.reflect.Constructor;
import java.lang.reflect.Executable;
Expand All @@ -47,7 +48,6 @@
import org.graalvm.nativeimage.c.type.CTypeConversion;
import org.graalvm.nativeimage.c.type.WordPointer;
import org.graalvm.word.Pointer;
import org.graalvm.word.UnsignedWord;
import org.graalvm.word.WordBase;
import org.graalvm.word.WordFactory;

Expand All @@ -63,7 +63,6 @@
import com.oracle.svm.core.snippets.KnownIntrinsics;
import com.oracle.svm.core.thread.VMThreads;
import com.oracle.svm.core.util.Utf8;
import com.oracle.svm.core.util.Utf8Exception;
import com.oracle.svm.core.util.VMError;
import com.oracle.svm.jni.JNIGlobalHandles;
import com.oracle.svm.jni.JNIObjectHandles;
Expand Down Expand Up @@ -442,14 +441,10 @@ static JNIObjectHandle NewString(JNIEnvironment env, CShortPointer unicode, int
static JNIObjectHandle NewStringUTF(JNIEnvironment env, CCharPointer bytes) {
String str = null;
if (bytes.isNonNull()) {
UnsignedWord len = SubstrateUtil.strlen(bytes);
byte[] array = new byte[(int) len.rawValue()];
for (int i = 0; i < array.length; i++) {
array[i] = ((Pointer) bytes).readByte(i);
}
ByteBuffer buffer = SubstrateUtil.wrapAsByteBuffer(bytes, Integer.MAX_VALUE);
try {
str = Utf8.utf8ToString(true, array);
} catch (Utf8Exception ignore) {
str = Utf8.utf8ToString(true, buffer);
} catch (CharConversionException ignore) {
}
}
return JNIThreadLocalHandles.get().create(str);
Expand Down Expand Up @@ -579,9 +574,10 @@ static void GetStringUTFRegion(JNIEnvironment env, JNIObjectHandle hstr, int sta
if (len < 0) {
throw new StringIndexOutOfBoundsException(len);
}
int capacity = Utf8.maxByteLength(len, true); // estimate: caller must pre-allocate enough
int capacity = Utf8.maxUtf8ByteLength(len, true); // estimate: caller must pre-allocate
// enough
ByteBuffer buffer = SubstrateUtil.wrapAsByteBuffer(buf, capacity);
Utf8.writeString(buffer, str, start, start + len, true);
Utf8.substringToUtf8(buffer, str, start, start + len, true);
}

/*
Expand Down

0 comments on commit 9124cb1

Please sign in to comment.