Skip to content

Commit

Permalink
add routine for lossy conversion
Browse files Browse the repository at this point in the history
Summary: Sometimes we just don't want to deal with errors.

Reviewed By: avp

Differential Revision: D30444131

fbshipit-source-id: 905a8873821ca216ca319054aba6b2de8c65eb01
  • Loading branch information
tmikov authored and facebook-github-bot committed Aug 21, 2021
1 parent 51ae06f commit d5b1ff6
Showing 1 changed file with 54 additions and 15 deletions.
69 changes: 54 additions & 15 deletions unsupported/juno/src/hermes_utf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

use thiserror::Error;

#[derive(Error, Debug)]
#[derive(Clone, Copy, Error, Debug, Eq, PartialEq)]
pub enum UTFError {
// General.
#[error("invalid Unicode character")]
Expand Down Expand Up @@ -91,6 +91,7 @@ fn decode_utf8_slow_path<const ALLOW_SURROGATES: bool>(
let len = src.len();
if (ch & 0xE0) == 0xC0 {
if *from + 1 >= len {
*from = len;
return Err(UTFError::UTF8IncompleteCont);
}
let ch1 = unsafe { *src.get_unchecked(*from + 1) } as u32;
Expand Down Expand Up @@ -192,10 +193,12 @@ pub fn utf8_with_surrogates_to_utf16(src: &[u8]) -> Result<Vec<u16>, UTFError> {
Ok(v)
}

pub fn utf8_with_surrogates_to_string(src: &[u8]) -> Result<String, UTFError> {
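/// Decodes `src` into a `String`, substituting `UNICODE_REPLACEMENT_CHARACTER` wherever decoding
/// fails, and returns that string together with the first error encountered (if any).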
fn utf8_with_surrogates_to_string_helper(src: &[u8]) -> (String, Option<UTFError>) {
let mut from: usize = 0;
let mut str = String::new();
let len = src.len();
let mut err: Option<UTFError> = None;

str.reserve(len);
while from < len {
Expand All @@ -207,27 +210,56 @@ pub fn utf8_with_surrogates_to_string(src: &[u8]) -> Result<String, UTFError> {
continue;
}

let mut cp = decode_utf8_slow_path::<true>(src, &mut from, b as u32)?;
if is_low_surrogate(cp) {
return Err(UTFError::UTF16UnmatchedLowSurrogate);
let mut cp: u32;
match decode_utf8_slow_path::<true>(src, &mut from, b as u32) {
Ok(x) => cp = x,
Err(e) => {
err = err.or(Some(e));
cp = UNICODE_REPLACEMENT_CHARACTER;
}
}
if is_high_surrogate(cp) {
if is_low_surrogate(cp) {
err = err.or(Some(UTFError::UTF16UnmatchedLowSurrogate));
cp = UNICODE_REPLACEMENT_CHARACTER;
} else if is_high_surrogate(cp) {
if from >= len {
return Err(UTFError::UTF16IncompleteSurrogatePair);
}
// We checked `from` already.
let b1 = unsafe { *src.get_unchecked(from) };
let cp_low = decode_utf8::<true>(src, &mut from, b1)?;
if !is_low_surrogate(cp_low) {
return Err(UTFError::UTF16IncompleteSurrogatePair);
err = err.or(Some(UTFError::UTF16IncompleteSurrogatePair));
cp = UNICODE_REPLACEMENT_CHARACTER;
} else {
// We checked `from` already.
let b1 = unsafe { *src.get_unchecked(from) };
match decode_utf8::<true>(src, &mut from, b1) {
Ok(cp_low) => {
if !is_low_surrogate(cp_low) {
err = err.or(Some(UTFError::UTF16IncompleteSurrogatePair));
cp = UNICODE_REPLACEMENT_CHARACTER;
} else {
cp = decode_surrogate_pair(cp, cp_low);
}
}
Err(e) => {
err = err.or(Some(e));
cp = UNICODE_REPLACEMENT_CHARACTER;
}
}
}
cp = decode_surrogate_pair(cp, cp_low);
}
str.push(unsafe { char::from_u32_unchecked(cp) });
}

str.shrink_to_fit();
Ok(str)
(str, err)
}

/// Strict conversion of `src` into a `String`.
///
/// Delegates the decoding to `utf8_with_surrogates_to_string_helper`. If the
/// helper reports an error, the string it produced is discarded and only the
/// error is returned.
///
/// # Errors
/// Returns the `UTFError` reported by the helper, if there is one.
pub fn utf8_with_surrogates_to_string(src: &[u8]) -> Result<String, UTFError> {
    let (decoded, failure) = utf8_with_surrogates_to_string_helper(src);
    if let Some(e) = failure {
        Err(e)
    } else {
        Ok(decoded)
    }
}

/// Lossy conversion of `src` into a `String`.
///
/// Delegates to `utf8_with_surrogates_to_string_helper` and keeps only the
/// string it produced; any error the helper reported is ignored, so this
/// function never fails.
pub fn utf8_with_surrogates_to_string_lossy(src: &[u8]) -> String {
    let (text, _ignored_error) = utf8_with_surrogates_to_string_helper(src);
    text
}

#[cfg(test)]
Expand Down Expand Up @@ -257,4 +289,11 @@ mod tests {
"😹"
);
}
/// The lossy conversion must substitute replacement characters for bad input
/// while still keeping the valid bytes around it.
#[test]
fn test_lossy() {
    // 0xED 0xA0 opens a three-byte sequence that the ASCII byte 0x30 ('0')
    // interrupts; 0xED 0xB8 0xB9 encodes U+DE39, a low surrogate with no
    // preceding high surrogate.
    let input: [u8; 6] = [0xED, 0xA0, 0x30, 0xED, 0xB8, 0xB9];
    let converted = utf8_with_surrogates_to_string_lossy(&input);
    assert_eq!(converted, "�0�");
}
}

0 comments on commit d5b1ff6

Please sign in to comment.