Skip to content

Commit

Permalink
introduce NullTerminatedBuf
Browse files Browse the repository at this point in the history
Summary:
The various Hermes parsers require that their input be null terminated.
That is an invariant that has proven difficult to maintain in practice
because it is not explicit in the C++ interfaces.

`NullTerminatedBuf` aims to solve this problem when Hermes parsers are
exposed to Rust by making the invariant explicit. It provides a
reference to a slice that is always explicitly null terminated.
Internally it either just borrows a reference, or owns a copy of the
original non-terminated input.

`NullTerminatedBuf` can be created from `&[u8]`, `&str` and by reading
from a file. The first two provide options to check whether the input is
already null-terminated and just borrow it instead of copying. That can
be useful if the input is already null terminated for another reason.

The more interesting case, from a performance perspective, is creating a
`NullTerminatedBuf` from a file. In terms of functionality it is
equivalent to `llvm::MemoryBuffer::getFile()`.

The current implementation is extremely primitive, it simply reads the
entire file into a Vec and appends a 0. There are many possibilities for
optimizations, but applying them was not trivial since we weren't able
to find portable ways for checking if a file is a "regular file" (as
opposed to a pipe, a character device, etc) in Rust. Listing them here
for posterity:

- Obtain the size of the file in advance to avoid reallocations and to
  minimize number of syscalls (hopefully down to one, if not
  interrupted).
- If the file size is not a multiple of the page size, memory map it. The
  "filler space" past the end is guaranteed to be zeroes.
- Expose `llvh::MemoryBuffer` through a couple of simple C wrappers and
  just use that.

Reviewed By: avp

Differential Revision: D30444136

fbshipit-source-id: 9b1d198a6de385ed075d89a1229313e14c56ad0a
  • Loading branch information
tmikov authored and facebook-github-bot committed Aug 21, 2021
1 parent d5b1ff6 commit f1150ce
Show file tree
Hide file tree
Showing 5 changed files with 143 additions and 33 deletions.
2 changes: 1 addition & 1 deletion unsupported/juno/src/hparser/convert.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ use std::str::FromStr;

/// Converts from Hermes AST to Juno AST
pub struct Converter<'a> {
pub hparser: &'a HermesParser,
pub hparser: &'a HermesParser<'a>,
/// The file id to use for the converted coordinates.
pub file_id: u32,
}
Expand Down
43 changes: 16 additions & 27 deletions unsupported/juno/src/hparser/hermes_parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

use super::node::{Node, NodePtr, NodePtrOpt, SMLoc, StringRef};
use crate::hermes_utf::utf8_with_surrogates_to_string;
use crate::nullbuf::NullTerminatedBuf;
use libc::c_int;
use std::fmt::Formatter;
use std::marker::PhantomData;
Expand Down Expand Up @@ -109,42 +110,27 @@ extern "C" {
fn hermes_get_node_name(node: NodePtr) -> DataRef<'static, u8>;
}

pub struct HermesParser {
pub struct HermesParser<'a> {
/// A pointer to the opaque C++ parser object. It should never be null.
parser_ctx: *mut ParserContext,
/// If the input is not zero-terminated, we create a zero-terminated copy
/// here.
tmpbuf: Vec<u8>,
source: &'a NullTerminatedBuf<'a>,
}

impl Drop for HermesParser {
impl Drop for HermesParser<'_> {
fn drop(&mut self) {
unsafe { hermes_parser_free(self.parser_ctx) }
}
}

impl HermesParser {
impl HermesParser<'_> {
/// `file_id` is an opaque value used for encoding source coordinates.
/// To avoid copying `source` can optionally be NUL-terminated.
pub fn parse(source: &str) -> HermesParser {
let bytes = source.as_bytes();

// Optional temporary copy for zero termination.
let mut tmpbuf = Vec::new();
// Zero terminated source reference.
let source_z = if let [.., 0] = bytes {
bytes
} else {
tmpbuf.reserve_exact(bytes.len() + 1);
tmpbuf.extend_from_slice(bytes);
tmpbuf.push(0u8);
tmpbuf.as_slice()
};

let parser_ctx =
unsafe { hermes_parser_parse(source_z.as_ptr() as *const i8, source_z.len()) };

HermesParser { parser_ctx, tmpbuf }
pub fn parse<'a>(source: &'a NullTerminatedBuf) -> HermesParser<'a> {
HermesParser {
parser_ctx: unsafe { hermes_parser_parse(source.as_c_char_ptr(), source.len()) },
source,
}
}

/// Return the index of the first parser error (there could be warnings before it).
Expand Down Expand Up @@ -206,14 +192,16 @@ mod tests {

#[test]
fn good_parse() {
let p = HermesParser::parse("var x = 10;\0");
let buf = NullTerminatedBuf::from_str_check("var x = 10;\0");
let p = HermesParser::parse(&buf);
assert!(!p.has_errors());
assert_eq!(p.root().unwrap().as_ref().kind, NodeKind::Program);
}

#[test]
fn parse_error() {
let p = HermesParser::parse("var x+ = 10;");
let buf = NullTerminatedBuf::from_str_check("var x+ = 10;");
let p = HermesParser::parse(&buf);
assert!(p.has_errors());
assert!(p.root().is_none());

Expand All @@ -227,13 +215,14 @@ mod tests {

#[test]
fn magic_comments() {
let p = HermesParser::parse(
let buf = NullTerminatedBuf::from_str_check(
"var p = 0;
//# sourceURL=1
//# sourceMappingURL=my map URL
//# sourceURL=my source URL
",
);
let p = HermesParser::parse(&buf);
assert!(!p.has_errors());
assert_eq!(
p.magic_comment(MagicCommentKind::SourceUrl).unwrap(),
Expand Down
15 changes: 10 additions & 5 deletions unsupported/juno/src/hparser/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ mod hermes_parser;
mod node;

use crate::ast;
use crate::nullbuf::NullTerminatedBuf;
use thiserror::Error;

use crate::hermes_utf::utf8_with_surrogates_to_string;
Expand All @@ -23,15 +24,15 @@ use std::fmt::Formatter;

pub use hermes_parser::MagicCommentKind;

pub struct ParsedJS {
parser: HermesParser,
pub struct ParsedJS<'a> {
parser: HermesParser<'a>,
}

impl ParsedJS {
impl ParsedJS<'_> {
/// Parse the source and store an internal representation of the AST and/or a list of diagnostic
/// messages. If at least one of the messages is an error, there is no AST.
/// To avoid copying `source` can optionally be NUL-terminated.
pub fn parse(source: &str) -> ParsedJS {
pub fn parse<'a>(source: &'a NullTerminatedBuf) -> ParsedJS<'a> {
ParsedJS {
parser: HermesParser::parse(source),
}
Expand Down Expand Up @@ -95,8 +96,12 @@ impl std::fmt::Display for ParseError {

/// This is a simple function that is intended to be used mostly for testing.
/// When there are errors, it returns only the first error.
/// It checks if the input is already null-terminated and avoids making the copy in that case.
/// Note that if the null terminator is truly present in the input, it would parse successfully
/// what ought to be an error.
pub fn parse(source: &str) -> Result<ast::NodePtr, ParseError> {
let parsed = ParsedJS::parse(source);
let buf = NullTerminatedBuf::from_str_check(source);
let parsed = ParsedJS::parse(&buf);
if let Some(ast) = parsed.to_ast() {
Ok(ast)
} else {
Expand Down
3 changes: 3 additions & 0 deletions unsupported/juno/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,6 @@ pub mod hermes_utf;

#[allow(dead_code)]
pub mod hparser;

#[allow(dead_code)]
pub mod nullbuf;
113 changes: 113 additions & 0 deletions unsupported/juno/src/nullbuf.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/

use std::io::{BufReader, Read};
use std::os::raw::c_char;

/// A byte buffer whose last byte is always a null terminator.
///
/// The bytes are either owned by the buffer (read from a file or copied from
/// a caller's slice) or borrowed from input that was already terminated.
pub struct NullTerminatedBuf<'a>(Inner<'a>);

// The storage lives in a separate private enum because enum variants cannot
// be made private on their own.
enum Inner<'a> {
    /// Bytes owned by the buffer, terminator included.
    Own(Vec<u8>),
    /// Borrowed bytes, terminator included.
    Ref(&'a [u8]),
}

impl NullTerminatedBuf<'_> {
    /// Create a borrowing view of an existing `NullTerminatedBuf`, so that it
    /// can be handed around by value without copying the bytes.
    pub fn as_ref_buf<'a>(buf: &'a NullTerminatedBuf<'a>) -> NullTerminatedBuf<'a> {
        let bytes = buf.as_bytes();
        NullTerminatedBuf(Inner::Ref(bytes))
    }

    /// Read all of `f` and append a null terminator.
    ///
    /// # Errors
    /// Returns the I/O error reported while reading the file.
    pub fn from_file(f: &mut std::fs::File) -> Result<NullTerminatedBuf, std::io::Error> {
        // TODO: this is an extremely naive implementation, it can be optimized in multiple ways:
        //   - obtain the size of the file and perform a single allocation and few syscalls
        //   - memory map the file
        //   - just use LLVM's MemoryBuffer
        // One problem is that there isn't an obvious way in Rust to check portably whether
        // something has a fixed size and is memory mappable (i.e. is not a pipe).

        let mut contents: Vec<u8> = Vec::new();
        BufReader::new(f).read_to_end(&mut contents)?;
        contents.push(0);

        Ok(NullTerminatedBuf(Inner::Own(contents)))
    }

    /// Copy `s` into a new owned buffer and append a null terminator.
    pub fn from_slice_copy(s: &[u8]) -> NullTerminatedBuf {
        // Concatenating with a one-byte slice yields `s` followed by the terminator.
        let owned: Vec<u8> = [s, &[0u8][..]].concat();
        NullTerminatedBuf(Inner::Own(owned))
    }

    /// Wrap a slice that may already be null-terminated.
    ///
    /// When the last byte of `s` is zero the slice is borrowed as is;
    /// otherwise (including when `s` is empty) a terminated copy is made.
    pub fn from_slice_check(s: &[u8]) -> NullTerminatedBuf {
        let already_terminated = s.last() == Some(&0);
        if already_terminated {
            return NullTerminatedBuf(Inner::Ref(s));
        }
        Self::from_slice_copy(s)
    }

    /// Copy the bytes of `s` into a new owned buffer and append a null terminator.
    pub fn from_str_copy(s: &str) -> NullTerminatedBuf {
        let bytes = s.as_bytes();
        Self::from_slice_copy(bytes)
    }

    /// Wrap a string that may already be null-terminated.
    ///
    /// When the last byte of `s` is zero the string is borrowed as is;
    /// otherwise a terminated copy is made.
    pub fn from_str_check(s: &str) -> NullTerminatedBuf {
        let bytes = s.as_bytes();
        Self::from_slice_check(bytes)
    }

    /// Number of bytes in the buffer, counting the null terminator.
    pub fn len(&self) -> usize {
        self.as_bytes().len()
    }

    /// Placeholder that always returns `false`: the buffer holds at least the
    /// null terminator, so it is never empty.
    pub fn is_empty(&self) -> bool {
        false
    }

    /// A pointer to the first byte of the buffer.
    /// # Safety
    /// It is not really unsafe, but is intended to be used in an unsafe context.
    pub unsafe fn as_ptr(&self) -> *const u8 {
        let bytes = self.as_bytes();
        bytes.as_ptr()
    }

    /// The same pointer as `as_ptr`, typed as a C `const char *`.
    /// # Safety
    /// It is not really unsafe, but is intended to be used in an unsafe context.
    pub unsafe fn as_c_char_ptr(&self) -> *const c_char {
        self.as_bytes().as_ptr() as *const c_char
    }

    /// All bytes of the buffer, terminator included, whichever variant holds them.
    fn as_bytes(&self) -> &[u8] {
        match self.0 {
            Inner::Own(ref owned) => &owned[..],
            Inner::Ref(borrowed) => borrowed,
        }
    }
}

impl AsRef<[u8]> for NullTerminatedBuf<'_> {
    /// Borrow the whole buffer, terminator included, as a byte slice.
    fn as_ref(&self) -> &[u8] {
        self.as_bytes()
    }
}

0 comments on commit f1150ce

Please sign in to comment.