Bugfixes coming from fuzzing (blocking release) (untitaker#22)
untitaker authored Dec 16, 2021
1 parent e3ad8a9 commit 4b53622
Showing 5 changed files with 108 additions and 60 deletions.
10 changes: 10 additions & 0 deletions fuzz/Makefile
@@ -1,6 +1,14 @@
# Confirm that really only the current html5gum should be fuzzed. This will
# find panics and segfaults at best, not any other wrong behavior.
export FUZZ_BASIC := 0
# Add latest released html5gum to fuzzing
export FUZZ_OLD_HTML5GUM := 0
# Add html5ever to fuzzing
export FUZZ_HTML5EVER := 0
# Ignore errors while diffing to paper over bugs in old html5gum
export FUZZ_IGNORE_PARSE_ERRORS := 0
# CLI arguments to pass to AFL. useful for multiprocessing
export _AFL_OPTS := -M fuzzer01

AFL_TARGET_BIN=target-afl/debug/html5gum-fuzz-afl

@@ -41,6 +49,8 @@ afl-skip:
break; \
done

whatsup:
cargo afl whatsup -s out/

cli:
cargo run --bin html5gum-fuzz-cli
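
The new FUZZ_IGNORE_PARSE_ERRORS switch, like the other FUZZ_* variables, is read by the harness from the environment at runtime. A minimal sketch of that pattern, using a hypothetical helper (the real harness reads the variables directly in fuzz/src/testcase.rs):

```rust
use std::env;

/// Returns true when a FUZZ_* switch is set to "1". Hypothetical helper for
/// illustration; the harness itself calls env::var(...) inline.
fn flag_enabled(name: &str) -> bool {
    env::var(name).map(|value| value == "1").unwrap_or(false)
}

fn main() {
    if flag_enabled("FUZZ_IGNORE_PARSE_ERRORS") {
        // When set, error tokens are dropped before diffing the old and new
        // tokenizer output, papering over known bugs in the old release.
        println!("diffing without parse errors");
    }
}
```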
22 changes: 16 additions & 6 deletions fuzz/src/testcase.rs
@@ -20,14 +20,24 @@ pub fn run(s: &str) {
if env::var("FUZZ_OLD_HTML5GUM").unwrap() == "1" {
let reference_tokenizer = html5gum_old::Tokenizer::new(s).infallible();
let testing_tokenizer = html5gum::Tokenizer::new(s).infallible();
let mut testing_tokens = Vec::new();
let mut reference_tokens = Vec::new();
for (a, b) in testing_tokenizer.zip(reference_tokenizer) {
// Check equality of two different versions of Token by... stringifying them
testing_tokens.push(format!("{:?}", a));
reference_tokens.push(format!("{:?}", b));

let mut testing_tokens: Vec<_> = testing_tokenizer.collect();
let mut reference_tokens: Vec<_> = reference_tokenizer.collect();

if env::var("FUZZ_IGNORE_PARSE_ERRORS").unwrap() == "1" {
testing_tokens.retain(|x| !matches!(x, html5gum::Token::Error(_)));
reference_tokens.retain(|x| !matches!(x, html5gum_old::Token::Error(_)));
}

let testing_tokens: Vec<_> = testing_tokens
.into_iter()
.map(|x| format!("{:?}", x))
.collect();
let reference_tokens: Vec<_> = reference_tokens
.into_iter()
.map(|x| format!("{:?}", x))
.collect();

assert_eq!(testing_tokens, reference_tokens);
did_anything = true;
}
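Because the current and the previously released html5gum expose distinct Token types, the harness compares the two streams by their Debug output rather than with ==. A standalone sketch of that comparison strategy, with plain iterators standing in for the two tokenizers:

```rust
use std::fmt::Debug;

/// Stringify two token streams via Debug and assert they are identical.
/// Using Debug output sidesteps the fact that the two Token types come from
/// different crate versions and cannot be compared directly.
fn assert_same_tokens<A: Debug, B: Debug>(
    testing: impl Iterator<Item = A>,
    reference: impl Iterator<Item = B>,
) {
    let testing: Vec<String> = testing.map(|t| format!("{:?}", t)).collect();
    let reference: Vec<String> = reference.map(|t| format!("{:?}", t)).collect();
    assert_eq!(testing, reference);
}

fn main() {
    // Stand-in streams; in the harness these are the two tokenizers' outputs.
    assert_same_tokens(["<a>", "hi"].into_iter(), ["<a>", "hi"].into_iter());
}
```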
100 changes: 47 additions & 53 deletions src/read_helper.rs
@@ -6,13 +6,15 @@ use crate::utils::{control_pat, ctostr, noncharacter_pat, surrogate_pat};

pub(crate) struct ReadHelper<R: Reader> {
reader: R,
last_character_was_cr: bool,
to_reconsume: Stack2<Option<char>>,
}

impl<R: Reader> ReadHelper<R> {
pub(crate) fn new(reader: R) -> Self {
ReadHelper {
reader,
last_character_was_cr: false,
to_reconsume: Default::default(),
}
}
@@ -22,29 +24,27 @@ impl<R: Reader> ReadHelper<R> {
&mut self,
emitter: &mut E,
) -> Result<Option<char>, R::Error> {
let (c_res, reconsumed) = match self.to_reconsume.pop() {
Some(c) => (Ok(c), true),
None => (self.reader.read_char(), false),
let mut c = match self.to_reconsume.pop() {
Some(c) => return Ok(c),
None => self.reader.read_char(),
};

let mut c = match c_res {
Ok(Some(c)) => c,
res => return res,
};

if c == '\r' {
c = '\n';
let c2 = self.reader.read_char()?;
if c2 != Some('\n') {
self.unread_char(c2);
}
if self.last_character_was_cr && matches!(c, Ok(Some('\n'))) {
self.last_character_was_cr = false;
return self.read_char(emitter);
}

if !reconsumed {
Self::validate_char(emitter, c);
if matches!(c, Ok(Some('\r'))) {
self.last_character_was_cr = true;
c = Ok(Some('\n'));
} else {
self.last_character_was_cr = false;
}

Ok(Some(c))
if let Ok(Some(x)) = c {
Self::validate_char(emitter, x);
}
c
}

#[inline]
@@ -54,6 +54,7 @@ impl<R: Reader> ReadHelper<R> {
case_sensitive: bool,
) -> Result<bool, R::Error> {
debug_assert!(!s.is_empty());
debug_assert!(!s.contains('\r'));

let to_reconsume_bak = self.to_reconsume;
let mut chars = s.chars();
@@ -70,6 +71,8 @@ impl<R: Reader> ReadHelper<R> {
return Ok(false);
}

self.last_character_was_cr = false;

self.reader.try_read_string(s, case_sensitive)
}

@@ -86,48 +89,39 @@ impl<R: Reader> ReadHelper<R> {
None => (),
}

let mut last_character_was_cr = false;

loop {
let rv = self.reader.read_until(needle, |xs| {
let xs = match xs {
Some(xs) => xs,
None => {
last_character_was_cr = false;
return read_cb(None, emitter);
}
};

let mut last_i = 0;
if last_character_was_cr && xs.starts_with('\n') {
last_i = 1;
}

for (i, _) in xs.match_indices('\r') {
let xs2 = &xs[last_i..i];
for x in xs2.chars() {
Self::validate_char(emitter, x);
}
read_cb(Some(xs2), emitter);
read_cb(Some("\n"), emitter);
last_i = i + 1;
if xs.as_bytes().get(last_i) == Some(&b'\n') {
last_i += 1;
}
let last_character_was_cr = &mut self.last_character_was_cr;

const MAX_NEEDLE_LEN: usize = 13;
let mut needle2 = ['\0'; MAX_NEEDLE_LEN];
// Assert that we will have space for adding \r
// If not, just bump MAX_NEEDLE_LEN
debug_assert!(needle.len() < needle2.len());
needle2[..needle.len()].copy_from_slice(needle);
needle2[needle.len()] = '\r';
let needle2_slice = &needle2[..needle.len() + 1];

self.reader.read_until(needle2_slice, |xs| match xs {
Some("\r") => {
*last_character_was_cr = true;
read_cb(Some("\n"), emitter)
}
Some(mut xs) => {
if *last_character_was_cr && xs.starts_with('\n') {
xs = &xs[1..];
}

let xs2 = &xs[last_i..];
for x in xs2.chars() {
for x in xs.chars() {
Self::validate_char(emitter, x);
}
last_character_was_cr = xs.ends_with('\r');
read_cb(Some(xs2), emitter)
});

if !last_character_was_cr {
break rv;
*last_character_was_cr = false;
read_cb(Some(xs), emitter)
}
}
None => {
*last_character_was_cr = false;
read_cb(None, emitter)
}
})
}

#[inline]
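The substantive change here swaps the old look-ahead CR handling (read one more character and unread it if it is not LF) for a persistent last_character_was_cr flag, so the spec-mandated newline normalization (both "\r\n" and bare "\r" become "\n") also works across read_until chunks and try_read_string. A self-contained sketch of the same idea over a plain string, not the crate's Reader-based code:

```rust
/// Normalize "\r\n" and bare "\r" to "\n", tracking a carried CR flag instead
/// of looking ahead; the same idea as ReadHelper's last_character_was_cr.
fn normalize_newlines(input: &str) -> String {
    let mut out = String::with_capacity(input.len());
    let mut last_character_was_cr = false;
    for c in input.chars() {
        match c {
            '\r' => {
                // Emit the normalized LF now and remember that it came from a CR.
                out.push('\n');
                last_character_was_cr = true;
            }
            '\n' if last_character_was_cr => {
                // The LF was already emitted for the preceding CR; skip it.
                last_character_was_cr = false;
            }
            _ => {
                out.push(c);
                last_character_was_cr = false;
            }
        }
    }
    out
}

fn main() {
    assert_eq!(normalize_newlines("a\r\nb\rc\n"), "a\nb\nc\n");
}
```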
25 changes: 25 additions & 0 deletions tests/custom-html5lib-tests/custom.test
@@ -0,0 +1,25 @@
{"tests": [

{"description":"CR in tag preventing state change in fast_read_char (found via fuzzer)",
"input":"<a\r\"",
"output":[],
"errors":[
{"code": "unexpected-character-in-attribute-name", "line": 1, "col": 2},
{"code": "eof-in-tag", "line": 1, "col": 5}
]},

{"description": "CR \\u0001",
"input":"\r\u0001",
"output":[["Character","\n\u0001"]],
"errors": [
{"code": "control-character-in-input-stream", "line": 1, "col": 2}
]},

{"description": "comment less than sign bang dash CR",
"input":"<!--<!-\r",
"output":[["Comment", "<!-\n"]],
"errors": [
{"code": "eof-in-comment", "line": 1, "col": 5}
]}

]}
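
The new file follows the html5lib-tests tokenizer format: each case carries a description, a raw input, the expected token output, and the expected parse errors with line/column positions. A rough serde sketch of that shape (the field layout is an assumption for illustration; the authoritative definition is the Tests struct in tests/test_html5lib.rs, and the sketch requires the serde and serde_json crates):

```rust
use serde::Deserialize;

/// Approximate shape of an html5lib-tests tokenizer ".test" file.
/// Illustrative only; see the Tests struct in tests/test_html5lib.rs.
#[derive(Deserialize, Debug)]
struct TestFile {
    tests: Vec<TestCase>,
}

#[derive(Deserialize, Debug)]
struct TestCase {
    description: String,
    input: String,
    /// Each entry is a token such as ["Character", "..."] or ["Comment", "..."].
    output: Vec<serde_json::Value>,
    #[serde(default)]
    errors: Vec<ExpectedError>,
}

#[derive(Deserialize, Debug)]
struct ExpectedError {
    code: String,
    line: u64,
    col: u64,
}

fn main() {
    let raw = r#"{"tests": [{
        "description": "CR \u0001",
        "input": "\r\u0001",
        "output": [["Character", "\n\u0001"]],
        "errors": [{"code": "control-character-in-input-stream", "line": 1, "col": 2}]
    }]}"#;
    let file: TestFile = serde_json::from_str(raw).expect("valid test JSON");
    assert_eq!(file.tests[0].output.len(), 1);
}
```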
11 changes: 10 additions & 1 deletion tests/test_html5lib.rs
@@ -179,7 +179,16 @@ struct Tests {
}

#[test_generator::test_resources("tests/html5lib-tests/tokenizer/*.test")]
fn test_tokenizer_file(resource_name: &str) {
fn test_html5lib(resource_name: &str) {
run_html5lib_tokenizer_test(resource_name)
}

#[test_generator::test_resources("tests/custom-html5lib-tests/*.test")]
fn test_custom_html5lib(resource_name: &str) {
run_html5lib_tokenizer_test(resource_name)
}

fn run_html5lib_tokenizer_test(resource_name: &str) {
let path = Path::new(resource_name);
let fname = path.file_name().unwrap().to_str().unwrap();

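test_generator::test_resources expands into one #[test] function per file matching the glob and passes the matched path in as the argument; factoring the body into run_html5lib_tokenizer_test lets the upstream html5lib suite and the new custom directory share it. A minimal example of the attribute, following the test-generator crate's documented usage (the glob here is hypothetical):

```rust
// Generates one #[test] per file matching the glob; each generated test
// receives the matched path. Pattern follows the test-generator crate's docs.
#[test_generator::test_resources("tests/data/*.txt")]
fn resource_exists(resource: &str) {
    assert!(std::path::Path::new(resource).exists());
}
```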
