rewrite entire crate to run on bytes (untitaker#25)
untitaker authored Jan 14, 2022
1 parent 762d19b commit ad1bbf9
Showing 23 changed files with 1,293 additions and 960 deletions.
9 changes: 9 additions & 0 deletions Cargo.toml
@@ -11,10 +11,12 @@ version = "0.3.0"
include = ["src/**/*", "LICENSE", "README.md"]

[dev-dependencies]
+criterion = "0.3.5"
pretty_assertions = "1.0.0"
serde = { version = "1.0.130", features = ["derive"] }
serde_json = "1.0.71"
test-generator = "0.3.0"
+serde_bytes = "0.11.5"

[features]
# By default this crate depends on the memchr library for best performance.
@@ -28,3 +30,10 @@ integration-tests = []

[dependencies]
memchr = { version = "2.4.1", optional = true }

+[[bench]]
+name = "data_state"
+harness = false
+
+[lib]
+bench = false
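A note on the two new manifest tables (standard Cargo semantics, not stated in the diff): `harness = false` stops Cargo from building the default libtest harness for the `data_state` bench target, so criterion's `criterion_main!` can supply `main` itself, and `bench = false` on `[lib]` keeps `cargo bench` from also trying to benchmark the library target. The benchmark would then typically be run with `cargo bench --bench data_state`.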
10 changes: 5 additions & 5 deletions README.md
@@ -15,13 +15,13 @@ let mut new_html = String::new();
for token in Tokenizer::new(html).infallible() {
match token {
Token::StartTag(tag) => {
-write!(new_html, "<{}>", tag.name).unwrap();
+write!(new_html, "<{}>", String::from_utf8_lossy(&tag.name)).unwrap();
}
Token::String(hello_world) => {
-write!(new_html, "{}", hello_world).unwrap();
+write!(new_html, "{}", String::from_utf8_lossy(&hello_world)).unwrap();
}
Token::EndTag(tag) => {
-write!(new_html, "</{}>", tag.name).unwrap();
+write!(new_html, "</{}>", String::from_utf8_lossy(&tag.name)).unwrap();
}
_ => panic!("unexpected input"),
}
@@ -38,8 +38,8 @@ test suite](https://github.com/html5lib/html5lib-tests/tree/master/tokenizer). S

* `html5gum` **does not** [implement charset
detection.](https://html.spec.whatwg.org/#determining-the-character-encoding)
-This implementation requires all input to be Rust strings and therefore valid
-UTF-8.
+This implementation takes and returns bytes, but assumes UTF-8. It recovers
+gracefully from invalid UTF-8.
* `html5gum` **does not** [correct mis-nested
tags.](https://html.spec.whatwg.org/#an-introduction-to-error-handling-and-strange-cases-in-the-parser)
* `html5gum` **does not** recognize implicitly self-closing elements like
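To make the new contract concrete, here is a minimal sketch (illustrative only, not part of this commit) of feeding non-UTF-8 bytes through the tokenizer; it relies on the same byte-slice input and String::from_utf8_lossy decoding that the README example above and the fuzz target below use:

    use html5gum::{Token, Tokenizer};

    fn main() {
        // 0xE9 on its own is not valid UTF-8; the tokenizer still accepts the bytes.
        let html: &[u8] = b"<p>caf\xE9</p>";
        for token in Tokenizer::new(html).infallible() {
            if let Token::String(s) = token {
                // decoding is the caller's job; lossy decoding maps invalid
                // sequences to U+FFFD replacement characters
                println!("{}", String::from_utf8_lossy(&s));
            }
        }
    }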
14 changes: 14 additions & 0 deletions benches/data_state.rs
@@ -0,0 +1,14 @@
+use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
+use html5gum::Tokenizer;
+
+fn data_state(c: &mut Criterion) {
+    for i in [100, 1000, 10000, 1000000] {
+        let s: String = (0..i).map(|_| 'a').collect();
+        c.bench_with_input(BenchmarkId::new("aaa", i), &s, |b, s| {
+            b.iter(|| for _ in Tokenizer::new(s).infallible() {})
+        });
+    }
+}
+
+criterion_group!(benches, data_state);
+criterion_main!(benches);
4 changes: 2 additions & 2 deletions examples/tokenize.rs
@@ -1,9 +1,9 @@
//! Lets you easily try out the tokenizer with e.g.
//! printf '<h1>Hello world!</h1>' | cargo run --example=tokenize
-use html5gum::{BufReadReader, Tokenizer};
+use html5gum::{IoReader, Tokenizer};

fn main() {
-for token in Tokenizer::new(BufReadReader::new(std::io::stdin().lock())).flatten() {
+for token in Tokenizer::new(IoReader::new(std::io::stdin().lock())).flatten() {
println!("{:?}", token);
}
}
26 changes: 18 additions & 8 deletions fuzz/Makefile
@@ -6,6 +6,11 @@ export FUZZ_OLD_HTML5GUM := 0
# Add html5ever to fuzzing
export FUZZ_HTML5EVER := 0
# Ignore errors while diffing to paper over bugs in old html5gum
+# order = ignore sort order of errors relative to other non-error
+#         tokens (not relative to each other)
+# 1 = ignore all errors
+# if-reference-contains:XYZ = ignore all errors if one of the ones emitted by old html5gum is equal to XYZ (e.g. "if-reference-contains:duplicate-attribute")
+# any other value = assert exact token equality
export FUZZ_IGNORE_PARSE_ERRORS := 0
# CLI arguments to pass to AFL. useful for multiprocessing
export _AFL_OPTS := -M fuzzer01
@@ -21,22 +26,25 @@ in:
curl https://docs.sentry.io/ > in/sentrydocs.html
cat ../tests/html5lib-tests/tokenizer/*.tests | jq -r '.tests[].input' > in/html5lib-tokenizer-tests.html

-setup-afl: in
+setup-afl:
which cargo-afl || cargo install afl
CARGO_TARGET_DIR=./target-afl/ cargo afl build --bin html5gum-fuzz-afl

-afl: setup-afl
+afl: in setup-afl
CARGO_TARGET_DIR=./target-afl/ AFL_AUTORESUME=1 cargo afl fuzz $$_AFL_OPTS -i in -o out ${AFL_TARGET_BIN}

-afl-next:
-set -e && for f in $$(echo out/*/crashes/id:* | sort); do \
+afl-next: setup-afl
+set -e && for f in out/*/crashes/id:*; do \
echo $$f; \
if ! $(MAKE) cli < $$f; then \
cargo afl tmin -i $$f -o /tmp/html5gum-mintest ${AFL_TARGET_BIN}; \
-echo /tmp/html5gum-mintest; \
+echo new test input written to /tmp/html5gum-mintest; \
echo ----; \
-cat -v /tmp/html5gum-mintest; \
-echo; \
+echo "hexyl preview:"; \
+hexyl /tmp/html5gum-mintest || true; \
+echo ----; \
+echo "json string:"; \
+cat /tmp/html5gum-mintest | jq -Ra . || true; \
echo ----; \
exit 2; \
else \
@@ -45,7 +53,9 @@ afl-next:
done

afl-skip:
-set -e && for f in $$(echo out/*/crashes/id:* | sort); do \
+# relies on the same order as in afl-next, see
+# https://serverfault.com/a/122743/520816
+set -e && for f in out/*/crashes/id:*; do \
rm $$f; \
break; \
done
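The FUZZ_IGNORE_PARSE_ERRORS grammar documented at the top of this Makefile is comma-separated (see the .split(",") in fuzz/src/testcase.rs below), so instructions compose. A plausible invocation, with hypothetical values and assuming make's usual command-line variable overrides:

    make afl FUZZ_IGNORE_PARSE_ERRORS=order,if-reference-contains:duplicate-attribute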
4 changes: 1 addition & 3 deletions fuzz/src/main_afl.rs
@@ -2,8 +2,6 @@ use testcase::run;

fn main() {
afl::fuzz!(|data: &[u8]| {
-if let Ok(s) = std::str::from_utf8(data) {
-run(s);
-}
+run(data);
});
}
4 changes: 2 additions & 2 deletions fuzz/src/main_cli.rs
@@ -3,7 +3,7 @@ use std::io::Read;
use testcase::run;

fn main() {
-let mut input = String::new();
-std::io::stdin().lock().read_to_string(&mut input).unwrap();
+let mut input = Vec::new();
+std::io::stdin().lock().read_to_end(&mut input).unwrap();
run(&input);
}
159 changes: 111 additions & 48 deletions fuzz/src/testcase.rs
@@ -3,60 +3,30 @@ use std::env;
use html5ever::buffer_queue::BufferQueue;
use html5ever::tendril::format_tendril;
use html5ever::tokenizer::{TagKind, Token as Token2, TokenSinkResult, TokenizerResult};
-use html5gum::{Emitter, Reader, Token};
+use html5gum::{Doctype, Emitter, EndTag, Reader, StartTag, Token};

use pretty_assertions::assert_eq;

-pub fn run(s: &str) {
-let mut did_anything = false;
+pub fn run(s: &[u8]) {
+let mut did_anything = env::var("FUZZ_BASIC").unwrap() == "1";

-if env::var("FUZZ_BASIC").unwrap() == "1" {
-let testing_tokenizer = html5gum::Tokenizer::new(s).infallible();
-for _ in testing_tokenizer {}
-
-did_anything = true;
-}
+// unconditionally run tokenizer against raw bytes, it should never crash. we rely on running
+// in debug mode such that this is not just simply optimized away
+let testing_tokenizer = html5gum::Tokenizer::new(s).infallible();
+for _ in testing_tokenizer {}

if env::var("FUZZ_OLD_HTML5GUM").unwrap() == "1" {
-let reference_tokenizer = html5gum_old::Tokenizer::new(s).infallible();
-let testing_tokenizer = html5gum::Tokenizer::new(s).infallible();
-
-let mut testing_tokens: Vec<_> = testing_tokenizer.collect();
-let mut reference_tokens: Vec<_> = reference_tokenizer.collect();
-
-if env::var("FUZZ_IGNORE_PARSE_ERRORS").unwrap() == "1" {
-testing_tokens.retain(|x| !matches!(x, html5gum::Token::Error(_)));
-reference_tokens.retain(|x| !matches!(x, html5gum_old::Token::Error(_)));
+if let Ok(data) = std::str::from_utf8(s) {
+run_old_html5gum(data);
}

-let testing_tokens: Vec<_> = testing_tokens
-.into_iter()
-.map(|x| format!("{:?}", x))
-.collect();
-let reference_tokens: Vec<_> = reference_tokens
-.into_iter()
-.map(|x| format!("{:?}", x))
-.collect();
-
-assert_eq!(testing_tokens, reference_tokens);
did_anything = true;
}

if env::var("FUZZ_HTML5EVER").unwrap() == "1" {
-let mut reference_tokenizer = html5ever::tokenizer::Tokenizer::new(
-TokenSink {
-testing_tokenizer: html5gum::Tokenizer::new(s),
-},
-Default::default(),
-);
-let mut queue = BufferQueue::new();
-queue.push_back(format_tendril!("{}", s));
-
-assert!(matches!(
-reference_tokenizer.feed(&mut queue),
-TokenizerResult::Done
-));
-reference_tokenizer.end();
+if let Ok(data) = std::str::from_utf8(s) {
+run_html5ever(data);
+}

did_anything = true;
}
Expand All @@ -66,6 +36,95 @@ pub fn run(s: &str) {
}
}

+fn run_old_html5gum(s: &str) {
+    let reference_tokenizer = html5gum_old::Tokenizer::new(s).infallible();
+    let testing_tokenizer = html5gum::Tokenizer::new(s).infallible();
+
+    let mut testing_tokens: Vec<_> = testing_tokenizer.collect();
+    let mut reference_tokens: Vec<_> = reference_tokenizer.collect();
+
+    fn isnt_error(x: &html5gum::Token) -> bool {
+        !matches!(*x, html5gum::Token::Error(_))
+    }
+
+    fn isnt_old_error(x: &html5gum_old::Token) -> bool {
+        !matches!(*x, html5gum_old::Token::Error(_))
+    }
+
+    for instruction in env::var("FUZZ_IGNORE_PARSE_ERRORS")
+        .unwrap()
+        .as_str()
+        .trim()
+        .split(",")
+    {
+        match instruction {
+            "1" => {
+                testing_tokens.retain(isnt_error);
+                reference_tokens.retain(isnt_old_error);
+            }
+            "order" => {
+                testing_tokens.sort_by_key(isnt_error);
+                reference_tokens.sort_by_key(isnt_old_error);
+            }
+            x if x.starts_with("if-reference-contains:") => {
+                if reference_tokens.contains(&html5gum_old::Token::Error(
+                    x["if-reference-contains:".len()..].parse().unwrap(),
+                )) {
+                    reference_tokens.retain(isnt_old_error);
+                    testing_tokens.retain(isnt_error);
+                }
+            }
+            x => panic!("unknown FUZZ_IGNORE_PARSE_ERRORS instruction: {}", x),
+        }
+    }
+
+    let reference_tokens: Vec<_> = reference_tokens
+        .into_iter()
+        .map(|x| match x {
+            html5gum_old::Token::String(x) => Token::String(x.into_bytes()),
+            html5gum_old::Token::Comment(x) => Token::Comment(x.into_bytes()),
+            html5gum_old::Token::StartTag(x) => Token::StartTag(StartTag {
+                name: x.name.into_bytes(),
+                attributes: x
+                    .attributes
+                    .into_iter()
+                    .map(|(k, v)| (k.into_bytes(), v.into_bytes()))
+                    .collect(),
+                self_closing: x.self_closing,
+            }),
+            html5gum_old::Token::EndTag(x) => Token::EndTag(EndTag {
+                name: x.name.into_bytes(),
+            }),
+            html5gum_old::Token::Error(x) => Token::Error(x.to_string().parse().unwrap()),
+            html5gum_old::Token::Doctype(x) => Token::Doctype(Doctype {
+                name: x.name.into_bytes(),
+                force_quirks: x.force_quirks,
+                public_identifier: x.public_identifier.map(String::into_bytes),
+                system_identifier: x.system_identifier.map(String::into_bytes),
+            }),
+        })
+        .collect();
+
+    assert_eq!(testing_tokens, reference_tokens);
+}
+
+fn run_html5ever(s: &str) {
+    let mut reference_tokenizer = html5ever::tokenizer::Tokenizer::new(
+        TokenSink {
+            testing_tokenizer: html5gum::Tokenizer::new(s),
+        },
+        Default::default(),
+    );
+    let mut queue = BufferQueue::new();
+    queue.push_back(format_tendril!("{}", s));
+
+    assert!(matches!(
+        reference_tokenizer.feed(&mut queue),
+        TokenizerResult::Done
+    ));
+    reference_tokenizer.end();
+}

struct TokenSink<R: Reader, E: Emitter> {
testing_tokenizer: html5gum::Tokenizer<R, E>,
}
@@ -98,31 +157,35 @@ impl<R: Reader, E: Emitter<Token = Token>> html5ever::tokenizer::TokenSink for T
match (token, reference_token) {
(Some(Token::StartTag(tag)), Token2::TagToken(tag2)) => {
assert_eq!(tag2.kind, TagKind::StartTag);
-assert_eq!(tag.name, tag2.name.as_ref());
+assert_eq!(tag.name, tag2.name.as_ref().as_bytes());
}
(Some(Token::EndTag(tag)), Token2::TagToken(tag2)) => {
assert_eq!(tag2.kind, TagKind::EndTag);
-assert_eq!(tag.name, tag2.name.as_ref());
+assert_eq!(tag.name, tag2.name.as_ref().as_bytes());
}
(None, Token2::EOFToken) => {}
(Some(Token::Comment(comment)), Token2::CommentToken(comment2)) => {
-assert_eq!(comment, comment2.as_ref());
+assert_eq!(comment, comment2.as_ref().as_bytes());
}
(Some(Token::Doctype(doctype)), Token2::DoctypeToken(doctype2)) => {
assert_eq!(
doctype.name,
doctype2
.name
-.map(|x| x.as_ref().to_owned())
+.map(|x| x.as_ref().to_owned().into_bytes())
.unwrap_or_default()
);
assert_eq!(
doctype.public_identifier,
-doctype2.public_id.map(|x| x.as_ref().to_owned())
+doctype2
+.public_id
+.map(|x| x.as_ref().to_owned().into_bytes())
);
assert_eq!(
doctype.system_identifier,
-doctype2.system_id.map(|x| x.as_ref().to_owned())
+doctype2
+.system_id
+.map(|x| x.as_ref().to_owned().into_bytes())
);
assert_eq!(doctype.force_quirks, doctype2.force_quirks);
}
29 changes: 29 additions & 0 deletions src/arrayvec.rs
@@ -0,0 +1,29 @@
+/// This is basically like the arrayvec crate, except crappier, only the subset I need and
+/// therefore without unsafe Rust.
+pub struct ArrayVec<T: Copy, const CAP: usize> {
+    content: [T; CAP],
+    len: usize,
+}
+
+impl<T: Copy, const CAP: usize> ArrayVec<T, CAP> {
+    pub fn new(filler_item: T) -> Self {
+        // filler_item is there to avoid usage of MaybeUninit, and can literally be anything at
+        // all.
+        ArrayVec {
+            content: [filler_item; CAP],
+            len: 0,
+        }
+    }
+
+    pub fn push(&mut self, item: T) {
+        self.content[self.len] = item;
+        self.len += 1;
+    }
+
+    pub fn drain(&mut self) -> &[T] {
+        let rv = &self.content[..self.len];
+        self.len = 0;
+        rv
+    }
+}
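A quick usage sketch of this helper (illustrative only, not part of the commit; the capacity and filler value are arbitrary):

    // push() writes into the fixed backing array; drain() hands back the
    // initialized prefix and resets len, so the buffer can be reused without
    // reallocating. Pushing more than CAP items would panic on the array index.
    let mut buf: ArrayVec<u8, 8> = ArrayVec::new(0);
    buf.push(b'&');
    buf.push(b'a');
    assert_eq!(buf.drain(), b"&a");
    assert_eq!(buf.drain(), b""); // already drained, so now empty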