Skip to content

Commit

Permalink
ARROW-10999: [Rust] [Benchmarks] Use signed ints for TPC-H schema
Browse files Browse the repository at this point in the history
The TPC-H Parquet files generated by the benchmark crate could not be read by Apache Spark because they used unsigned ints, which Spark does not support (the JVM only has signed ints).

I would like to use the same data sets for benchmarking DataFusion, Apache Spark, and other tools, so I have changed the schema to use signed ints.

Closes apache#8980 from andygrove/tpch-signed-ints

Authored-by: Andy Grove <[email protected]>
Signed-off-by: Jorge C. Leitao <[email protected]>
  • Loading branch information
andygrove authored and jorgecarleitao committed Dec 22, 2020
1 parent 22f06bb commit ab185d5
Showing 1 changed file with 23 additions and 19 deletions.
42 changes: 23 additions & 19 deletions rust/benchmarks/src/bin/tpch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1095,65 +1095,69 @@ fn get_table(
}

fn get_schema(table: &str) -> Schema {
// note that the schema intentionally uses signed integers so that any generated Parquet
// files can also be used to benchmark tools that only support signed integers, such as
// Apache Spark

match table {
"part" => Schema::new(vec![
Field::new("p_partkey", DataType::UInt32, false),
Field::new("p_partkey", DataType::Int32, false),
Field::new("p_name", DataType::Utf8, false),
Field::new("p_mfgr", DataType::Utf8, false),
Field::new("p_brand", DataType::Utf8, false),
Field::new("p_type", DataType::Utf8, false),
Field::new("p_size", DataType::UInt32, false),
Field::new("p_size", DataType::Int32, false),
Field::new("p_container", DataType::Utf8, false),
Field::new("p_retailprice", DataType::Float64, false), // decimal
Field::new("p_comment", DataType::Utf8, false),
]),

"supplier" => Schema::new(vec![
Field::new("s_suppkey", DataType::UInt32, false),
Field::new("s_suppkey", DataType::Int32, false),
Field::new("s_name", DataType::Utf8, false),
Field::new("s_address", DataType::Utf8, false),
Field::new("s_nationkey", DataType::UInt32, false),
Field::new("s_nationkey", DataType::Int32, false),
Field::new("s_phone", DataType::Utf8, false),
Field::new("s_acctbal", DataType::Float64, false), // decimal
Field::new("s_comment", DataType::Utf8, false),
]),

"partsupp" => Schema::new(vec![
Field::new("ps_partkey", DataType::UInt32, false),
Field::new("ps_suppkey", DataType::UInt32, false),
Field::new("ps_availqty", DataType::UInt32, false),
Field::new("ps_partkey", DataType::Int32, false),
Field::new("ps_suppkey", DataType::Int32, false),
Field::new("ps_availqty", DataType::Int32, false),
Field::new("ps_supplycost", DataType::Float64, false), // decimal
Field::new("ps_comment", DataType::Utf8, false),
]),

"customer" => Schema::new(vec![
Field::new("c_custkey", DataType::UInt32, false),
Field::new("c_custkey", DataType::Int32, false),
Field::new("c_name", DataType::Utf8, false),
Field::new("c_address", DataType::Utf8, false),
Field::new("c_nationkey", DataType::UInt32, false),
Field::new("c_nationkey", DataType::Int32, false),
Field::new("c_phone", DataType::Utf8, false),
Field::new("c_acctbal", DataType::Float64, false), // decimal
Field::new("c_mktsegment", DataType::Utf8, false),
Field::new("c_comment", DataType::Utf8, false),
]),

"orders" => Schema::new(vec![
Field::new("o_orderkey", DataType::UInt32, false),
Field::new("o_custkey", DataType::UInt32, false),
Field::new("o_orderkey", DataType::Int32, false),
Field::new("o_custkey", DataType::Int32, false),
Field::new("o_orderstatus", DataType::Utf8, false),
Field::new("o_totalprice", DataType::Float64, false), // decimal
Field::new("o_orderdate", DataType::Date32(DateUnit::Day), false),
Field::new("o_orderpriority", DataType::Utf8, false),
Field::new("o_clerk", DataType::Utf8, false),
Field::new("o_shippriority", DataType::UInt32, false),
Field::new("o_shippriority", DataType::Int32, false),
Field::new("o_comment", DataType::Utf8, false),
]),

"lineitem" => Schema::new(vec![
Field::new("l_orderkey", DataType::UInt32, false),
Field::new("l_partkey", DataType::UInt32, false),
Field::new("l_suppkey", DataType::UInt32, false),
Field::new("l_linenumber", DataType::UInt32, false),
Field::new("l_orderkey", DataType::Int32, false),
Field::new("l_partkey", DataType::Int32, false),
Field::new("l_suppkey", DataType::Int32, false),
Field::new("l_linenumber", DataType::Int32, false),
Field::new("l_quantity", DataType::Float64, false), // decimal
Field::new("l_extendedprice", DataType::Float64, false), // decimal
Field::new("l_discount", DataType::Float64, false), // decimal
Expand All @@ -1169,14 +1173,14 @@ fn get_schema(table: &str) -> Schema {
]),

"nation" => Schema::new(vec![
Field::new("n_nationkey", DataType::UInt32, false),
Field::new("n_nationkey", DataType::Int32, false),
Field::new("n_name", DataType::Utf8, false),
Field::new("n_regionkey", DataType::UInt32, false),
Field::new("n_regionkey", DataType::Int32, false),
Field::new("n_comment", DataType::Utf8, false),
]),

"region" => Schema::new(vec![
Field::new("r_regionkey", DataType::UInt32, false),
Field::new("r_regionkey", DataType::Int32, false),
Field::new("r_name", DataType::Utf8, false),
Field::new("r_comment", DataType::Utf8, false),
]),
Expand Down

0 comments on commit ab185d5

Please sign in to comment.