diff --git a/rust/arrow/examples/builders.rs b/rust/arrow/examples/builders.rs index f9ba2974ef7c8..c1aede81023ce 100644 --- a/rust/arrow/examples/builders.rs +++ b/rust/arrow/examples/builders.rs @@ -23,8 +23,8 @@ use arrow::builder::Int32Builder; fn main() { // Primitive Arrays // - // Primitive arrays are arrays of fixed-width primitive types (bool, u8, u16, u32, u64, i8, i16, - // i32, i64, f32, f64) + // Primitive arrays are arrays of fixed-width primitive types (bool, u8, u16, u32, + // u64, i8, i16, i32, i64, f32, f64) // Create a new builder with a capacity of 100 let mut primitive_array_builder = Int32Builder::new(100); diff --git a/rust/arrow/src/array.rs b/rust/arrow/src/array.rs index 69207f6926592..127edb9a9e350 100644 --- a/rust/arrow/src/array.rs +++ b/rust/arrow/src/array.rs @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. -//! Defines public types representing Apache Arrow arrays. Arrow's specification defines an array as -//! "a sequence of values with known length all having the same type." For example, the type -//! `Int16Array` represents an Apache Arrow array of 16-bit integers. +//! Defines public types representing Apache Arrow arrays. Arrow's specification defines +//! an array as "a sequence of values with known length all having the same type." For +//! example, the type `Int16Array` represents an Apache Arrow array of 16-bit integers. //! //! ``` //! extern crate arrow; @@ -39,11 +39,19 @@ //! // Build the array //! let array = builder.finish(); //! -//! assert_eq!(5, array.len(), "The array has 5 values, counting the null value"); +//! assert_eq!( +//! 5, +//! array.len(), +//! "The array has 5 values, counting the null value" +//! ); //! //! assert_eq!(2, array.value(2), "Get the value with index 2"); //! -//! assert_eq!(array.value_slice(3, 2), &[3, 4], "Get slice of len 2 starting at idx 3") +//! assert_eq!( +//! array.value_slice(3, 2), +//! &[3, 4], +//! "Get slice of len 2 starting at idx 3" +//! ) //! ``` use std::any::Any; @@ -153,8 +161,8 @@ pub struct PrimitiveArray { data: ArrayDataRef, /// Pointer to the value array. The lifetime of this must be <= to the value buffer /// stored in `data`, so it's safe to store. - /// Also note that boolean arrays are bit-packed, so although the underlying pointer is of type - /// bool it should be cast back to u8 before being used. + /// Also note that boolean arrays are bit-packed, so although the underlying pointer + /// is of type bool it should be cast back to u8 before being used. /// i.e. `self.raw_values.get() as *const u8` raw_values: RawPtrBox, } @@ -212,7 +220,9 @@ impl PrimitiveArray { /// Returns a raw pointer to the values of this array. pub fn raw_values(&self) -> *const T::Native { - unsafe { mem::transmute(self.raw_values.get().offset(self.data.offset() as isize)) } + unsafe { + mem::transmute(self.raw_values.get().offset(self.data.offset() as isize)) + } } /// Returns the primitive value at index `i`. @@ -288,8 +298,10 @@ macro_rules! 
def_numeric_from_vec { fn from(data: Vec>) -> Self { let data_len = data.len(); let num_bytes = bit_util::ceil(data_len, 8); - let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, false); - let mut val_buf = MutableBuffer::new(data_len * mem::size_of::<$native_ty>()); + let mut null_buf = + MutableBuffer::new(num_bytes).with_bitset(num_bytes, false); + let mut val_buf = + MutableBuffer::new(data_len * mem::size_of::<$native_ty>()); { let null = vec![0; mem::size_of::<$native_ty>()]; @@ -590,7 +602,8 @@ impl From for BinaryArray { assert_eq!( v.data().child_data()[0].child_data().len(), 0, - "BinaryArray can only be created from list array of u8 values (i.e. List>)." + "BinaryArray can only be created from list array of u8 values \ + (i.e. List>)." ); assert_eq!( v.data().child_data()[0].data_type(), @@ -627,7 +640,8 @@ impl Array for BinaryArray { } } -/// A nested array type where each child (called *field*) is represented by a separate array. +/// A nested array type where each child (called *field*) is represented by a separate +/// array. pub struct StructArray { data: ArrayDataRef, boxed_fields: Vec, @@ -759,9 +773,8 @@ mod tests { } #[test] - #[should_panic( - expected = "PrimitiveArray data should contain a single buffer only (values buffer)" - )] + #[should_panic(expected = "PrimitiveArray data should contain a single buffer only \ + (values buffer)")] fn test_primitive_array_invalid_buffer_len() { let data = ArrayData::builder(DataType::Int32).len(5).build(); Int32Array::from(data); @@ -841,9 +854,8 @@ mod tests { } #[test] - #[should_panic( - expected = "PrimitiveArray data should contain a single buffer only (values buffer)" - )] + #[should_panic(expected = "PrimitiveArray data should contain a single buffer only \ + (values buffer)")] fn test_boolean_array_invalid_buffer_len() { let data = ArrayData::builder(DataType::Boolean).len(5).build(); BooleanArray::from(data); @@ -901,7 +913,9 @@ mod tests { } #[test] - #[should_panic(expected = "ListArray data should contain a single buffer only (value offsets)")] + #[should_panic( + expected = "ListArray data should contain a single buffer only (value offsets)" + )] fn test_list_array_invalid_buffer_len() { let value_data = ArrayData::builder(DataType::Int32) .len(8) @@ -916,7 +930,9 @@ mod tests { } #[test] - #[should_panic(expected = "ListArray should contain a single child array (values array)")] + #[should_panic( + expected = "ListArray should contain a single child array (values array)" + )] fn test_list_array_invalid_child_array_len() { let value_offsets = Buffer::from(&[0, 2, 5, 7].to_byte_slice()); let list_data_type = DataType::List(Box::new(DataType::Int32)); @@ -1058,7 +1074,8 @@ mod tests { #[test] #[should_panic( - expected = "BinaryArray can only be created from List arrays, mismatched data types." + expected = "BinaryArray can only be created from List arrays, mismatched \ + data types." )] fn test_binary_array_from_incorrect_list_array_type() { let values: [u32; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]; @@ -1079,7 +1096,8 @@ mod tests { #[test] #[should_panic( - expected = "BinaryArray can only be created from list array of u8 values (i.e. List>)." + expected = "BinaryArray can only be created from list array of u8 values \ + (i.e. List>)." 
)] fn test_binary_array_from_incorrect_list_array() { let values: [u32; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]; @@ -1151,7 +1169,8 @@ mod tests { let struct_array = StructArray::from(vec![ ( Field::new("b", DataType::Boolean, false), - Arc::new(BooleanArray::from(vec![false, false, true, true])) as Arc, + Arc::new(BooleanArray::from(vec![false, false, true, true])) + as Arc, ), ( Field::new("c", DataType::Int32, false), @@ -1163,7 +1182,9 @@ mod tests { } #[test] - #[should_panic(expected = "all child arrays of a StructArray must have the same length")] + #[should_panic( + expected = "all child arrays of a StructArray must have the same length" + )] fn test_invalid_struct_child_array_lengths() { StructArray::from(vec![ ( diff --git a/rust/arrow/src/array_data.rs b/rust/arrow/src/array_data.rs index 1c96b517d9833..a24dd0115e349 100644 --- a/rust/arrow/src/array_data.rs +++ b/rust/arrow/src/array_data.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. -//! Contains `ArrayData`, a generic representation of Arrow array data which encapsulates common -//! attributes and operations for Arrow array. +//! Contains `ArrayData`, a generic representation of Arrow array data which encapsulates +//! common attributes and operations for Arrow array. use std::sync::Arc; @@ -237,7 +237,8 @@ mod tests { #[test] fn test_new() { - let arr_data = ArrayData::new(DataType::Boolean, 10, Some(1), None, 2, vec![], vec![]); + let arr_data = + ArrayData::new(DataType::Boolean, 10, Some(1), None, 2, vec![], vec![]); assert_eq!(10, arr_data.len()); assert_eq!(1, arr_data.null_count()); assert_eq!(2, arr_data.offset()); diff --git a/rust/arrow/src/array_ops.rs b/rust/arrow/src/array_ops.rs index b6afdf23de154..6e847c8b378f2 100644 --- a/rust/arrow/src/array_ops.rs +++ b/rust/arrow/src/array_ops.rs @@ -27,8 +27,12 @@ use crate::datatypes; use crate::datatypes::ArrowNumericType; use crate::error::{ArrowError, Result}; -/// Perform `left + right` operation on two arrays. If either left or right value is null then the result is also null. -pub fn add(left: &PrimitiveArray, right: &PrimitiveArray) -> Result> +/// Perform `left + right` operation on two arrays. If either left or right value is null +/// then the result is also null. +pub fn add( + left: &PrimitiveArray, + right: &PrimitiveArray, +) -> Result> where T: datatypes::ArrowNumericType, T::Native: Add @@ -40,8 +44,12 @@ where math_op(left, right, |a, b| Ok(a + b)) } -/// Perform `left - right` operation on two arrays. If either left or right value is null then the result is also null. -pub fn subtract(left: &PrimitiveArray, right: &PrimitiveArray) -> Result> +/// Perform `left - right` operation on two arrays. If either left or right value is null +/// then the result is also null. +pub fn subtract( + left: &PrimitiveArray, + right: &PrimitiveArray, +) -> Result> where T: datatypes::ArrowNumericType, T::Native: Add @@ -53,8 +61,12 @@ where math_op(left, right, |a, b| Ok(a - b)) } -/// Perform `left * right` operation on two arrays. If either left or right value is null then the result is also null. -pub fn multiply(left: &PrimitiveArray, right: &PrimitiveArray) -> Result> +/// Perform `left * right` operation on two arrays. If either left or right value is null +/// then the result is also null. 
+pub fn multiply( + left: &PrimitiveArray, + right: &PrimitiveArray, +) -> Result> where T: datatypes::ArrowNumericType, T::Native: Add @@ -66,9 +78,13 @@ where math_op(left, right, |a, b| Ok(a * b)) } -/// Perform `left / right` operation on two arrays. If either left or right value is null then the result is also null. -/// If any right hand value is zero then the result of this operation will be `Err(ArrowError::DivideByZero)`. -pub fn divide(left: &PrimitiveArray, right: &PrimitiveArray) -> Result> +/// Perform `left / right` operation on two arrays. If either left or right value is null +/// then the result is also null. If any right hand value is zero then the result of this +/// operation will be `Err(ArrowError::DivideByZero)`. +pub fn divide( + left: &PrimitiveArray, + right: &PrimitiveArray, +) -> Result> where T: datatypes::ArrowNumericType, T::Native: Add @@ -86,8 +102,9 @@ where }) } -/// Helper function to perform math lambda function on values from two arrays. If either left or -/// right value is null then the output value is also null, so `1 + null` is `null`. +/// Helper function to perform math lambda function on values from two arrays. If either +/// left or right value is null then the output value is also null, so `1 + null` is +/// `null`. fn math_op( left: &PrimitiveArray, right: &PrimitiveArray, @@ -200,7 +217,8 @@ where bool_op(left, right, |a, b| a != b) } -/// Perform `left < right` operation on two arrays. Null values are less than non-null values. +/// Perform `left < right` operation on two arrays. Null values are less than non-null +/// values. pub fn lt(left: &PrimitiveArray, right: &PrimitiveArray) -> Result where T: ArrowNumericType, @@ -213,8 +231,12 @@ where }) } -/// Perform `left <= right` operation on two arrays. Null values are less than non-null values. -pub fn lt_eq(left: &PrimitiveArray, right: &PrimitiveArray) -> Result +/// Perform `left <= right` operation on two arrays. Null values are less than non-null +/// values. +pub fn lt_eq( + left: &PrimitiveArray, + right: &PrimitiveArray, +) -> Result where T: ArrowNumericType, { @@ -226,7 +248,8 @@ where }) } -/// Perform `left > right` operation on two arrays. Non-null values are greater than null values. +/// Perform `left > right` operation on two arrays. Non-null values are greater than null +/// values. pub fn gt(left: &PrimitiveArray, right: &PrimitiveArray) -> Result where T: ArrowNumericType, @@ -239,8 +262,12 @@ where }) } -/// Perform `left >= right` operation on two arrays. Non-null values are greater than null values. -pub fn gt_eq(left: &PrimitiveArray, right: &PrimitiveArray) -> Result +/// Perform `left >= right` operation on two arrays. Non-null values are greater than null +/// values. +pub fn gt_eq( + left: &PrimitiveArray, + right: &PrimitiveArray, +) -> Result where T: ArrowNumericType, { @@ -253,7 +280,11 @@ where } /// Helper function to perform boolean lambda function on values from two arrays. -fn bool_op(left: &PrimitiveArray, right: &PrimitiveArray, op: F) -> Result +fn bool_op( + left: &PrimitiveArray, + right: &PrimitiveArray, + op: F, +) -> Result where T: ArrowNumericType, F: Fn(Option, Option) -> bool, @@ -281,7 +312,8 @@ where Ok(b.finish()) } -/// Perform `AND` operation on two arrays. If either left or right value is null then the result is also null. +/// Perform `AND` operation on two arrays. If either left or right value is null then the +/// result is also null. 
 pub fn and(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanArray> {
     if left.len() != right.len() {
         return Err(ArrowError::ComputeError(
@@ -299,7 +331,8 @@ pub fn and(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanArray> {
     Ok(b.finish())
 }
 
-/// Perform `OR` operation on two arrays. If either left or right value is null then the result is also null.
+/// Perform `OR` operation on two arrays. If either left or right value is null then the
+/// result is also null.
 pub fn or(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanArray> {
     if left.len() != right.len() {
         return Err(ArrowError::ComputeError(
@@ -317,7 +350,8 @@ pub fn or(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanArray> {
     Ok(b.finish())
 }
 
-/// Perform unary `NOT` operation on an arrays. If value is null then the result is also null.
+/// Perform unary `NOT` operation on an array. If the value is null then the result is
+/// also null.
 pub fn not(left: &BooleanArray) -> Result<BooleanArray> {
     let mut b = BooleanArray::builder(left.len());
     for i in 0..left.len() {
diff --git a/rust/arrow/src/bitmap.rs b/rust/arrow/src/bitmap.rs
index e8fce3e184d90..93b6ee83c0492 100644
--- a/rust/arrow/src/bitmap.rs
+++ b/rust/arrow/src/bitmap.rs
@@ -15,8 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Defines a bitmap, which is used to track which values in an Arrow array are null. This is called
-//! a "validity bitmap" in the Arrow documentation.
+//! Defines a bitmap, which is used to track which values in an Arrow array are null.
+//! This is called a "validity bitmap" in the Arrow documentation.
 
 use super::buffer::Buffer;
 use crate::util::bit_util;
diff --git a/rust/arrow/src/buffer.rs b/rust/arrow/src/buffer.rs
index cde38c432ae91..6172445ec821e 100644
--- a/rust/arrow/src/buffer.rs
+++ b/rust/arrow/src/buffer.rs
@@ -15,9 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! The main type in the module is `Buffer`, a contiguous immutable memory region of fixed size
-//! aligned at a 64-byte boundary. `MutableBuffer` is like `Buffer`, but it can be mutated and
-//! grown.
+//! The main type in the module is `Buffer`, a contiguous immutable memory region of
+//! fixed size aligned at a 64-byte boundary. `MutableBuffer` is like `Buffer`, but it can
+//! be mutated and grown.
 
 use std::cmp;
 use std::io::{Error as IoError, ErrorKind, Result as IoResult, Write};
@@ -182,8 +182,9 @@ impl MutableBuffer {
     /// Ensure that `count` bytes from `start` contain zero bits
     ///
-    /// This is used to initialize the bits in a buffer, however, it has no impact on the `len`
-    /// of the buffer and so can be used to initialize the memory region from `len` to `capacity`.
+    /// This is used to initialize the bits in a buffer, however, it has no impact on the
+    /// `len` of the buffer and so can be used to initialize the memory region from
+    /// `len` to `capacity`.
     pub fn set_null_bits(&mut self, start: usize, count: usize) {
         assert!(start + count <= self.capacity);
         unsafe {
@@ -219,7 +220,8 @@ impl MutableBuffer {
         } else {
             let new_capacity = bit_util::round_upto_multiple_of_64(new_len);
             if new_capacity < self.capacity {
-                let new_data = memory::reallocate(self.capacity, new_capacity, self.data)?;
+                let new_data =
+                    memory::reallocate(self.capacity, new_capacity, self.data)?;
                 self.data = new_data as *mut u8;
                 self.capacity = new_capacity;
             }
@@ -255,7 +257,9 @@ impl MutableBuffer {
     /// Returns the data stored in this buffer as a mutable slice.
     pub fn data_mut(&mut self) -> &mut [u8] {
-        unsafe { ::std::slice::from_raw_parts_mut(self.raw_data() as *mut u8, self.len()) }
+        unsafe {
+            ::std::slice::from_raw_parts_mut(self.raw_data() as *mut u8, self.len())
+        }
     }
 
     /// Returns a raw pointer for this buffer.
@@ -397,7 +401,9 @@ mod tests {
     }
 
     #[test]
-    #[should_panic(expected = "the offset of the new Buffer cannot exceed the existing length")]
+    #[should_panic(
+        expected = "the offset of the new Buffer cannot exceed the existing length"
+    )]
     fn test_slice_offset_out_of_bound() {
         let buf = Buffer::from(&[2, 4, 6, 8, 10]);
         buf.slice(6);
     }
diff --git a/rust/arrow/src/builder.rs b/rust/arrow/src/builder.rs
index 2a4b702a2738a..77dcc24f250ac 100644
--- a/rust/arrow/src/builder.rs
+++ b/rust/arrow/src/builder.rs
@@ -15,8 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Defines a `BufferBuilder` capable of creating a `Buffer` which can be used as an internal
-//! buffer in an `ArrayData` object.
+//! Defines a `BufferBuilder` capable of creating a `Buffer` which can be used as an
+//! internal buffer in an `ArrayData` object.
 
 use std::any::Any;
 use std::io::Write;
@@ -124,12 +124,13 @@ impl<T: ArrowPrimitiveType> BufferBuilderTrait<T> for BufferBuilder<T> {
 }
 
 impl<T: ArrowPrimitiveType> BufferBuilder<T> {
-    /// Writes a byte slice to the underlying buffer and updates the `len`, i.e. the number array
-    /// elements in the builder. Also, converts the `io::Result` required by the `Write` trait
-    /// to the Arrow `Result` type.
+    /// Writes a byte slice to the underlying buffer and updates the `len`, i.e. the
+    /// number of array elements in the builder. Also, converts the `io::Result`
+    /// required by the `Write` trait to the Arrow `Result` type.
     fn write_bytes(&mut self, bytes: &[u8], len_added: usize) -> Result<()> {
         let write_result = self.buffer.write(bytes);
-        // `io::Result` has many options one of which we use, so pattern matching is overkill here
+        // `io::Result` has many options, one of which we use, so pattern matching is
+        // overkill here
         if write_result.is_err() {
             Err(ArrowError::MemoryError(
                 "Could not write to Buffer, not big enough".to_string(),
@@ -432,13 +433,14 @@ where
         let offset_buffer = self.offsets_builder.finish();
         let null_bit_buffer = self.bitmap_builder.finish();
         self.offsets_builder.append(0).unwrap();
-        let data = ArrayData::builder(DataType::List(Box::new(values_data.data_type().clone())))
-            .len(len)
-            .null_count(len - bit_util::count_set_bits(null_bit_buffer.data()))
-            .add_buffer(offset_buffer)
-            .add_child_data(values_data)
-            .null_bit_buffer(null_bit_buffer)
-            .build();
+        let data =
+            ArrayData::builder(DataType::List(Box::new(values_data.data_type().clone())))
+                .len(len)
+                .null_count(len - bit_util::count_set_bits(null_bit_buffer.data()))
+                .add_buffer(offset_buffer)
+                .add_child_data(values_data)
+                .null_bit_buffer(null_bit_buffer)
+                .build();
         ListArray::from(data)
     }
 
@@ -477,7 +479,8 @@ impl ArrayBuilder for BinaryBuilder {
 }
 
 impl BinaryBuilder {
-    /// Creates a new `BinaryBuilder`, `capacity` is the number of bytes in the values array
+    /// Creates a new `BinaryBuilder`; `capacity` is the number of bytes in the values
+    /// array
     pub fn new(capacity: usize) -> Self {
         let values_builder = UInt8Builder::new(capacity);
         Self {
diff --git a/rust/arrow/src/csv/reader.rs b/rust/arrow/src/csv/reader.rs
index 718e8d526c46b..e16d7c8a631e0 100644
--- a/rust/arrow/src/csv/reader.rs
+++ b/rust/arrow/src/csv/reader.rs
@@ -17,8 +17,8 @@
 //! CSV Reader
 //!
-//! This CSV reader allows CSV files to be read into the Arrow memory model. Records are loaded in
-//! batches and are then converted from row-based data to columnar data.
+//! This CSV reader allows CSV files to be read into the Arrow memory model. Records are
+//! loaded in batches and are then converted from row-based data to columnar data.
 //!
 //! Example:
 //!
@@ -68,7 +68,8 @@ lazy_static! {
 
 /// Infer the data type of a record
 fn infer_field_schema(string: &str) -> DataType {
-    // when quoting is enabled in the reader, these quotes aren't escaped, we default to Utf8 for them
+    // when quoting is enabled in the reader, these quotes aren't escaped, so we default
+    // to Utf8 for them
     if string.starts_with("\"") {
         return DataType::Utf8;
     }
@@ -226,7 +227,9 @@ impl<R: Read> Reader<R> {
                     rows.push(r);
                 }
                 Some(Err(_)) => {
-                    return Err(ArrowError::ParseError("Error reading CSV file".to_string()));
+                    return Err(ArrowError::ParseError(
+                        "Error reading CSV file".to_string(),
+                    ));
                 }
                 None => break,
             }
@@ -254,17 +257,29 @@ impl<R: Read> Reader<R> {
             .map(|i| {
                 let field = self.schema.field(*i);
                 match field.data_type() {
-                    &DataType::Boolean => self.build_primitive_array::<BooleanType>(rows, i),
+                    &DataType::Boolean => {
+                        self.build_primitive_array::<BooleanType>(rows, i)
+                    }
                     &DataType::Int8 => self.build_primitive_array::<Int8Type>(rows, i),
                     &DataType::Int16 => self.build_primitive_array::<Int16Type>(rows, i),
                     &DataType::Int32 => self.build_primitive_array::<Int32Type>(rows, i),
                     &DataType::Int64 => self.build_primitive_array::<Int64Type>(rows, i),
                     &DataType::UInt8 => self.build_primitive_array::<UInt8Type>(rows, i),
-                    &DataType::UInt16 => self.build_primitive_array::<UInt16Type>(rows, i),
-                    &DataType::UInt32 => self.build_primitive_array::<UInt32Type>(rows, i),
-                    &DataType::UInt64 => self.build_primitive_array::<UInt64Type>(rows, i),
-                    &DataType::Float32 => self.build_primitive_array::<Float32Type>(rows, i),
-                    &DataType::Float64 => self.build_primitive_array::<Float64Type>(rows, i),
+                    &DataType::UInt16 => {
+                        self.build_primitive_array::<UInt16Type>(rows, i)
+                    }
+                    &DataType::UInt32 => {
+                        self.build_primitive_array::<UInt32Type>(rows, i)
+                    }
+                    &DataType::UInt64 => {
+                        self.build_primitive_array::<UInt64Type>(rows, i)
+                    }
+                    &DataType::Float32 => {
+                        self.build_primitive_array::<Float32Type>(rows, i)
+                    }
+                    &DataType::Float64 => {
+                        self.build_primitive_array::<Float64Type>(rows, i)
+                    }
                     &DataType::Utf8 => {
                         let mut builder = BinaryBuilder::new(rows.len());
                         for row_index in 0..rows.len() {
@@ -295,7 +310,8 @@ impl<R: Read> Reader<R> {
         col_idx: &usize,
     ) -> Result<ArrayRef> {
         let mut builder = PrimitiveBuilder::<T>::new(rows.len());
-        let is_boolean_type = *self.schema.field(*col_idx).data_type() == DataType::Boolean;
+        let is_boolean_type =
+            *self.schema.field(*col_idx).data_type() == DataType::Boolean;
         for row_index in 0..rows.len() {
             match rows[row_index].get(*col_idx) {
                 Some(s) if s.len() > 0 => {
diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs
index 5008a97624a40..36f73414f46da 100644
--- a/rust/arrow/src/datatypes.rs
+++ b/rust/arrow/src/datatypes.rs
@@ -17,8 +17,8 @@
 //! Defines the data-types of Arrow arrays.
 //!
-//! For an overview of the terminology used within the arrow project and more general information
-//! regarding data-types and memory layouts see
+//! For an overview of the terminology used within the arrow project and more general
+//! information regarding data-types and memory layouts see
 //! [here](https://arrow.apache.org/docs/memory_layout.html).
 
 use std::fmt;
@@ -33,8 +33,8 @@ use crate::error::{ArrowError, Result};
 
 /// The possible relative types that are supported.
 ///
-/// The variants of this enum include primitive fixed size types as well as parametric or nested
-/// types.
+/// The variants of this enum include primitive fixed size types as well as parametric or +/// nested types. /// Currently the Rust implementation supports the following nested types: /// - `List` /// - `Struct` @@ -202,16 +202,24 @@ impl DataType { }, Some(s) if s == "timestamp" => match map.get("unit") { Some(p) if p == "SECOND" => Ok(DataType::Timestamp(TimeUnit::Second)), - Some(p) if p == "MILLISECOND" => Ok(DataType::Timestamp(TimeUnit::Millisecond)), - Some(p) if p == "MICROSECOND" => Ok(DataType::Timestamp(TimeUnit::Microsecond)), - Some(p) if p == "NANOSECOND" => Ok(DataType::Timestamp(TimeUnit::Nanosecond)), + Some(p) if p == "MILLISECOND" => { + Ok(DataType::Timestamp(TimeUnit::Millisecond)) + } + Some(p) if p == "MICROSECOND" => { + Ok(DataType::Timestamp(TimeUnit::Microsecond)) + } + Some(p) if p == "NANOSECOND" => { + Ok(DataType::Timestamp(TimeUnit::Nanosecond)) + } _ => Err(ArrowError::ParseError( "timestamp unit missing or invalid".to_string(), )), }, Some(s) if s == "date" => match map.get("unit") { Some(p) if p == "DAY" => Ok(DataType::Date(DateUnit::Day)), - Some(p) if p == "MILLISECOND" => Ok(DataType::Date(DateUnit::Millisecond)), + Some(p) if p == "MILLISECOND" => { + Ok(DataType::Date(DateUnit::Millisecond)) + } _ => Err(ArrowError::ParseError( "date unit missing or invalid".to_string(), )), @@ -235,8 +243,12 @@ impl DataType { } } Some(s) if s == "interval" => match map.get("unit") { - Some(p) if p == "DAY_TIME" => Ok(DataType::Interval(IntervalUnit::DayTime)), - Some(p) if p == "YEAR_MONTH" => Ok(DataType::Interval(IntervalUnit::YearMonth)), + Some(p) if p == "DAY_TIME" => { + Ok(DataType::Interval(IntervalUnit::DayTime)) + } + Some(p) if p == "YEAR_MONTH" => { + Ok(DataType::Interval(IntervalUnit::YearMonth)) + } _ => Err(ArrowError::ParseError( "interval unit missing or invalid".to_string(), )), @@ -312,26 +324,31 @@ impl DataType { DataType::Float64 => json!({"name": "floatingpoint", "precision": "DOUBLE"}), DataType::Utf8 => json!({"name": "utf8"}), DataType::Struct(ref fields) => { - let field_json_array = - Value::Array(fields.iter().map(|f| f.to_json()).collect::>()); + let field_json_array = Value::Array( + fields.iter().map(|f| f.to_json()).collect::>(), + ); json!({ "fields": field_json_array }) } DataType::List(ref t) => { let child_json = t.to_json(); json!({ "name": "list", "children": child_json }) } - DataType::Time32(unit) => json!({"name": "time", "bitWidth": "32", "unit": match unit { - TimeUnit::Second => "SECOND", - TimeUnit::Millisecond => "MILLISECOND", - TimeUnit::Microsecond => "MICROSECOND", - TimeUnit::Nanosecond => "NANOSECOND", - }}), - DataType::Time64(unit) => json!({"name": "time", "bitWidth": "64", "unit": match unit { - TimeUnit::Second => "SECOND", - TimeUnit::Millisecond => "MILLISECOND", - TimeUnit::Microsecond => "MICROSECOND", - TimeUnit::Nanosecond => "NANOSECOND", - }}), + DataType::Time32(unit) => { + json!({"name": "time", "bitWidth": "32", "unit": match unit { + TimeUnit::Second => "SECOND", + TimeUnit::Millisecond => "MILLISECOND", + TimeUnit::Microsecond => "MICROSECOND", + TimeUnit::Nanosecond => "NANOSECOND", + }}) + } + DataType::Time64(unit) => { + json!({"name": "time", "bitWidth": "64", "unit": match unit { + TimeUnit::Second => "SECOND", + TimeUnit::Millisecond => "MILLISECOND", + TimeUnit::Microsecond => "MICROSECOND", + TimeUnit::Nanosecond => "NANOSECOND", + }}) + } DataType::Date(unit) => json!({"name": "date", "unit": match unit { DateUnit::Day => "DAY", DateUnit::Millisecond => "MILLISECOND", @@ -438,8 
+455,8 @@ impl fmt::Display for Field {
 
 /// Describes the meta-data of an ordered sequence of relative types.
 ///
-/// Note that this information is only part of the meta-data and not part of the physical memory
-/// layout.
+/// Note that this information is only part of the meta-data and not part of the physical
+/// memory layout.
 #[derive(Serialize, Deserialize, Debug, Clone)]
 pub struct Schema {
     pub(crate) fields: Vec<Field>,
@@ -472,8 +489,8 @@ impl Schema {
         &self.fields
     }
 
-    /// Returns an immutable reference of a specific `Field` instance selected using an offset
-    /// within the internal `fields` vector
+    /// Returns an immutable reference to a specific `Field` instance selected using an
+    /// offset within the internal `fields` vector
     pub fn field(&self, i: usize) -> &Field {
         &self.fields[i]
     }
diff --git a/rust/arrow/src/error.rs b/rust/arrow/src/error.rs
index 85f8ee3be6bf7..58204a362d731 100644
--- a/rust/arrow/src/error.rs
+++ b/rust/arrow/src/error.rs
@@ -43,21 +43,20 @@ impl From<csv_crate::Error> for ArrowError {
             csv_crate::ErrorKind::Io(error) => {
                 ArrowError::CsvError(error.description().to_string())
             }
-            csv_crate::ErrorKind::Utf8 {pos: _, err} => {
-                ArrowError::CsvError(format!("Encountered UTF-8 error while reading CSV file: {:?}", err.description()))
-            }
-            csv_crate::ErrorKind::UnequalLengths {pos: _, expected_len, len} => {
-                ArrowError::CsvError(
-                    format!(
-                        "Encountered unequal lengths between records on CSV file. Expected {} records, found {} records",
-                        len,
-                        expected_len
-                    )
-                )
-            }
-            _ => {
-                ArrowError::CsvError("Error reading CSV file".to_string())
-            }
+            csv_crate::ErrorKind::Utf8 { pos: _, err } => ArrowError::CsvError(format!(
+                "Encountered UTF-8 error while reading CSV file: {:?}",
+                err.description()
+            )),
+            csv_crate::ErrorKind::UnequalLengths {
+                pos: _,
+                expected_len,
+                len,
+            } => ArrowError::CsvError(format!(
+                "Encountered unequal lengths between records on CSV file. Expected {} \
+                 records, found {} records",
+                expected_len, len
+            )),
+            _ => ArrowError::CsvError("Error reading CSV file".to_string()),
         }
     }
 }
diff --git a/rust/arrow/src/lib.rs b/rust/arrow/src/lib.rs
index 0ecd97c9890f7..dbac4db115165 100644
--- a/rust/arrow/src/lib.rs
+++ b/rust/arrow/src/lib.rs
@@ -18,8 +18,8 @@
 //! A native Rust implementation of [Apache Arrow](https://arrow.apache.org), a cross-language
 //! development platform for in-memory data.
 //!
-//! Currently the project is developed and tested against nightly Rust. To learn more about the
-//! status of Arrow in Rust, see `README.md`.
+//! Currently the project is developed and tested against nightly Rust. To learn more
+//! about the status of Arrow in Rust, see `README.md`.
 
 #![feature(type_ascription)]
 #![feature(rustc_private)]
diff --git a/rust/arrow/src/memory.rs b/rust/arrow/src/memory.rs
index 2168d09803e5f..4e9ed98cc90bc 100644
--- a/rust/arrow/src/memory.rs
+++ b/rust/arrow/src/memory.rs
@@ -15,7 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Defines memory-related functions, currently mostly to make this library play nicely with C.
+//! Defines memory-related functions, currently mostly to make this library play nicely
+//! with C.
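The hunks that follow expose the crate-internal helpers behind the 64-byte alignment mentioned in the buffer docs above: `allocate_aligned`, `reallocate`, and `free_aligned`. A minimal sketch of how they fit together, assuming only the signatures visible in this diff (the functions are private to the crate, so this is illustrative rather than compilable outside it):

```
// Sketch only: mirrors the `arrow::memory` signatures shown in the hunks below.
fn aligned_round_trip() -> Result<()> {
    // Allocations are aligned to a 64-byte boundary.
    let ptr: *mut u8 = allocate_aligned(64)?;
    // Growing hands back a new pointer; `MutableBuffer::reserve` uses this path.
    let grown: *const u8 = reallocate(64, 128, ptr)?;
    // The final region must be released explicitly by its owner.
    free_aligned(grown);
    Ok(())
}
```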
use libc; use std::cmp; @@ -34,7 +35,8 @@ extern "C" { #[cfg(windows)] pub fn allocate_aligned(size: usize) -> Result<*mut u8> { - let page = unsafe { _aligned_malloc(size as libc::size_t, ALIGNMENT as libc::size_t) }; + let page = + unsafe { _aligned_malloc(size as libc::size_t, ALIGNMENT as libc::size_t) }; match page { 0 => Err(ArrowError::MemoryError( "Failed to allocate memory".to_string(), @@ -71,7 +73,11 @@ pub fn free_aligned(p: *const u8) { } } -pub fn reallocate(old_size: usize, new_size: usize, pointer: *const u8) -> Result<*const u8> { +pub fn reallocate( + old_size: usize, + new_size: usize, + pointer: *const u8, +) -> Result<*const u8> { unsafe { let old_src = mem::transmute::<*const u8, *mut libc::c_void>(pointer); let result = allocate_aligned(new_size)?; diff --git a/rust/arrow/src/record_batch.rs b/rust/arrow/src/record_batch.rs index 59eaf22b61911..a2bbd8b553ff0 100644 --- a/rust/arrow/src/record_batch.rs +++ b/rust/arrow/src/record_batch.rs @@ -17,9 +17,9 @@ //! According to the [Arrow Metadata Specification](https://arrow.apache.org/docs/metadata.html): //! -//! > A record batch is a collection of top-level named, equal length Arrow arrays (or vectors). If -//! > one of the arrays contains nested data, its child arrays are not required to be the same -//! > length as the top-level arrays. +//! > A record batch is a collection of top-level named, equal length Arrow arrays +//! > (or vectors). If one of the arrays contains nested data, its child arrays are not +//! > required to be the same length as the top-level arrays. use std::sync::Arc; @@ -101,7 +101,8 @@ mod tests { .build(); let b = BinaryArray::from(array_data); - let record_batch = RecordBatch::new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]); + let record_batch = + RecordBatch::new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]); assert_eq!(5, record_batch.num_rows()); assert_eq!(2, record_batch.num_columns()); diff --git a/rust/arrow/src/tensor.rs b/rust/arrow/src/tensor.rs index d633e3516e47d..66572286a6198 100644 --- a/rust/arrow/src/tensor.rs +++ b/rust/arrow/src/tensor.rs @@ -97,7 +97,9 @@ impl<'a, T: ArrowPrimitiveType> Tensor<'a, T> { Some(ref s) => { strides .iter() - .map(|i| assert_eq!(s.len(), i.len(), "shape and stride dimensions differ")) + .map(|i| { + assert_eq!(s.len(), i.len(), "shape and stride dimensions differ") + }) .next(); names .iter() @@ -360,7 +362,9 @@ mod tests { } #[test] - #[should_panic(expected = "number of dimensions and number of dimension names differ")] + #[should_panic( + expected = "number of dimensions and number of dimension names differ" + )] fn test_inconsistent_names() { let mut builder = Int32BufferBuilder::new(16); for i in 0..16 { diff --git a/rust/arrow/src/util/bit_util.rs b/rust/arrow/src/util/bit_util.rs index 953fea3359b53..4674783b092f7 100644 --- a/rust/arrow/src/util/bit_util.rs +++ b/rust/arrow/src/util/bit_util.rs @@ -20,14 +20,16 @@ static BIT_MASK: [u8; 8] = [1, 2, 4, 8, 16, 32, 64, 128]; static POPCOUNT_TABLE: [u8; 256] = [ - 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 
5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, + 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, + 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, + 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, + 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, + 6, 7, 7, 8, ]; /// Returns the nearest number that is `>=` than `num` and is a multiple of 64 diff --git a/rust/parquet/src/basic.rs b/rust/parquet/src/basic.rs index 8b1be49659bc0..e6fdb9708759c 100644 --- a/rust/parquet/src/basic.rs +++ b/rust/parquet/src/basic.rs @@ -181,8 +181,8 @@ pub enum Encoding { /// **Deprecated** dictionary encoding. /// /// The values in the dictionary are encoded using PLAIN encoding. - /// Since it is deprecated, RLE_DICTIONARY encoding is used for a data page, and PLAIN - /// encoding is used for dictionary page. + /// Since it is deprecated, RLE_DICTIONARY encoding is used for a data page, and + /// PLAIN encoding is used for dictionary page. PLAIN_DICTIONARY, /// Group packed run length encoding. @@ -287,9 +287,10 @@ impl ColumnOrder { pub fn get_sort_order(logical_type: LogicalType, physical_type: Type) -> SortOrder { match logical_type { // Unsigned byte-wise comparison. 
- LogicalType::UTF8 | LogicalType::JSON | LogicalType::BSON | LogicalType::ENUM => { - SortOrder::UNSIGNED - } + LogicalType::UTF8 + | LogicalType::JSON + | LogicalType::BSON + | LogicalType::ENUM => SortOrder::UNSIGNED, LogicalType::INT_8 | LogicalType::INT_16 @@ -479,8 +480,12 @@ impl convert::From for Option { LogicalType::DATE => Some(parquet::ConvertedType::DATE), LogicalType::TIME_MILLIS => Some(parquet::ConvertedType::TIME_MILLIS), LogicalType::TIME_MICROS => Some(parquet::ConvertedType::TIME_MICROS), - LogicalType::TIMESTAMP_MILLIS => Some(parquet::ConvertedType::TIMESTAMP_MILLIS), - LogicalType::TIMESTAMP_MICROS => Some(parquet::ConvertedType::TIMESTAMP_MICROS), + LogicalType::TIMESTAMP_MILLIS => { + Some(parquet::ConvertedType::TIMESTAMP_MILLIS) + } + LogicalType::TIMESTAMP_MICROS => { + Some(parquet::ConvertedType::TIMESTAMP_MICROS) + } LogicalType::UINT_8 => Some(parquet::ConvertedType::UINT_8), LogicalType::UINT_16 => Some(parquet::ConvertedType::UINT_16), LogicalType::UINT_32 => Some(parquet::ConvertedType::UINT_32), @@ -530,7 +535,9 @@ impl convert::From for Encoding { parquet::Encoding::RLE => Encoding::RLE, parquet::Encoding::BIT_PACKED => Encoding::BIT_PACKED, parquet::Encoding::DELTA_BINARY_PACKED => Encoding::DELTA_BINARY_PACKED, - parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY => Encoding::DELTA_LENGTH_BYTE_ARRAY, + parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY => { + Encoding::DELTA_LENGTH_BYTE_ARRAY + } parquet::Encoding::DELTA_BYTE_ARRAY => Encoding::DELTA_BYTE_ARRAY, parquet::Encoding::RLE_DICTIONARY => Encoding::RLE_DICTIONARY, } @@ -545,7 +552,9 @@ impl convert::From for parquet::Encoding { Encoding::RLE => parquet::Encoding::RLE, Encoding::BIT_PACKED => parquet::Encoding::BIT_PACKED, Encoding::DELTA_BINARY_PACKED => parquet::Encoding::DELTA_BINARY_PACKED, - Encoding::DELTA_LENGTH_BYTE_ARRAY => parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY, + Encoding::DELTA_LENGTH_BYTE_ARRAY => { + parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY + } Encoding::DELTA_BYTE_ARRAY => parquet::Encoding::DELTA_BYTE_ARRAY, Encoding::RLE_DICTIONARY => parquet::Encoding::RLE_DICTIONARY, } diff --git a/rust/parquet/src/column/reader.rs b/rust/parquet/src/column/reader.rs index d327c50879ea8..625dbd260eb46 100644 --- a/rust/parquet/src/column/reader.rs +++ b/rust/parquet/src/column/reader.rs @@ -53,27 +53,34 @@ pub fn get_column_reader( col_page_reader: Box, ) -> ColumnReader { match col_descr.physical_type() { - Type::BOOLEAN => { - ColumnReader::BoolColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) - } - Type::INT32 => { - ColumnReader::Int32ColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) - } - Type::INT64 => { - ColumnReader::Int64ColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) - } - Type::INT96 => { - ColumnReader::Int96ColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) - } - Type::FLOAT => { - ColumnReader::FloatColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) - } - Type::DOUBLE => { - ColumnReader::DoubleColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) - } - Type::BYTE_ARRAY => { - ColumnReader::ByteArrayColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) - } + Type::BOOLEAN => ColumnReader::BoolColumnReader(ColumnReaderImpl::new( + col_descr, + col_page_reader, + )), + Type::INT32 => ColumnReader::Int32ColumnReader(ColumnReaderImpl::new( + col_descr, + col_page_reader, + )), + Type::INT64 => ColumnReader::Int64ColumnReader(ColumnReaderImpl::new( + col_descr, + col_page_reader, + )), + Type::INT96 => 
ColumnReader::Int96ColumnReader(ColumnReaderImpl::new(
+            col_descr,
+            col_page_reader,
+        )),
+        Type::FLOAT => ColumnReader::FloatColumnReader(ColumnReaderImpl::new(
+            col_descr,
+            col_page_reader,
+        )),
+        Type::DOUBLE => ColumnReader::DoubleColumnReader(ColumnReaderImpl::new(
+            col_descr,
+            col_page_reader,
+        )),
+        Type::BYTE_ARRAY => ColumnReader::ByteArrayColumnReader(ColumnReaderImpl::new(
+            col_descr,
+            col_page_reader,
+        )),
         Type::FIXED_LEN_BYTE_ARRAY => ColumnReader::FixedLenByteArrayColumnReader(
             ColumnReaderImpl::new(col_descr, col_page_reader),
         ),
@@ -85,7 +92,9 @@ pub fn get_column_reader(
 ///
 /// NOTE: the caller MUST guarantee that the actual enum value for `col_reader` matches
 /// the type `T`. Otherwise, disastrous consequence could happen.
-pub fn get_typed_column_reader<T: DataType>(col_reader: ColumnReader) -> ColumnReaderImpl<T> {
+pub fn get_typed_column_reader<T: DataType>(
+    col_reader: ColumnReader,
+) -> ColumnReaderImpl<T> {
     match col_reader {
         ColumnReader::BoolColumnReader(r) => unsafe { mem::transmute(r) },
         ColumnReader::Int32ColumnReader(r) => unsafe { mem::transmute(r) },
@@ -135,19 +144,20 @@ impl<T: DataType> ColumnReaderImpl<T> {
     /// Reads a batch of values of at most `batch_size`.
     ///
     /// This will try to read from the row group, and fills up at most `batch_size` values
-    /// for `def_levels`, `rep_levels` and `values`. It will stop either when the row group
-    /// is depleted or `batch_size` values has been read, or there is no space in the input
-    /// slices (values/definition levels/repetition levels).
+    /// for `def_levels`, `rep_levels` and `values`. It will stop either when the row
+    /// group is depleted or `batch_size` values have been read, or there is no space
+    /// in the input slices (values/definition levels/repetition levels).
     ///
-    /// Note that in case the field being read is not required, `values` could contain less
-    /// values than `def_levels`. Also note that this will skip reading def / rep levels if
-    /// the field is required / not repeated, respectively.
+    /// Note that in case the field being read is not required, `values` could contain
+    /// fewer values than `def_levels`. Also note that this will skip reading def / rep
+    /// levels if the field is required / not repeated, respectively.
     ///
     /// If `def_levels` or `rep_levels` is `None`, this will also skip reading the
-    /// respective levels. This is useful when the caller of this function knows in advance
-    /// that the field is required and non-repeated, therefore can avoid allocating memory
-    /// for the levels data. Note that if field has definition levels, but caller provides
-    /// None, there might be inconsistency between levels/values (see comments below).
+    /// respective levels. This is useful when the caller of this function knows in
+    /// advance that the field is required and non-repeated, and therefore can avoid
+    /// allocating memory for the levels data. Note that if the field has definition
+    /// levels, but the caller provides None, there might be inconsistency between
+    /// levels/values (see comments below).
     ///
     /// Returns a tuple where the first element is the actual number of values read,
     /// and the second element is the actual number of levels read.
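The `read_batch` contract described above is easier to see in use. Below is a hedged sketch of the call pattern, pieced together from `get_column_reader`/`get_typed_column_reader` in the hunks above and from the doc comment's description of the arguments and return value; the `Int32Type` column, the in-scope `desc` and `page_reader` values, and the exact argument order are assumptions, not verified against the crate:

```
// Sketch: obtain a typed reader for an INT32 column and drain one batch.
let column_reader: ColumnReader = get_column_reader(desc, Box::new(page_reader));
let mut typed_reader = get_typed_column_reader::<Int32Type>(column_reader);

let mut values = vec![0i32; 1024];
let mut def_levels = vec![0i16; 1024];
// `None` skips repetition levels for a non-repeated field; the return value is
// (values read, levels read), and values read can be smaller when nulls occur.
let (values_read, levels_read) =
    typed_reader.read_batch(1024, Some(&mut def_levels), None, &mut values)?;
```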
@@ -186,8 +196,8 @@ impl ColumnReaderImpl { (self.num_buffered_values - self.num_decoded_values) as usize, ); - // Adjust batch size by taking into account how much space is left in values - // slice or levels slices (if available) + // Adjust batch size by taking into account how much space is left in + // values slice or levels slices (if available) adjusted_size = min(adjusted_size, values.len() - values_read); if let Some(ref levels) = def_levels { adjusted_size = min(adjusted_size, levels.len() - levels_read); @@ -206,8 +216,9 @@ impl ColumnReaderImpl { // If the field is required and non-repeated, there are no definition levels if self.descr.max_def_level() > 0 && def_levels.as_ref().is_some() { if let Some(ref mut levels) = def_levels { - num_def_levels = self - .read_def_levels(&mut levels[levels_read..levels_read + iter_batch_size])?; + num_def_levels = self.read_def_levels( + &mut levels[levels_read..levels_read + iter_batch_size], + )?; for i in levels_read..levels_read + num_def_levels { if levels[i] == self.descr.max_def_level() { values_to_read += 1; @@ -215,17 +226,20 @@ impl ColumnReaderImpl { } } } else { - // If max definition level == 0, then it is REQUIRED field, read all values. - // If definition levels are not provided, we still read all values. + // If max definition level == 0, then it is REQUIRED field, read all + // values. If definition levels are not provided, we still + // read all values. values_to_read = iter_batch_size; } if self.descr.max_rep_level() > 0 && rep_levels.is_some() { if let Some(ref mut levels) = rep_levels { - num_rep_levels = self - .read_rep_levels(&mut levels[levels_read..levels_read + iter_batch_size])?; + num_rep_levels = self.read_rep_levels( + &mut levels[levels_read..levels_read + iter_batch_size], + )?; - // If definition levels are defined, check that rep levels == def levels + // If definition levels are defined, check that rep levels == def + // levels if def_levels.is_some() { assert_eq!( num_def_levels, num_rep_levels, @@ -293,8 +307,10 @@ impl ColumnReaderImpl { rep_level_encoding, self.descr.max_rep_level(), ); - let total_bytes = rep_decoder - .set_data(self.num_buffered_values as usize, buffer_ptr.all()); + let total_bytes = rep_decoder.set_data( + self.num_buffered_values as usize, + buffer_ptr.all(), + ); buffer_ptr = buffer_ptr.start_from(total_bytes); self.rep_level_decoder = Some(rep_decoder); } @@ -304,13 +320,16 @@ impl ColumnReaderImpl { def_level_encoding, self.descr.max_def_level(), ); - let total_bytes = def_decoder - .set_data(self.num_buffered_values as usize, buffer_ptr.all()); + let total_bytes = def_decoder.set_data( + self.num_buffered_values as usize, + buffer_ptr.all(), + ); buffer_ptr = buffer_ptr.start_from(total_bytes); self.def_level_decoder = Some(def_decoder); } - // Data page v1 does not have offset, all content of buffer should be passed + // Data page v1 does not have offset, all content of buffer + // should be passed self.set_current_page_encoding( encoding, &buffer_ptr, @@ -336,9 +355,11 @@ impl ColumnReaderImpl { let mut offset = 0; - // DataPage v2 only supports RLE encoding for repetition levels + // DataPage v2 only supports RLE encoding for repetition + // levels if self.descr.max_rep_level() > 0 { - let mut rep_decoder = LevelDecoder::v2(self.descr.max_rep_level()); + let mut rep_decoder = + LevelDecoder::v2(self.descr.max_rep_level()); let bytes_read = rep_decoder.set_data_range( self.num_buffered_values as usize, &buf, @@ -349,9 +370,11 @@ impl ColumnReaderImpl { self.rep_level_decoder 
= Some(rep_decoder); } - // DataPage v2 only supports RLE encoding for definition levels + // DataPage v2 only supports RLE encoding for definition + // levels if self.descr.max_def_level() > 0 { - let mut def_decoder = LevelDecoder::v2(self.descr.max_def_level()); + let mut def_decoder = + LevelDecoder::v2(self.descr.max_def_level()); let bytes_read = def_decoder.set_data_range( self.num_buffered_values as usize, &buf, @@ -411,7 +434,9 @@ impl ColumnReaderImpl { #[inline] fn has_next(&mut self) -> Result { - if self.num_buffered_values == 0 || self.num_buffered_values == self.num_decoded_values { + if self.num_buffered_values == 0 + || self.num_buffered_values == self.num_decoded_values + { // TODO: should we return false if read_new_page() = true and // num_buffered_values = 0? if !self.read_new_page()? { @@ -1002,8 +1027,8 @@ mod tests { // // # Page assembly // - // Page construction and generation of values, definition and repetition levels happens - // in `make_pages` function. + // Page construction and generation of values, definition and repetition levels + // happens in `make_pages` function. // All values are randomly generated based on provided min/max, levels are calculated // based on provided max level for column descriptor (which is basically either int32 // or int64 type in tests) and `levels_per_page` variable. @@ -1260,7 +1285,8 @@ mod tests { ); let max_def_level = desc.max_def_level(); let page_reader = TestPageReader::new(Vec::from(pages)); - let column_reader: ColumnReader = get_column_reader(desc, Box::new(page_reader)); + let column_reader: ColumnReader = + get_column_reader(desc, Box::new(page_reader)); let mut typed_column_reader = get_typed_column_reader::(column_reader); let mut curr_values_read = 0; @@ -1408,15 +1434,16 @@ mod tests { // Adds levels to the buffer and return number of encoded bytes fn add_levels(&mut self, max_level: i16, levels: &[i16]) -> u32 { let size = max_buffer_size(Encoding::RLE, max_level, levels.len()); - let mut level_encoder = LevelEncoder::v1(Encoding::RLE, max_level, vec![0; size]); + let mut level_encoder = + LevelEncoder::v1(Encoding::RLE, max_level, vec![0; size]); level_encoder.put(levels).expect("put() should be OK"); let encoded_levels = level_encoder.consume().expect("consume() should be OK"); // Actual encoded bytes (without length offset) let encoded_bytes = &encoded_levels[mem::size_of::()..]; if self.datapage_v2 { - // Level encoder always initializes with offset of i32, where it stores length of - // encoded data; for data page v2 we explicitly store length, therefore we should - // skip i32 bytes. + // Level encoder always initializes with offset of i32, where it stores + // length of encoded data; for data page v2 we explicitly + // store length, therefore we should skip i32 bytes. 
self.buffer.extend_from_slice(encoded_bytes); } else { self.buffer.extend_from_slice(encoded_levels.as_slice()); @@ -1469,8 +1496,10 @@ mod tests { buf: ByteBufferPtr::new(self.buffer), num_values: self.num_values, encoding: self.encoding.unwrap(), - num_nulls: 0, // set to dummy value - don't need this when reading data page - num_rows: self.num_values, // also don't need this when reading data page + num_nulls: 0, /* set to dummy value - don't need this when reading + * data page */ + num_rows: self.num_values, /* also don't need this when reading + * data page */ def_levels_byte_len: self.def_levels_byte_len, rep_levels_byte_len: self.rep_levels_byte_len, is_compressed: false, @@ -1532,7 +1561,11 @@ mod tests { // Generate the current page - let mut pb = DataPageBuilderImpl::new(desc.clone(), num_values_cur_page as u32, use_v2); + let mut pb = DataPageBuilderImpl::new( + desc.clone(), + num_values_cur_page as u32, + use_v2, + ); if max_rep_level > 0 { pb.add_rep_levels(max_rep_level, &rep_levels[level_range.clone()]); } @@ -1560,7 +1593,8 @@ mod tests { num_values += num_values_cur_page; } - if encoding == Encoding::PLAIN_DICTIONARY || encoding == Encoding::RLE_DICTIONARY { + if encoding == Encoding::PLAIN_DICTIONARY || encoding == Encoding::RLE_DICTIONARY + { let dict = dict_encoder .write_dict() .expect("write_dict() should be OK"); diff --git a/rust/parquet/src/column/writer.rs b/rust/parquet/src/column/writer.rs index 26bd7c5aac778..b520997e46ffb 100644 --- a/rust/parquet/src/column/writer.rs +++ b/rust/parquet/src/column/writer.rs @@ -54,27 +54,41 @@ pub fn get_column_writer( page_writer: Box, ) -> ColumnWriter { match descr.physical_type() { - Type::BOOLEAN => { - ColumnWriter::BoolColumnWriter(ColumnWriterImpl::new(descr, props, page_writer)) - } - Type::INT32 => { - ColumnWriter::Int32ColumnWriter(ColumnWriterImpl::new(descr, props, page_writer)) - } - Type::INT64 => { - ColumnWriter::Int64ColumnWriter(ColumnWriterImpl::new(descr, props, page_writer)) - } - Type::INT96 => { - ColumnWriter::Int96ColumnWriter(ColumnWriterImpl::new(descr, props, page_writer)) - } - Type::FLOAT => { - ColumnWriter::FloatColumnWriter(ColumnWriterImpl::new(descr, props, page_writer)) - } - Type::DOUBLE => { - ColumnWriter::DoubleColumnWriter(ColumnWriterImpl::new(descr, props, page_writer)) - } - Type::BYTE_ARRAY => { - ColumnWriter::ByteArrayColumnWriter(ColumnWriterImpl::new(descr, props, page_writer)) - } + Type::BOOLEAN => ColumnWriter::BoolColumnWriter(ColumnWriterImpl::new( + descr, + props, + page_writer, + )), + Type::INT32 => ColumnWriter::Int32ColumnWriter(ColumnWriterImpl::new( + descr, + props, + page_writer, + )), + Type::INT64 => ColumnWriter::Int64ColumnWriter(ColumnWriterImpl::new( + descr, + props, + page_writer, + )), + Type::INT96 => ColumnWriter::Int96ColumnWriter(ColumnWriterImpl::new( + descr, + props, + page_writer, + )), + Type::FLOAT => ColumnWriter::FloatColumnWriter(ColumnWriterImpl::new( + descr, + props, + page_writer, + )), + Type::DOUBLE => ColumnWriter::DoubleColumnWriter(ColumnWriterImpl::new( + descr, + props, + page_writer, + )), + Type::BYTE_ARRAY => ColumnWriter::ByteArrayColumnWriter(ColumnWriterImpl::new( + descr, + props, + page_writer, + )), Type::FIXED_LEN_BYTE_ARRAY => ColumnWriter::FixedLenByteArrayColumnWriter( ColumnWriterImpl::new(descr, props, page_writer), ), @@ -86,7 +100,9 @@ pub fn get_column_writer( /// /// NOTE: the caller MUST guarantee that the actual enum value for `col_writer` matches /// the type `T`. 
Otherwise, disastrous consequence could happen.
-pub fn get_typed_column_writer<T: DataType>(col_writer: ColumnWriter) -> ColumnWriterImpl<T> {
+pub fn get_typed_column_writer<T: DataType>(
+    col_writer: ColumnWriter,
+) -> ColumnWriterImpl<T> {
     match col_writer {
         ColumnWriter::BoolColumnWriter(r) => unsafe { mem::transmute(r) },
         ColumnWriter::Int32ColumnWriter(r) => unsafe { mem::transmute(r) },
@@ -138,12 +154,13 @@ impl<T: DataType> ColumnWriterImpl<T> {
         let compressor = create_codec(codec).unwrap();
 
         // Optionally set dictionary encoder.
-        let dict_encoder =
-            if props.dictionary_enabled(descr.path()) && Self::has_dictionary_support(&props) {
-                Some(DictEncoder::new(descr.clone(), Rc::new(MemTracker::new())))
-            } else {
-                None
-            };
+        let dict_encoder = if props.dictionary_enabled(descr.path())
+            && Self::has_dictionary_support(&props)
+        {
+            Some(DictEncoder::new(descr.clone(), Rc::new(MemTracker::new())))
+        } else {
+            None
+        };
 
         // Whether or not this column writer has a dictionary encoding.
         let has_dictionary = dict_encoder.is_some();
@@ -204,11 +221,12 @@ impl<T: DataType> ColumnWriterImpl<T> {
         // We check for DataPage limits only after we have inserted the values. If a user
         // writes a large number of values, the DataPage size can be well above the limit.
         //
-        // The purpose of this chunking is to bound this. Even if a user writes large number
-        // of values, the chunking will ensure that we add data page at a reasonable pagesize
-        // limit.
+        // The purpose of this chunking is to bound this. Even if a user writes a large
+        // number of values, the chunking will ensure that we add a data page at a
+        // reasonable page size limit.
 
-        // TODO: find out why we don't account for size of levels when we estimate page size.
+        // TODO: find out why we don't account for size of levels when we estimate page
+        // size.
 
         // Find out the minimal length to prevent index out of bound errors.
         let mut min_len = values.len();
@@ -283,7 +301,8 @@ impl<T: DataType> ColumnWriterImpl<T> {
         let num_values;
         let mut values_to_write = 0;
 
-        // Check if number of definition levels is the same as number of repetition levels.
+        // Check if number of definition levels is the same as number of repetition
+        // levels.
         if def_levels.is_some() && rep_levels.is_some() {
             let def = def_levels.unwrap();
             let rep = rep_levels.unwrap();
@@ -485,13 +504,15 @@ impl<T: DataType> ColumnWriterImpl<T> {
         let mut buffer = vec![];
 
         if max_rep_level > 0 {
-            let levels = self.encode_levels_v2(&self.rep_levels_sink[..], max_rep_level)?;
+            let levels =
+                self.encode_levels_v2(&self.rep_levels_sink[..], max_rep_level)?;
             rep_levels_byte_len = levels.len();
             buffer.extend_from_slice(&levels[..]);
         }
 
         if max_def_level > 0 {
-            let levels = self.encode_levels_v2(&self.def_levels_sink[..], max_def_level)?;
+            let levels =
+                self.encode_levels_v2(&self.def_levels_sink[..], max_def_level)?;
             def_levels_byte_len = levels.len();
             buffer.extend_from_slice(&levels[..]);
         }
@@ -502,7 +523,8 @@ impl<T: DataType> ColumnWriterImpl<T> {
         // Data Page v2 compresses values only.
match self.compressor { Some(ref mut cmpr) => { - let mut compressed_buf = Vec::with_capacity(value_bytes.data().len()); + let mut compressed_buf = + Vec::with_capacity(value_bytes.data().len()); cmpr.compress(value_bytes.data(), &mut compressed_buf)?; buffer.extend_from_slice(&compressed_buf[..]); } @@ -515,7 +537,8 @@ impl ColumnWriterImpl { buf: ByteBufferPtr::new(buffer), num_values: self.num_buffered_values, encoding, - num_nulls: self.num_buffered_values - self.num_buffered_encoded_values, + num_nulls: self.num_buffered_values + - self.num_buffered_encoded_values, num_rows: self.num_buffered_rows, def_levels_byte_len: def_levels_byte_len as u32, rep_levels_byte_len: rep_levels_byte_len as u32, @@ -807,7 +830,8 @@ mod tests { reader::{get_column_reader, get_typed_column_reader, ColumnReaderImpl}, }; use crate::file::{ - properties::WriterProperties, reader::SerializedPageReader, writer::SerializedPageWriter, + properties::WriterProperties, reader::SerializedPageReader, + writer::SerializedPageWriter, }; use crate::schema::types::{ColumnDescriptor, ColumnPath, Type as SchemaType}; use crate::util::{ @@ -919,7 +943,8 @@ mod tests { .unwrap(); let (bytes_written, rows_written, metadata) = writer.close().unwrap(); - // PlainEncoder uses bit writer to write boolean values, which all fit into 1 byte. + // PlainEncoder uses bit writer to write boolean values, which all fit into 1 + // byte. assert_eq!(bytes_written, 1); assert_eq!(rows_written, 4); assert_eq!(metadata.encodings(), &vec![Encoding::PLAIN, Encoding::RLE]); @@ -1428,8 +1453,12 @@ mod tests { max_batch_size = cmp::max(max_batch_size, levels.len()); } - let mut writer = - get_test_column_writer::(page_writer, max_def_level, max_rep_level, Rc::new(props)); + let mut writer = get_test_column_writer::( + page_writer, + max_def_level, + max_rep_level, + Rc::new(props), + ); let values_written = writer.write_batch(values, def_levels, rep_levels).unwrap(); assert_eq!(values_written, values.len()); @@ -1445,7 +1474,8 @@ mod tests { ) .unwrap(), ); - let reader = get_test_column_reader::(page_reader, max_def_level, max_rep_level); + let reader = + get_test_column_reader::(page_reader, max_def_level, max_rep_level); let mut actual_values = vec![T::T::default(); max_batch_size]; let mut actual_def_levels = match def_levels { diff --git a/rust/parquet/src/compression.rs b/rust/parquet/src/compression.rs index 3644ffcc54272..9cf2ac263dbd9 100644 --- a/rust/parquet/src/compression.rs +++ b/rust/parquet/src/compression.rs @@ -55,13 +55,14 @@ use crate::errors::{ParquetError, Result}; pub trait Codec { /// Compresses data stored in slice `input_buf` and writes the compressed result /// to `output_buf`. - /// Note that you'll need to call `clear()` before reusing the same `output_buf` across - /// different `compress` calls. + /// Note that you'll need to call `clear()` before reusing the same `output_buf` + /// across different `compress` calls. fn compress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result<()>; /// Decompresses data stored in slice `input_buf` and writes output to `output_buf`. /// Returns the total number of bytes written. 
- fn decompress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result; + fn decompress(&mut self, input_buf: &[u8], output_buf: &mut Vec) + -> Result; } /// Given the compression type `codec`, returns a codec used to compress and decompress @@ -96,7 +97,11 @@ impl SnappyCodec { } impl Codec for SnappyCodec { - fn decompress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result { + fn decompress( + &mut self, + input_buf: &[u8], + output_buf: &mut Vec, + ) -> Result { let len = decompress_len(input_buf)?; output_buf.resize(len, 0); self.decoder @@ -126,7 +131,11 @@ impl GZipCodec { } impl Codec for GZipCodec { - fn decompress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result { + fn decompress( + &mut self, + input_buf: &[u8], + output_buf: &mut Vec, + ) -> Result { let mut decoder = read::GzDecoder::new(input_buf); decoder.read_to_end(output_buf).map_err(|e| e.into()) } @@ -153,7 +162,11 @@ impl BrotliCodec { } impl Codec for BrotliCodec { - fn decompress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result { + fn decompress( + &mut self, + input_buf: &[u8], + output_buf: &mut Vec, + ) -> Result { brotli::Decompressor::new(input_buf, BROTLI_DEFAULT_BUFFER_SIZE) .read_to_end(output_buf) .map_err(|e| e.into()) @@ -184,7 +197,11 @@ impl LZ4Codec { } impl Codec for LZ4Codec { - fn decompress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result { + fn decompress( + &mut self, + input_buf: &[u8], + output_buf: &mut Vec, + ) -> Result { let mut decoder = lz4::Decoder::new(input_buf)?; let mut buffer: [u8; LZ4_BUFFER_SIZE] = [0; LZ4_BUFFER_SIZE]; let mut total_len = 0; @@ -228,7 +245,11 @@ impl ZSTDCodec { const ZSTD_COMPRESSION_LEVEL: i32 = 1; impl Codec for ZSTDCodec { - fn decompress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result { + fn decompress( + &mut self, + input_buf: &[u8], + output_buf: &mut Vec, + ) -> Result { let mut decoder = zstd::Decoder::new(input_buf)?; match io::copy(&mut decoder, output_buf) { Ok(n) => Ok(n as usize), diff --git a/rust/parquet/src/data_type.rs b/rust/parquet/src/data_type.rs index bfe0889cf71c4..fedd0b765c2df 100644 --- a/rust/parquet/src/data_type.rs +++ b/rust/parquet/src/data_type.rs @@ -284,7 +284,9 @@ gen_as_bytes!(f64); impl AsBytes for Int96 { fn as_bytes(&self) -> &[u8] { - unsafe { ::std::slice::from_raw_parts(self.data() as *const [u32] as *const u8, 12) } + unsafe { + ::std::slice::from_raw_parts(self.data() as *const [u32] as *const u8, 12) + } } } diff --git a/rust/parquet/src/encodings/decoding.rs b/rust/parquet/src/encodings/decoding.rs index f0e93fe1abea7..e02aed6af8f25 100644 --- a/rust/parquet/src/encodings/decoding.rs +++ b/rust/parquet/src/encodings/decoding.rs @@ -44,8 +44,9 @@ pub trait Decoder { /// Consumes values from this decoder and write the results to `buffer`. This will try /// to fill up `buffer`. /// - /// Returns the actual number of values decoded, which should be equal to `buffer.len()` - /// unless the remaining number of values is less than `buffer.len()`. + /// Returns the actual number of values decoded, which should be equal to + /// `buffer.len()` unless the remaining number of values is less than + /// `buffer.len()`. fn get(&mut self, buffer: &mut [T::T]) -> Result; /// Returns the number of values left in this decoder stream. @@ -102,7 +103,8 @@ pub struct PlainDecoder { // Read `data` bit by bit. Only set if `T` is bool. bit_reader: Option, - // To allow `T` in the generic parameter for this struct. This doesn't take any space. 
+ // To allow `T` in the generic parameter for this struct. This doesn't take any + // space. _phantom: PhantomData, } @@ -215,7 +217,8 @@ impl Decoder for PlainDecoder { let data = self.data.as_mut().unwrap(); let num_values = cmp::min(buffer.len(), self.num_values); for i in 0..num_values { - let len: usize = read_num_bytes!(u32, 4, data.start_from(self.start).as_ref()) as usize; + let len: usize = + read_num_bytes!(u32, 4, data.start_from(self.start).as_ref()) as usize; self.start += mem::size_of::(); if data.len() < self.start + len { return Err(eof_err!("Not enough bytes to decode")); @@ -344,7 +347,11 @@ impl RleValueDecoder { } #[inline] - fn set_data_internal(&mut self, data: ByteBufferPtr, num_values: usize) -> Result<()> { + fn set_data_internal( + &mut self, + data: ByteBufferPtr, + num_values: usize, + ) -> Result<()> { // We still need to remove prefix of i32 from the stream. let i32_size = mem::size_of::(); let data_size = read_num_bytes!(i32, i32_size, data.as_ref()) as usize; @@ -360,7 +367,11 @@ impl RleValueDecoder { impl Decoder for RleValueDecoder { #[inline] - default fn set_data(&mut self, _data: ByteBufferPtr, _num_values: usize) -> Result<()> { + default fn set_data( + &mut self, + _data: ByteBufferPtr, + _num_values: usize, + ) -> Result<()> { panic!("RleValueDecoder only supports BoolType"); } @@ -521,10 +532,11 @@ impl Decoder for DeltaBitPackDecoder { .bit_reader .get_vlq_int() .ok_or(eof_err!("Not enough data to decode 'num_mini_blocks'"))?; - self.num_values = - self.bit_reader - .get_vlq_int() - .ok_or(eof_err!("Not enough data to decode 'num_values'"))? as usize; + self.num_values = self + .bit_reader + .get_vlq_int() + .ok_or(eof_err!("Not enough data to decode 'num_values'"))? + as usize; self.first_value = self .bit_reader .get_zigzag_vlq_int() @@ -557,7 +569,8 @@ impl Decoder for DeltaBitPackDecoder { if self.values_current_mini_block == 0 { self.mini_block_idx += 1; if self.mini_block_idx < self.delta_bit_widths.size() { - self.delta_bit_width = self.delta_bit_widths.data()[self.mini_block_idx]; + self.delta_bit_width = + self.delta_bit_widths.data()[self.mini_block_idx]; self.values_current_mini_block = self.values_per_mini_block; } else { self.init_block()?; @@ -565,9 +578,11 @@ impl Decoder for DeltaBitPackDecoder { self.load_deltas_in_mini_block()?; } - // we decrement values in current mini block, so we need to invert index for delta - let delta = - self.get_delta(self.deltas_in_mini_block.len() - self.values_current_mini_block); + // we decrement values in current mini block, so we need to invert index for + // delta + let delta = self.get_delta( + self.deltas_in_mini_block.len() - self.values_current_mini_block, + ); // It is OK for deltas to contain "overflowed" values after encoding, // e.g. i64::MAX - i64::MIN, so we use `wrapping_add` to "overflow" again and // restore original value. 
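// A self-contained sketch of the arithmetic described above; the helper names
// are illustrative, not the decoder's API. Zigzag VLQ maps signed ints onto
// unsigned ones, and deltas that overflowed during encoding are restored with
// `wrapping_add`:
fn zigzag_decode(u: u64) -> i64 {
    // 0, 1, 2, 3, ... decodes to 0, -1, 1, -2, ...
    ((u >> 1) as i64) ^ -((u & 1) as i64)
}

fn restore_values(first_value: i64, deltas: &[i64]) -> Vec<i64> {
    let mut out = Vec::with_capacity(deltas.len() + 1);
    let mut current = first_value;
    out.push(current);
    for &delta in deltas {
        // "overflow again" so deltas encoded as e.g. i64::MAX - i64::MIN
        // round-trip back to the original value
        current = current.wrapping_add(delta);
        out.push(current);
    }
    out
}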
@@ -808,7 +823,8 @@ impl Decoder for DeltaByteArrayDecoder { prefix_len_decoder.get(&mut self.prefix_lengths[..])?; let mut suffix_decoder = DeltaLengthByteArrayDecoder::new(); - suffix_decoder.set_data(data.start_from(prefix_len_decoder.get_offset()), num_values)?; + suffix_decoder + .set_data(data.start_from(prefix_len_decoder.get_offset()), num_values)?; self.suffix_decoder = Some(suffix_decoder); self.num_values = num_prefixes; self.current_idx = 0; @@ -849,12 +865,14 @@ impl Decoder for DeltaByteArrayDecoder { impl Decoder for DeltaByteArrayDecoder { fn set_data(&mut self, data: ByteBufferPtr, num_values: usize) -> Result<()> { - let s: &mut DeltaByteArrayDecoder = unsafe { mem::transmute(self) }; + let s: &mut DeltaByteArrayDecoder = + unsafe { mem::transmute(self) }; s.set_data(data, num_values) } fn get(&mut self, buffer: &mut [ByteArray]) -> Result { - let s: &mut DeltaByteArrayDecoder = unsafe { mem::transmute(self) }; + let s: &mut DeltaByteArrayDecoder = + unsafe { mem::transmute(self) }; s.get(buffer) } } @@ -865,8 +883,12 @@ mod tests { use std::{mem, rc::Rc}; - use crate::schema::types::{ColumnDescPtr, ColumnDescriptor, ColumnPath, Type as SchemaType}; - use crate::util::{bit_util::set_array_bit, memory::MemTracker, test_common::RandGen}; + use crate::schema::types::{ + ColumnDescPtr, ColumnDescriptor, ColumnPath, Type as SchemaType, + }; + use crate::util::{ + bit_util::set_array_bit, memory::MemTracker, test_common::RandGen, + }; #[test] fn test_get_decoders() { @@ -1074,8 +1096,8 @@ mod tests { #[test] fn test_delta_bit_packed_int32_repeat() { let block_data = vec![ - 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, - 6, 7, 8, + 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, + 3, 4, 5, 6, 7, 8, ]; test_delta_bit_packed_decode::(vec![block_data]); } @@ -1089,13 +1111,14 @@ mod tests { #[test] fn test_delta_bit_packed_int32_same_values() { let block_data = vec![ - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, ]; test_delta_bit_packed_decode::(vec![block_data]); let block_data = vec![ - -127, -127, -127, -127, -127, -127, -127, -127, -127, -127, -127, -127, -127, -127, - -127, -127, + -127, -127, -127, -127, -127, -127, -127, -127, -127, -127, -127, -127, -127, + -127, -127, -127, ]; test_delta_bit_packed_decode::(vec![block_data]); } @@ -1178,8 +1201,8 @@ mod tests { #[test] fn test_delta_bit_packed_decoder_sample() { let data_bytes = vec![ - 128, 1, 4, 3, 58, 28, 6, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, + 128, 1, 4, 3, 58, 28, 6, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]; let buffer = ByteBufferPtr::new(data_bytes); let mut decoder: DeltaBitPackDecoder = DeltaBitPackDecoder::new(); @@ -1266,8 +1289,9 @@ mod tests { let col_descr = create_test_col_desc_ptr(-1, T::get_physical_type()); // Encode data - let mut encoder = get_encoder::(col_descr.clone(), encoding, Rc::new(MemTracker::new())) - .expect("get encoder"); + let mut encoder = + get_encoder::(col_descr.clone(), encoding, Rc::new(MemTracker::new())) + .expect("get encoder"); for v in &data[..] 
{ encoder.put(&v[..]).expect("ok to encode"); @@ -1278,7 +1302,8 @@ mod tests { let expected: Vec = data.iter().flat_map(|s| s.clone()).collect(); // Decode data and compare with original - let mut decoder = get_decoder::(col_descr.clone(), encoding).expect("get decoder"); + let mut decoder = + get_decoder::(col_descr.clone(), encoding).expect("get decoder"); let mut result = vec![T::T::default(); expected.len()]; decoder @@ -1294,7 +1319,10 @@ mod tests { assert_eq!(result, expected); } - fn create_and_check_decoder(encoding: Encoding, err: Option) { + fn create_and_check_decoder( + encoding: Encoding, + err: Option, + ) { let descr = create_test_col_desc_ptr(-1, T::get_physical_type()); let decoder = get_decoder::(descr, encoding); match err { @@ -1341,7 +1369,10 @@ mod tests { let mut v = vec![]; let type_len = ::std::mem::size_of::(); v.extend_from_slice(unsafe { - ::std::slice::from_raw_parts(data.as_ptr() as *const u8, data.len() * type_len) + ::std::slice::from_raw_parts( + data.as_ptr() as *const u8, + data.len() * type_len, + ) }); v } @@ -1367,7 +1398,8 @@ mod tests { let mut v = vec![]; for d in data { unsafe { - let copy = ::std::slice::from_raw_parts(d.data().as_ptr() as *const u8, 12); + let copy = + ::std::slice::from_raw_parts(d.data().as_ptr() as *const u8, 12); v.extend_from_slice(copy); }; } diff --git a/rust/parquet/src/encodings/encoding.rs b/rust/parquet/src/encodings/encoding.rs index e1d674cc6ca2c..a045187d8295b 100644 --- a/rust/parquet/src/encodings/encoding.rs +++ b/rust/parquet/src/encodings/encoding.rs @@ -271,8 +271,8 @@ impl DictEncoder { plain_encoder.flush_buffer() } - /// Writes out the dictionary values with RLE encoding in a byte buffer, and return the - /// result. + /// Writes out the dictionary values with RLE encoding in a byte buffer, and return + /// the result. #[inline] pub fn write_indices(&mut self) -> Result { // TODO: the caller should allocate the buffer @@ -311,7 +311,9 @@ impl DictEncoder { self.hash_slots[j] = index; self.add_dict_key(value.clone()); - if self.uniques.size() > (self.hash_table_size as f32 * MAX_HASH_LOAD) as usize { + if self.uniques.size() + > (self.hash_table_size as f32 * MAX_HASH_LOAD) as usize + { self.double_table_size(); } } @@ -585,8 +587,9 @@ impl DeltaBitPackEncoder { /// Writes page header for blocks, this method is invoked when we are done encoding /// values. It is also okay to encode when no values have been provided fn write_page_header(&mut self) { - // We ignore the result of each 'put' operation, because MAX_PAGE_HEADER_WRITER_SIZE - // is chosen to fit all header values and guarantees that writes will not fail. + // We ignore the result of each 'put' operation, because + // MAX_PAGE_HEADER_WRITER_SIZE is chosen to fit all header values and + // guarantees that writes will not fail. 
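        // Header layout emitted below, each field a VLQ int per the
        // DELTA_BINARY_PACKED spec:
        //   <block size in values> <mini blocks per block>
        //   <total value count> <first value (zigzag encoded)>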
// Write the size of each block self.page_header_writer.put_vlq_int(self.block_size as u64); @@ -622,8 +625,8 @@ impl DeltaBitPackEncoder { }; for i in 0..self.num_mini_blocks { - // Find how many values we need to encode - either block size or whatever values - // left + // Find how many values we need to encode - either block size or whatever + // values left let n = cmp::min(self.mini_block_size, self.values_in_block); if n == 0 { break; @@ -632,7 +635,8 @@ impl DeltaBitPackEncoder { // Compute the max delta in current mini block let mut max_delta = i64::min_value(); for j in 0..n { - max_delta = cmp::max(max_delta, self.deltas[i * self.mini_block_size + j]); + max_delta = + cmp::max(max_delta, self.deltas[i * self.mini_block_size + j]); } // Compute bit width to store (max_delta - min_delta) @@ -641,8 +645,8 @@ impl DeltaBitPackEncoder { // Encode values in current mini block using min_delta and bit_width for j in 0..n { - let packed_value = - self.subtract_u64(self.deltas[i * self.mini_block_size + j], min_delta); + let packed_value = self + .subtract_u64(self.deltas[i * self.mini_block_size + j], min_delta); self.bit_writer.put_value(packed_value, bit_width); } @@ -913,7 +917,9 @@ impl DeltaByteArrayEncoder { impl Encoder for DeltaByteArrayEncoder { default fn put(&mut self, _values: &[T::T]) -> Result<()> { - panic!("DeltaByteArrayEncoder only supports ByteArrayType and FixedLenByteArrayType"); + panic!( + "DeltaByteArrayEncoder only supports ByteArrayType and FixedLenByteArrayType" + ); } fn encoding(&self) -> Encoding { @@ -926,7 +932,9 @@ impl Encoder for DeltaByteArrayEncoder { } default fn flush_buffer(&mut self) -> Result { - panic!("DeltaByteArrayEncoder only supports ByteArrayType and FixedLenByteArrayType"); + panic!( + "DeltaByteArrayEncoder only supports ByteArrayType and FixedLenByteArrayType" + ); } } @@ -937,10 +945,12 @@ impl Encoder for DeltaByteArrayEncoder { for byte_array in values { let current = byte_array.data(); - // Maximum prefix length that is shared between previous value and current value + // Maximum prefix length that is shared between previous value and current + // value let prefix_len = cmp::min(self.previous.len(), current.len()); let mut match_len = 0; - while match_len < prefix_len && self.previous[match_len] == current[match_len] { + while match_len < prefix_len && self.previous[match_len] == current[match_len] + { match_len += 1; } prefix_lengths.push(match_len as i32); @@ -972,12 +982,14 @@ impl Encoder for DeltaByteArrayEncoder { impl Encoder for DeltaByteArrayEncoder { fn put(&mut self, values: &[ByteArray]) -> Result<()> { - let s: &mut DeltaByteArrayEncoder = unsafe { mem::transmute(self) }; + let s: &mut DeltaByteArrayEncoder = + unsafe { mem::transmute(self) }; s.put(values) } fn flush_buffer(&mut self) -> Result { - let s: &mut DeltaByteArrayEncoder = unsafe { mem::transmute(self) }; + let s: &mut DeltaByteArrayEncoder = + unsafe { mem::transmute(self) }; s.flush_buffer() } } @@ -989,7 +1001,9 @@ mod tests { use std::rc::Rc; use crate::decoding::{get_decoder, Decoder, DictDecoder, PlainDecoder}; - use crate::schema::types::{ColumnDescPtr, ColumnDescriptor, ColumnPath, Type as SchemaType}; + use crate::schema::types::{ + ColumnDescPtr, ColumnDescriptor, ColumnPath, Type as SchemaType, + }; use crate::util::{memory::MemTracker, test_common::RandGen}; const TEST_SET_SIZE: usize = 1024; @@ -1080,7 +1094,11 @@ mod tests { #[test] fn test_dict_encoded_size() { - fn run_test(type_length: i32, values: &[T::T], expected_size: usize) { + fn 
run_test( + type_length: i32, + values: &[T::T], + expected_size: usize, + ) { let mut encoder = create_test_dict_encoder::(type_length); assert_eq!(encoder.dict_encoded_size(), 0); encoder.put(values).unwrap(); @@ -1102,8 +1120,16 @@ mod tests { &[Int96::from(vec![1, 2, 3]), Int96::from(vec![2, 3, 4])], 32, ); - run_test::(-1, &[ByteArray::from("abcd"), ByteArray::from("efj")], 15); - run_test::(2, &[ByteArray::from("ab"), ByteArray::from("bc")], 4); + run_test::( + -1, + &[ByteArray::from("abcd"), ByteArray::from("efj")], + 15, + ); + run_test::( + 2, + &[ByteArray::from("ab"), ByteArray::from("bc")], + 4, + ); } #[test] @@ -1179,8 +1205,10 @@ mod tests { // See: https://github.com/sunchao/parquet-rs/issues/47 #[test] fn test_issue_47() { - let mut encoder = create_test_encoder::(0, Encoding::DELTA_BYTE_ARRAY); - let mut decoder = create_test_decoder::(0, Encoding::DELTA_BYTE_ARRAY); + let mut encoder = + create_test_encoder::(0, Encoding::DELTA_BYTE_ARRAY); + let mut decoder = + create_test_decoder::(0, Encoding::DELTA_BYTE_ARRAY); let mut input = vec![]; input.push(ByteArray::from("aa")); @@ -1189,7 +1217,8 @@ mod tests { input.push(ByteArray::from("aaa")); let mut output = vec![ByteArray::default(); input.len()]; - let mut result = put_and_get(&mut encoder, &mut decoder, &input[..2], &mut output[..2]); + let mut result = + put_and_get(&mut encoder, &mut decoder, &input[..2], &mut output[..2]); assert!( result.is_ok(), "first put_and_get() failed with: {}", @@ -1304,7 +1333,10 @@ mod tests { decoder.get(output) } - fn create_and_check_encoder(encoding: Encoding, err: Option) { + fn create_and_check_encoder( + encoding: Encoding, + err: Option, + ) { let descr = create_test_col_desc_ptr(-1, T::get_physical_type()); let mem_tracker = Rc::new(MemTracker::new()); let encoder = get_encoder::(descr, encoding, mem_tracker); diff --git a/rust/parquet/src/encodings/levels.rs b/rust/parquet/src/encodings/levels.rs index 29c92ddcdba9b..93de6b1d546b0 100644 --- a/rust/parquet/src/encodings/levels.rs +++ b/rust/parquet/src/encodings/levels.rs @@ -31,14 +31,20 @@ use crate::util::{ /// repetition/definition level and number of total buffered values (includes null /// values). #[inline] -pub fn max_buffer_size(encoding: Encoding, max_level: i16, num_buffered_values: usize) -> usize { +pub fn max_buffer_size( + encoding: Encoding, + max_level: i16, + num_buffered_values: usize, +) -> usize { let bit_width = log2(max_level as u64 + 1) as u8; match encoding { Encoding::RLE => { RleEncoder::max_buffer_size(bit_width, num_buffered_values) + RleEncoder::min_buffer_size(bit_width) } - Encoding::BIT_PACKED => ceil((num_buffered_values * bit_width as usize) as i64, 8) as usize, + Encoding::BIT_PACKED => { + ceil((num_buffered_values * bit_width as usize) as i64, 8) as usize + } _ => panic!("Unsupported encoding type {}", encoding), } } @@ -71,7 +77,10 @@ impl LevelEncoder { // Here we set full byte buffer without adjusting for num_buffered_values, // because byte buffer will already be allocated with size from // `max_buffer_size()` method. - LevelEncoder::BIT_PACKED(bit_width, BitWriter::new_from_buf(byte_buffer, 0)) + LevelEncoder::BIT_PACKED( + bit_width, + BitWriter::new_from_buf(byte_buffer, 0), + ) } _ => panic!("Unsupported encoding type {}", encoding), } @@ -85,8 +94,8 @@ impl LevelEncoder { } /// Put/encode levels vector into this level encoder. - /// Returns number of encoded values that are less than or equal to length of the input - /// buffer. 
+ /// Returns number of encoded values that are less than or equal to length of the + /// input buffer. /// /// RLE and BIT_PACKED level encoders return Err() when internal buffer overflows or /// flush fails. @@ -94,7 +103,8 @@ impl LevelEncoder { pub fn put(&mut self, buffer: &[i16]) -> Result { let mut num_encoded = 0; match *self { - LevelEncoder::RLE(ref mut encoder) | LevelEncoder::RLE_V2(ref mut encoder) => { + LevelEncoder::RLE(ref mut encoder) + | LevelEncoder::RLE_V2(ref mut encoder) => { for value in buffer { if !encoder.put(*value as u64)? { return Err(general_err!("RLE buffer is full")); @@ -179,8 +189,8 @@ impl LevelDecoder { /// `data` is encoded data as byte buffer, `num_buffered_values` represents total /// number of values that is expected. /// - /// Both RLE and BIT_PACKED level decoders set `num_buffered_values` as total number of - /// values that they can return and track num values. + /// Both RLE and BIT_PACKED level decoders set `num_buffered_values` as total number + /// of values that they can return and track num values. #[inline] pub fn set_data(&mut self, num_buffered_values: usize, data: ByteBufferPtr) -> usize { match *self { @@ -193,9 +203,10 @@ impl LevelDecoder { } LevelDecoder::BIT_PACKED(ref mut num_values, bit_width, ref mut decoder) => { *num_values = Some(num_buffered_values); - // Set appropriate number of bytes: if max size is larger than buffer - set full - // buffer - let num_bytes = ceil((num_buffered_values * bit_width as usize) as i64, 8); + // Set appropriate number of bytes: if max size is larger than buffer - + // set full buffer + let num_bytes = + ceil((num_buffered_values * bit_width as usize) as i64, 8); let data_size = cmp::min(num_bytes as usize, data.len()); decoder.reset(data.range(data.start(), data_size)); data_size @@ -221,7 +232,9 @@ impl LevelDecoder { *num_values = Some(num_buffered_values); len } - _ => panic!("set_data_range() method is only supported by RLE v2 encoding type"), + _ => panic!( + "set_data_range() method is only supported by RLE v2 encoding type" + ), } } @@ -251,11 +264,12 @@ impl LevelDecoder { Ok(values_read) } LevelDecoder::BIT_PACKED(ref mut num_values, bit_width, ref mut decoder) => { - // When extracting values from bit reader, it might return more values than left - // because of padding to a full byte, we use num_values to track precise number - // of values. + // When extracting values from bit reader, it might return more values + // than left because of padding to a full byte, we use + // num_values to track precise number of values. 
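            // Concrete example: with bit_width == 1 and only 3 values left,
            // the bit reader can still yield 8 entries from the padded byte;
            // the clamp below keeps those padding bits out of `buffer`.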
let len = cmp::min(num_values.unwrap(), buffer.len()); - let values_read = decoder.get_batch::(&mut buffer[..len], bit_width as usize); + let values_read = + decoder.get_batch::(&mut buffer[..len], bit_width as usize); *num_values = num_values.map(|len| len - values_read); Ok(values_read) } @@ -344,7 +358,12 @@ mod tests { // Tests encoding/decoding of values when output buffer is larger than number of // encoded values - fn test_internal_roundtrip_underflow(enc: Encoding, levels: &[i16], max_level: i16, v2: bool) { + fn test_internal_roundtrip_underflow( + enc: Encoding, + levels: &[i16], + max_level: i16, + v2: bool, + ) { let size = max_buffer_size(enc, max_level, levels.len()); let mut encoder = if v2 { LevelEncoder::v2(max_level, vec![0; size]) @@ -374,7 +393,12 @@ mod tests { } // Tests when encoded values are larger than encoder's buffer - fn test_internal_roundtrip_overflow(enc: Encoding, levels: &[i16], max_level: i16, v2: bool) { + fn test_internal_roundtrip_overflow( + enc: Encoding, + levels: &[i16], + max_level: i16, + v2: bool, + ) { let size = max_buffer_size(enc, max_level, levels.len()); let mut encoder = if v2 { LevelEncoder::v2(max_level, vec![0; size]) @@ -421,7 +445,12 @@ mod tests { let levels = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]; let max_level = 10; test_internal_roundtrip_incremental(Encoding::RLE, &levels, max_level, false); - test_internal_roundtrip_incremental(Encoding::BIT_PACKED, &levels, max_level, false); + test_internal_roundtrip_incremental( + Encoding::BIT_PACKED, + &levels, + max_level, + false, + ); test_internal_roundtrip_incremental(Encoding::RLE, &levels, max_level, true); } @@ -450,7 +479,12 @@ mod tests { let levels = vec![1, 1, 2, 3, 2, 1, 1, 2, 3, 1]; let max_level = 3; test_internal_roundtrip_underflow(Encoding::RLE, &levels, max_level, false); - test_internal_roundtrip_underflow(Encoding::BIT_PACKED, &levels, max_level, false); + test_internal_roundtrip_underflow( + Encoding::BIT_PACKED, + &levels, + max_level, + false, + ); test_internal_roundtrip_underflow(Encoding::RLE, &levels, max_level, true); } @@ -486,7 +520,9 @@ mod tests { } #[test] - #[should_panic(expected = "set_data_range() method is only supported by RLE v2 encoding type")] + #[should_panic( + expected = "set_data_range() method is only supported by RLE v2 encoding type" + )] fn test_bit_packed_decoder_set_data_range() { // Buffer containing both repetition and definition levels let buffer = ByteBufferPtr::new(vec![1, 2, 3, 4, 5]); @@ -497,7 +533,8 @@ mod tests { #[test] fn test_bit_packed_decoder_set_data() { - // Test the maximum size that is assigned based on number of values and buffer length + // Test the maximum size that is assigned based on number of values and buffer + // length let buffer = ByteBufferPtr::new(vec![1, 2, 3, 4, 5]); let max_level = 1; let mut decoder = LevelDecoder::v1(Encoding::BIT_PACKED, max_level); diff --git a/rust/parquet/src/encodings/rle.rs b/rust/parquet/src/encodings/rle.rs index 1a8b6e5c7c6b8..f9347a6eacbe6 100644 --- a/rust/parquet/src/encodings/rle.rs +++ b/rust/parquet/src/encodings/rle.rs @@ -140,7 +140,8 @@ impl RleEncoder { // Second the maximum size for RLE run let min_rle_run_size = 1 + bit_util::ceil(bit_width as i64, 8) as usize; - let rle_max_size = bit_util::ceil(num_values as i64, 8) as usize * min_rle_run_size; + let rle_max_size = + bit_util::ceil(num_values as i64, 8) as usize * min_rle_run_size; ::std::cmp::max(bit_packed_max_size, rle_max_size) as usize } @@ -206,7 +207,8 @@ impl RleEncoder { Ok(self.bit_writer.flush_buffer()) 
} - /// Clears the internal state so this encoder can be reused (e.g., after becoming full). + /// Clears the internal state so this encoder can be reused (e.g., after becoming + /// full). #[inline] pub fn clear(&mut self) { self.bit_writer.clear(); @@ -222,9 +224,13 @@ impl RleEncoder { /// internal writer. #[inline] pub fn flush(&mut self) -> Result<()> { - if self.bit_packed_count > 0 || self.repeat_count > 0 || self.num_buffered_values > 0 { + if self.bit_packed_count > 0 + || self.repeat_count > 0 + || self.num_buffered_values > 0 + { let all_repeat = self.bit_packed_count == 0 - && (self.repeat_count == self.num_buffered_values || self.num_buffered_values == 0); + && (self.repeat_count == self.num_buffered_values + || self.num_buffered_values == 0); if self.repeat_count > 0 && all_repeat { self.flush_rle_run()?; } else { @@ -306,7 +312,8 @@ impl RleEncoder { self.bit_packed_count += self.num_buffered_values; let num_groups = self.bit_packed_count / 8; if num_groups + 1 >= MAX_GROUPS_PER_BIT_PACKED_RUN { - // We've reached the maximum value that can be hold in a single bit-packed run. + // We've reached the maximum value that can be hold in a single bit-packed + // run. assert!(self.indicator_byte_pos >= 0); self.flush_bit_packed_run(true)?; } else { @@ -403,10 +410,12 @@ impl RleDecoder { while values_read < buffer.len() { if self.rle_left > 0 { assert!(self.current_value.is_some()); - let num_values = cmp::min(buffer.len() - values_read, self.rle_left as usize); + let num_values = + cmp::min(buffer.len() - values_read, self.rle_left as usize); for i in 0..num_values { - let repeated_value = - unsafe { transmute_copy::(self.current_value.as_mut().unwrap()) }; + let repeated_value = unsafe { + transmute_copy::(self.current_value.as_mut().unwrap()) + }; buffer[values_read + i] = repeated_value; } self.rle_left -= num_values as u32; @@ -449,7 +458,8 @@ impl RleDecoder { while values_read < max_values { if self.rle_left > 0 { assert!(self.current_value.is_some()); - let num_values = cmp::min(max_values - values_read, self.rle_left as usize); + let num_values = + cmp::min(max_values - values_read, self.rle_left as usize); let dict_idx = self.current_value.unwrap() as usize; for i in 0..num_values { buffer[values_read + i] = dict[dict_idx].clone(); @@ -498,7 +508,8 @@ impl RleDecoder { } else { self.rle_left = (indicator_value >> 1) as u32; let value_width = bit_util::ceil(self.bit_width as i64, 8); - self.current_value = bit_reader.get_aligned::(value_width as usize); + self.current_value = + bit_reader.get_aligned::(value_width as usize); assert!(self.current_value.is_some()); } return true; @@ -562,7 +573,8 @@ mod tests { // 100 / 8 = 13 groups // 00011011 10101010 ... 
00001010 let data2 = ByteBufferPtr::new(vec![ - 0x1B, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x0A, + 0x1B, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, + 0x0A, ]); let mut decoder: RleDecoder = RleDecoder::new(1); @@ -618,10 +630,14 @@ mod tests { decoder.set_data(data); let mut buffer = vec![""; 12]; let expected = vec![ - "ddd", "eee", "fff", "ddd", "eee", "fff", "ddd", "eee", "fff", "eee", "fff", "fff", + "ddd", "eee", "fff", "ddd", "eee", "fff", "ddd", "eee", "fff", "eee", "fff", + "fff", ]; - let result = - decoder.get_batch_with_dict::<&str>(dict.as_slice(), buffer.as_mut_slice(), 12); + let result = decoder.get_batch_with_dict::<&str>( + dict.as_slice(), + buffer.as_mut_slice(), + 12, + ); assert!(result.is_ok()); assert_eq!(buffer, expected); } @@ -781,7 +797,8 @@ mod tests { assert!(result, "put() should not return false"); } - let buffer = ByteBufferPtr::new(encoder.consume().expect("consume() should be OK")); + let buffer = + ByteBufferPtr::new(encoder.consume().expect("consume() should be OK")); // Verify read let mut decoder = RleDecoder::new(bit_width); @@ -815,7 +832,8 @@ mod tests { for _ in 0..niters { values.clear(); let mut rng = thread_rng(); - let seed_vec: Vec = Standard.sample_iter(&mut rng).take(seed_len).collect(); + let seed_vec: Vec = + Standard.sample_iter(&mut rng).take(seed_len).collect(); let mut seed = [0u8; 32]; seed.copy_from_slice(&seed_vec[0..seed_len]); let mut gen = rand::StdRng::from_seed(seed); diff --git a/rust/parquet/src/file/metadata.rs b/rust/parquet/src/file/metadata.rs index 06507fdcad2a8..16825a0c070ec 100644 --- a/rust/parquet/src/file/metadata.rs +++ b/rust/parquet/src/file/metadata.rs @@ -57,7 +57,10 @@ pub struct ParquetMetaData { impl ParquetMetaData { /// Creates Parquet metadata from file metadata and a list of row group metadata `Rc`s /// for each available row group. - pub fn new(file_metadata: FileMetaData, row_group_ptrs: Vec) -> Self { + pub fn new( + file_metadata: FileMetaData, + row_group_ptrs: Vec, + ) -> Self { ParquetMetaData { file_metadata: Rc::new(file_metadata), row_groups: row_group_ptrs, @@ -158,8 +161,8 @@ impl FileMetaData { /// Column (sort) order used for `min` and `max` values of each column in this file. /// - /// Each column order corresponds to one column, determined by its position in the list, - /// matching the position of the column in the schema. + /// Each column order corresponds to one column, determined by its position in the + /// list, matching the position of the column in the schema. /// /// When `None` is returned, there are no column orders available, and each column /// should be assumed to have undefined (legacy) column order. @@ -230,7 +233,10 @@ impl RowGroupMetaData { } /// Method to convert from Thrift. - pub fn from_thrift(schema_descr: SchemaDescPtr, mut rg: RowGroup) -> Result { + pub fn from_thrift( + schema_descr: SchemaDescPtr, + mut rg: RowGroup, + ) -> Result { assert_eq!(schema_descr.num_columns(), rg.columns.len()); let total_byte_size = rg.total_byte_size; let num_rows = rg.num_rows; diff --git a/rust/parquet/src/file/properties.rs b/rust/parquet/src/file/properties.rs index 47b232e6fab04..54f093d222535 100644 --- a/rust/parquet/src/file/properties.rs +++ b/rust/parquet/src/file/properties.rs @@ -295,9 +295,9 @@ impl WriterPropertiesBuilder { /// Sets encoding for any column. /// - /// If dictionary is not enabled, this is treated as a primary encoding for all columns. 
- /// In case when dictionary is enabled for any column, this value is considered to - /// be a fallback encoding for that column. + /// If dictionary is not enabled, this is treated as a primary encoding for all + /// columns. In case when dictionary is enabled for any column, this value is + /// considered to be a fallback encoding for that column. /// /// Panics if user tries to set dictionary encoding here, regardless of dictinoary /// encoding flag being set. @@ -349,9 +349,10 @@ impl WriterPropertiesBuilder { /// Sets encoding for a column. /// Takes precedence over globally defined settings. /// - /// If dictionary is not enabled, this is treated as a primary encoding for this column. - /// In case when dictionary is enabled for this column, either through global defaults - /// or explicitly, this value is considered to be a fallback encoding for this column. + /// If dictionary is not enabled, this is treated as a primary encoding for this + /// column. In case when dictionary is enabled for this column, either through + /// global defaults or explicitly, this value is considered to be a fallback + /// encoding for this column. /// /// Panics if user tries to set dictionary encoding here, regardless of dictinoary /// encoding flag being set. @@ -383,7 +384,11 @@ impl WriterPropertiesBuilder { /// Sets max size for statistics for a column. /// Takes precedence over globally defined settings. - pub fn set_column_max_statistics_size(mut self, col: ColumnPath, value: usize) -> Self { + pub fn set_column_max_statistics_size( + mut self, + col: ColumnPath, + value: usize, + ) -> Self { self.get_mut_props(col).set_max_statistics_size(value); self } @@ -460,8 +465,9 @@ impl ColumnProperties { self.codec } - /// Returns `Some(true)` if dictionary encoding is enabled for this column, if disabled - /// then returns `Some(false)`. If result is `None`, then no setting has been provided. + /// Returns `Some(true)` if dictionary encoding is enabled for this column, if + /// disabled then returns `Some(false)`. If result is `None`, then no setting has + /// been provided. 
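    // Sketch of how this tri-state is meant to resolve when the writer looks
    // up a column (the per-column setting wins; `None` falls through to the
    // global default on `WriterProperties`):
    //   Some(true)  => dictionary encoding on for this column
    //   Some(false) => dictionary encoding off for this column
    //   None        => no per-column setting; use the global flag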
fn dictionary_enabled(&self) -> Option { self.dictionary_enabled } diff --git a/rust/parquet/src/file/reader.rs b/rust/parquet/src/file/reader.rs index 747fbbc64f82e..90d115590a42e 100644 --- a/rust/parquet/src/file/reader.rs +++ b/rust/parquet/src/file/reader.rs @@ -193,8 +193,10 @@ impl SerializedFileReader { // TODO: row group filtering let mut prot = TCompactInputProtocol::new(metadata_buf); - let mut t_file_metadata: TFileMetaData = TFileMetaData::read_from_in_protocol(&mut prot) - .map_err(|e| ParquetError::General(format!("Could not parse metadata: {}", e)))?; + let mut t_file_metadata: TFileMetaData = + TFileMetaData::read_from_in_protocol(&mut prot).map_err(|e| { + ParquetError::General(format!("Could not parse metadata: {}", e)) + })?; let schema = types::from_thrift(&mut t_file_metadata.schema)?; let schema_descr = Rc::new(SchemaDescriptor::new(schema.clone())); let mut row_groups = Vec::new(); @@ -204,7 +206,8 @@ impl SerializedFileReader { rg, )?)); } - let column_orders = Self::parse_column_orders(t_file_metadata.column_orders, &schema_descr); + let column_orders = + Self::parse_column_orders(t_file_metadata.column_orders, &schema_descr); let file_metadata = FileMetaData::new( t_file_metadata.version, @@ -338,7 +341,8 @@ impl RowGroupReader for SerializedRowGroupReader col_start = col.dictionary_page_offset().unwrap(); } let col_length = col.compressed_size(); - let file_chunk = FileSource::new(self.buf.get_ref(), col_start as u64, col_length as usize); + let file_chunk = + FileSource::new(self.buf.get_ref(), col_start as u64, col_length as usize); let page_reader = SerializedPageReader::new( file_chunk, col.num_values(), @@ -353,28 +357,33 @@ impl RowGroupReader for SerializedRowGroupReader let col_descr = schema_descr.column(i); let col_page_reader = self.get_column_page_reader(i)?; let col_reader = match col_descr.physical_type() { - Type::BOOLEAN => { - ColumnReader::BoolColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) - } - Type::INT32 => { - ColumnReader::Int32ColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) - } - Type::INT64 => { - ColumnReader::Int64ColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) - } - Type::INT96 => { - ColumnReader::Int96ColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) - } - Type::FLOAT => { - ColumnReader::FloatColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) - } - Type::DOUBLE => { - ColumnReader::DoubleColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) - } - Type::BYTE_ARRAY => ColumnReader::ByteArrayColumnReader(ColumnReaderImpl::new( + Type::BOOLEAN => ColumnReader::BoolColumnReader(ColumnReaderImpl::new( + col_descr, + col_page_reader, + )), + Type::INT32 => ColumnReader::Int32ColumnReader(ColumnReaderImpl::new( + col_descr, + col_page_reader, + )), + Type::INT64 => ColumnReader::Int64ColumnReader(ColumnReaderImpl::new( col_descr, col_page_reader, )), + Type::INT96 => ColumnReader::Int96ColumnReader(ColumnReaderImpl::new( + col_descr, + col_page_reader, + )), + Type::FLOAT => ColumnReader::FloatColumnReader(ColumnReaderImpl::new( + col_descr, + col_page_reader, + )), + Type::DOUBLE => ColumnReader::DoubleColumnReader(ColumnReaderImpl::new( + col_descr, + col_page_reader, + )), + Type::BYTE_ARRAY => ColumnReader::ByteArrayColumnReader( + ColumnReaderImpl::new(col_descr, col_page_reader), + ), Type::FIXED_LEN_BYTE_ARRAY => ColumnReader::FixedLenByteArrayColumnReader( ColumnReaderImpl::new(col_descr, col_page_reader), ), @@ -438,18 +447,19 @@ impl PageReader 
for SerializedPageReader { while self.seen_num_values < self.total_num_values { let page_header = self.read_page_header()?; - // When processing data page v2, depending on enabled compression for the page, we - // should account for uncompressed data ('offset') of repetition and definition - // levels. + // When processing data page v2, depending on enabled compression for the + // page, we should account for uncompressed data ('offset') of + // repetition and definition levels. // - // We always use 0 offset for other pages other than v2, `true` flag means that - // compression will be applied if decompressor is defined + // We always use 0 offset for other pages other than v2, `true` flag means + // that compression will be applied if decompressor is defined let mut offset: usize = 0; let mut can_decompress = true; if let Some(ref header_v2) = page_header.data_page_header_v2 { offset = (header_v2.definition_levels_byte_length - + header_v2.repetition_levels_byte_length) as usize; + + header_v2.repetition_levels_byte_length) + as usize; // When is_compressed flag is missing the page is considered compressed can_decompress = header_v2.is_compressed.unwrap_or(true); } @@ -460,13 +470,13 @@ impl PageReader for SerializedPageReader { let mut buffer = vec![0; offset + compressed_len]; self.buf.read_exact(&mut buffer)?; - // TODO: page header could be huge because of statistics. We should set a maximum - // page header size and abort if that is exceeded. + // TODO: page header could be huge because of statistics. We should set a + // maximum page header size and abort if that is exceeded. if let Some(decompressor) = self.decompressor.as_mut() { if can_decompress { let mut decompressed_buffer = Vec::with_capacity(uncompressed_len); - let decompressed_size = - decompressor.decompress(&buffer[offset..], &mut decompressed_buffer)?; + let decompressed_size = decompressor + .decompress(&buffer[offset..], &mut decompressed_buffer)?; if decompressed_size != uncompressed_len { return Err(general_err!( "Actual decompressed size doesn't match the expected one ({} vs {})", @@ -487,7 +497,8 @@ impl PageReader for SerializedPageReader { let result = match page_header.type_ { PageType::DICTIONARY_PAGE => { assert!(page_header.dictionary_page_header.is_some()); - let dict_header = page_header.dictionary_page_header.as_ref().unwrap(); + let dict_header = + page_header.dictionary_page_header.as_ref().unwrap(); let is_sorted = dict_header.is_sorted.unwrap_or(false); Page::DictionaryPage { buf: ByteBufferPtr::new(buffer), @@ -504,9 +515,16 @@ impl PageReader for SerializedPageReader { buf: ByteBufferPtr::new(buffer), num_values: header.num_values as u32, encoding: Encoding::from(header.encoding), - def_level_encoding: Encoding::from(header.definition_level_encoding), - rep_level_encoding: Encoding::from(header.repetition_level_encoding), - statistics: statistics::from_thrift(self.physical_type, header.statistics), + def_level_encoding: Encoding::from( + header.definition_level_encoding, + ), + rep_level_encoding: Encoding::from( + header.repetition_level_encoding, + ), + statistics: statistics::from_thrift( + self.physical_type, + header.statistics, + ), } } PageType::DATA_PAGE_V2 => { @@ -523,7 +541,10 @@ impl PageReader for SerializedPageReader { def_levels_byte_len: header.definition_levels_byte_length as u32, rep_levels_byte_len: header.repetition_levels_byte_length as u32, is_compressed, - statistics: statistics::from_thrift(self.physical_type, header.statistics), + statistics: statistics::from_thrift( + 
self.physical_type, + header.statistics, + ), } } _ => { @@ -566,8 +587,8 @@ mod tests { // let cursor = Cursor::new(buffer.as_ref()); // let read_from_file = - // SerializedFileReader::new(File::open("testdata/alltypes_plain.parquet").unwrap()) - // .unwrap(); + // SerializedFileReader::new(File::open("testdata/alltypes_plain.parquet"). + // unwrap()) .unwrap(); // let read_from_cursor = SerializedFileReader::new(cursor).unwrap(); // let file_iter = read_from_file.get_row_iter(None).unwrap(); @@ -589,18 +610,22 @@ mod tests { #[test] fn test_file_reader_metadata_invalid_length() { - let test_file = get_temp_file("corrupt-3.parquet", &[0, 0, 0, 255, b'P', b'A', b'R', b'1']); + let test_file = + get_temp_file("corrupt-3.parquet", &[0, 0, 0, 255, b'P', b'A', b'R', b'1']); let reader_result = SerializedFileReader::new(test_file); assert!(reader_result.is_err()); assert_eq!( reader_result.err().unwrap(), - general_err!("Invalid Parquet file. Metadata length is less than zero (-16777216)") + general_err!( + "Invalid Parquet file. Metadata length is less than zero (-16777216)" + ) ); } #[test] fn test_file_reader_metadata_invalid_start() { - let test_file = get_temp_file("corrupt-4.parquet", &[255, 0, 0, 0, b'P', b'A', b'R', b'1']); + let test_file = + get_temp_file("corrupt-4.parquet", &[255, 0, 0, 0, b'P', b'A', b'R', b'1']); let reader_result = SerializedFileReader::new(test_file); assert!(reader_result.is_err()); assert_eq!( @@ -636,7 +661,10 @@ mod tests { ]); assert_eq!( - SerializedFileReader::::parse_column_orders(t_column_orders, &schema_descr), + SerializedFileReader::::parse_column_orders( + t_column_orders, + &schema_descr + ), Some(vec![ ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED), ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED) @@ -656,7 +684,8 @@ mod tests { let schema = SchemaType::group_type_builder("schema").build().unwrap(); let schema_descr = SchemaDescriptor::new(Rc::new(schema)); - let t_column_orders = Some(vec![TColumnOrder::TYPEORDER(TypeDefinedOrder::new())]); + let t_column_orders = + Some(vec![TColumnOrder::TYPEORDER(TypeDefinedOrder::new())]); SerializedFileReader::::parse_column_orders(t_column_orders, &schema_descr); } diff --git a/rust/parquet/src/file/statistics.rs b/rust/parquet/src/file/statistics.rs index 03831bbc72bf7..27bbcb68a9033 100644 --- a/rust/parquet/src/file/statistics.rs +++ b/rust/parquet/src/file/statistics.rs @@ -83,7 +83,10 @@ macro_rules! statistics_enum_func { } /// Converts Thrift definition into `Statistics`. -pub fn from_thrift(physical_type: Type, thrift_stats: Option) -> Option { +pub fn from_thrift( + physical_type: Type, + thrift_stats: Option, +) -> Option { match thrift_stats { Some(stats) => { // Number of nulls recorded, when it is not available, we just mark it as 0. @@ -141,19 +144,25 @@ pub fn from_thrift(physical_type: Type, thrift_stats: Option) -> Op ), Type::INT96 => { // INT96 statistics may not be correct, because comparison is signed - // byte-wise, not actual timestamps. It is recommended to ignore min/max - // statistics for INT96 columns. + // byte-wise, not actual timestamps. It is recommended to ignore + // min/max statistics for INT96 columns. 
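                    // The 12 statistics bytes are reinterpreted below as the
                    // three u32 words of an `Int96`; only the byte length is
                    // validated before the cast.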
let min = min.map(|data| { assert_eq!(data.len(), 12); unsafe { - let raw = ::std::slice::from_raw_parts(data.as_ptr() as *mut u32, 3); + let raw = ::std::slice::from_raw_parts( + data.as_ptr() as *mut u32, + 3, + ); Int96::from(Vec::from(raw)) } }); let max = max.map(|data| { assert_eq!(data.len(), 12); unsafe { - let raw = ::std::slice::from_raw_parts(data.as_ptr() as *mut u32, 3); + let raw = ::std::slice::from_raw_parts( + data.as_ptr() as *mut u32, + 3, + ); Int96::from(Vec::from(raw)) } }); @@ -454,7 +463,11 @@ impl fmt::Debug for TypedStatistics { f, "{{min: {:?}, max: {:?}, distinct_count: {:?}, null_count: {}, \ min_max_deprecated: {}}}", - self.min, self.max, self.distinct_count, self.null_count, self.is_min_max_deprecated + self.min, + self.max, + self.distinct_count, + self.null_count, + self.is_min_max_deprecated ) } } diff --git a/rust/parquet/src/file/writer.rs b/rust/parquet/src/file/writer.rs index e000842f3895f..a5cd78ad8d2ea 100644 --- a/rust/parquet/src/file/writer.rs +++ b/rust/parquet/src/file/writer.rs @@ -35,8 +35,8 @@ use crate::column::{ }; use crate::errors::{ParquetError, Result}; use crate::file::{ - metadata::*, properties::WriterPropertiesPtr, statistics::to_thrift as statistics_to_thrift, - FOOTER_SIZE, PARQUET_MAGIC, + metadata::*, properties::WriterPropertiesPtr, + statistics::to_thrift as statistics_to_thrift, FOOTER_SIZE, PARQUET_MAGIC, }; use crate::schema::types::{self, SchemaDescPtr, SchemaDescriptor, TypePtr}; use crate::util::io::{FileSink, Position}; @@ -72,8 +72,8 @@ pub trait FileWriter { /// All row groups must be appended before this method is called. /// No writes are allowed after this point. /// - /// Can be called multiple times. It is up to implementation to either result in no-op, - /// or return an `Err` for subsequent calls. + /// Can be called multiple times. It is up to implementation to either result in + /// no-op, or return an `Err` for subsequent calls. fn close(&mut self) -> Result<()>; } @@ -130,7 +130,11 @@ pub struct SerializedFileWriter { impl SerializedFileWriter { /// Creates new file writer. 
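    /// A minimal usage sketch (schema string is illustrative; error handling
    /// elided):
    ///
    /// ```ignore
    /// let schema = Rc::new(parse_message_type(
    ///     "message schema { REQUIRED INT32 id; }",
    /// )?);
    /// let props = Rc::new(WriterProperties::builder().build());
    /// let mut writer = SerializedFileWriter::new(file, schema, props)?;
    /// let mut row_group = writer.next_row_group()?;
    /// // ... write each column via row_group.next_column() ...
    /// writer.close_row_group(row_group)?;
    /// writer.close()?;
    /// ```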
- pub fn new(mut file: File, schema: TypePtr, properties: WriterPropertiesPtr) -> Result { + pub fn new( + mut file: File, + schema: TypePtr, + properties: WriterPropertiesPtr, + ) -> Result { Self::start_file(&mut file)?; Ok(Self { file, @@ -219,8 +223,11 @@ impl FileWriter for SerializedFileWriter { fn next_row_group(&mut self) -> Result> { self.assert_closed()?; self.assert_previous_writer_closed()?; - let row_group_writer = - SerializedRowGroupWriter::new(self.descr.clone(), self.props.clone(), &self.file); + let row_group_writer = SerializedRowGroupWriter::new( + self.descr.clone(), + self.props.clone(), + &self.file, + ); self.previous_writer_closed = false; Ok(Box::new(row_group_writer)) } @@ -259,7 +266,11 @@ pub struct SerializedRowGroupWriter { } impl SerializedRowGroupWriter { - pub fn new(schema_descr: SchemaDescPtr, properties: WriterPropertiesPtr, file: &File) -> Self { + pub fn new( + schema_descr: SchemaDescPtr, + properties: WriterPropertiesPtr, + file: &File, + ) -> Self { let num_columns = schema_descr.num_columns(); Self { descr: schema_descr, @@ -564,7 +575,8 @@ mod tests { #[test] fn test_row_group_writer_error_not_all_columns_written() { - let file = get_temp_file("test_row_group_writer_error_not_all_columns_written", &[]); + let file = + get_temp_file("test_row_group_writer_error_not_all_columns_written", &[]); let schema = Rc::new( types::Type::group_type_builder("schema") .with_fields(&mut vec![Rc::new( @@ -781,7 +793,10 @@ mod tests { encoding, def_level_encoding, rep_level_encoding, - statistics: from_thrift(physical_type, to_thrift(statistics.as_ref())), + statistics: from_thrift( + physical_type, + to_thrift(statistics.as_ref()), + ), } } &Page::DataPageV2 { @@ -797,7 +812,8 @@ mod tests { } => { total_num_values += num_values as i64; let offset = (def_levels_byte_len + rep_levels_byte_len) as usize; - let cmp_buf = compress_helper(compressor.as_mut(), &buf.data()[offset..]); + let cmp_buf = + compress_helper(compressor.as_mut(), &buf.data()[offset..]); let mut output_buf = Vec::from(&buf.data()[..offset]); output_buf.extend_from_slice(&cmp_buf[..]); @@ -810,7 +826,10 @@ mod tests { def_levels_byte_len, rep_levels_byte_len, is_compressed: compressor.is_some(), - statistics: from_thrift(physical_type, to_thrift(statistics.as_ref())), + statistics: from_thrift( + physical_type, + to_thrift(statistics.as_ref()), + ), } } &Page::DictionaryPage { diff --git a/rust/parquet/src/reader/schema.rs b/rust/parquet/src/reader/schema.rs index 68fd867a821cd..34276a2d5633f 100644 --- a/rust/parquet/src/reader/schema.rs +++ b/rust/parquet/src/reader/schema.rs @@ -32,7 +32,10 @@ use arrow::datatypes::{DataType, Field, Schema}; /// Convert parquet schema to arrow schema. pub fn parquet_to_arrow_schema(parquet_schema: SchemaDescPtr) -> Result { - parquet_to_arrow_schema_by_columns(parquet_schema.clone(), 0..parquet_schema.columns().len()) + parquet_to_arrow_schema_by_columns( + parquet_schema.clone(), + 0..parquet_schema.columns().len(), + ) } /// Convert parquet schema to arrow schema, only preserving some leaf columns. @@ -117,8 +120,9 @@ impl ParquetTypeConverter { /// [`to_data_type`](`ParquetTypeConverter::to_data_type`), except it reserves schema /// name. fn to_field(&self) -> Result> { - self.to_data_type() - .map(|opt| opt.map(|dt| Field::new(self.schema.name(), dt, self.is_nullable()))) + self.to_data_type().map(|opt| { + opt.map(|dt| Field::new(self.schema.name(), dt, self.is_nullable())) + }) } // Utility functions. 
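// A condensed sketch of the two entry points above, mirroring the tests later
// in this file (message string is illustrative):
//
//   let group = parse_message_type("message s { REQUIRED INT32 a; }")?;
//   let descr = Rc::new(SchemaDescriptor::new(Rc::new(group)));
//   let full = parquet_to_arrow_schema(descr.clone())?;
//   let partial = parquet_to_arrow_schema_by_columns(descr, vec![0usize])?;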
@@ -252,7 +256,9 @@ impl ParquetTypeConverter { .iter() .map(|field_ptr| self.clone_with_schema(field_ptr.clone()).to_field()) .collect::>>>() - .map(|result| result.into_iter().filter_map(|f| f).collect::>()) + .map(|result| { + result.into_iter().filter_map(|f| f).collect::>() + }) .map(|fields| { if fields.is_empty() { None @@ -286,7 +292,8 @@ impl ParquetTypeConverter { item_converter.to_primitive_type_inner().map(|dt| Some(dt)) } else { Err(ArrowError( - "Primitive element type of list must be repeated.".to_string(), + "Primitive element type of list must be repeated." + .to_string(), )) } } @@ -301,7 +308,8 @@ impl ParquetTypeConverter { && list_item.name() != format!("{}_tuple", self.schema.name()) { let nested_item = fields.first().unwrap(); - let nested_item_converter = self.clone_with_schema(nested_item.clone()); + let nested_item_converter = + self.clone_with_schema(nested_item.clone()); nested_item_converter.to_data_type() } else { @@ -346,7 +354,8 @@ mod tests { let parquet_group_type = parse_message_type(message_type).unwrap(); let parquet_schema = SchemaDescriptor::new(Rc::new(parquet_group_type)); - let converted_arrow_schema = parquet_to_arrow_schema(Rc::new(parquet_schema)).unwrap(); + let converted_arrow_schema = + parquet_to_arrow_schema(Rc::new(parquet_schema)).unwrap(); let arrow_fields = vec![ Field::new("boolean", DataType::Boolean, false), @@ -374,7 +383,8 @@ mod tests { let parquet_group_type = parse_message_type(message_type).unwrap(); let parquet_schema = Rc::new(SchemaDescriptor::new(Rc::new(parquet_group_type))); - let converted_arrow_schema = parquet_to_arrow_schema(parquet_schema.clone()).unwrap(); + let converted_arrow_schema = + parquet_to_arrow_schema(parquet_schema.clone()).unwrap(); let arrow_fields = vec![ Field::new("boolean", DataType::Boolean, false), @@ -382,9 +392,11 @@ mod tests { ]; assert_eq!(&arrow_fields, converted_arrow_schema.fields()); - let converted_arrow_schema = - parquet_to_arrow_schema_by_columns(parquet_schema.clone(), vec![0usize, 1usize]) - .unwrap(); + let converted_arrow_schema = parquet_to_arrow_schema_by_columns( + parquet_schema.clone(), + vec![0usize, 1usize], + ) + .unwrap(); assert_eq!(&arrow_fields, converted_arrow_schema.fields()); } @@ -545,7 +557,8 @@ mod tests { // } // Special case: group is named array { - let arrow_struct = DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]); + let arrow_struct = + DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]); arrow_fields.push(Field::new( "my_list", DataType::List(Box::new(arrow_struct)), @@ -561,7 +574,8 @@ mod tests { // } // Special case: group named ends in _tuple { - let arrow_struct = DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]); + let arrow_struct = + DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]); arrow_fields.push(Field::new( "my_list", DataType::List(Box::new(arrow_struct)), @@ -582,7 +596,8 @@ mod tests { let parquet_group_type = parse_message_type(message_type).unwrap(); let parquet_schema = Rc::new(SchemaDescriptor::new(Rc::new(parquet_group_type))); - let converted_arrow_schema = parquet_to_arrow_schema(parquet_schema.clone()).unwrap(); + let converted_arrow_schema = + parquet_to_arrow_schema(parquet_schema.clone()).unwrap(); let converted_fields = converted_arrow_schema.fields(); assert_eq!(arrow_fields.len(), converted_fields.len()); @@ -599,7 +614,8 @@ mod tests { Field::new("leaf1", DataType::Boolean, false), Field::new("leaf2", DataType::Int32, false), ]; - let group1_struct = 
Field::new("group1", DataType::Struct(group1_fields), false); + let group1_struct = + Field::new("group1", DataType::Struct(group1_fields), false); arrow_fields.push(group1_struct); let leaf3_field = Field::new("leaf3", DataType::Int64, false); @@ -618,7 +634,8 @@ mod tests { let parquet_group_type = parse_message_type(message_type).unwrap(); let parquet_schema = Rc::new(SchemaDescriptor::new(Rc::new(parquet_group_type))); - let converted_arrow_schema = parquet_to_arrow_schema(parquet_schema.clone()).unwrap(); + let converted_arrow_schema = + parquet_to_arrow_schema(parquet_schema.clone()).unwrap(); let converted_fields = converted_arrow_schema.fields(); assert_eq!(arrow_fields.len(), converted_fields.len()); @@ -668,7 +685,8 @@ mod tests { let parquet_schema = Rc::new(SchemaDescriptor::new(Rc::new(parquet_group_type))); let converted_arrow_schema = - parquet_to_arrow_schema_by_columns(parquet_schema.clone(), vec![0, 3, 4]).unwrap(); + parquet_to_arrow_schema_by_columns(parquet_schema.clone(), vec![0, 3, 4]) + .unwrap(); let converted_fields = converted_arrow_schema.fields(); assert_eq!(arrow_fields.len(), converted_fields.len()); @@ -718,7 +736,8 @@ mod tests { let parquet_schema = Rc::new(SchemaDescriptor::new(Rc::new(parquet_group_type))); let converted_arrow_schema = - parquet_to_arrow_schema_by_columns(parquet_schema.clone(), vec![3, 4, 0]).unwrap(); + parquet_to_arrow_schema_by_columns(parquet_schema.clone(), vec![3, 4, 0]) + .unwrap(); let converted_fields = converted_arrow_schema.fields(); assert_eq!(arrow_fields.len(), converted_fields.len()); @@ -768,7 +787,8 @@ mod tests { let parquet_group_type = parse_message_type(message_type).unwrap(); let parquet_schema = Rc::new(SchemaDescriptor::new(Rc::new(parquet_group_type))); - let converted_arrow_schema = parquet_to_arrow_schema(parquet_schema.clone()).unwrap(); + let converted_arrow_schema = + parquet_to_arrow_schema(parquet_schema.clone()).unwrap(); let converted_fields = converted_arrow_schema.fields(); assert_eq!(arrow_fields.len(), converted_fields.len()); diff --git a/rust/parquet/src/record/api.rs b/rust/parquet/src/record/api.rs index d0be43ad730ed..87f88b376478b 100644 --- a/rust/parquet/src/record/api.rs +++ b/rust/parquet/src/record/api.rs @@ -546,7 +546,8 @@ impl Field { match descr.physical_type() { PhysicalType::BYTE_ARRAY => match descr.logical_type() { LogicalType::UTF8 | LogicalType::ENUM | LogicalType::JSON => { - let value = unsafe { String::from_utf8_unchecked(value.data().to_vec()) }; + let value = + unsafe { String::from_utf8_unchecked(value.data().to_vec()) }; Field::Str(value) } LogicalType::BSON | LogicalType::NONE => Field::Bytes(value), @@ -598,11 +599,15 @@ impl fmt::Display for Field { write!(f, "{:?}", value) } } - Field::Decimal(ref value) => write!(f, "{}", convert_decimal_to_string(value)), + Field::Decimal(ref value) => { + write!(f, "{}", convert_decimal_to_string(value)) + } Field::Str(ref value) => write!(f, "\"{}\"", value), Field::Bytes(ref value) => write!(f, "{:?}", value.data()), Field::Date(value) => write!(f, "{}", convert_date_to_string(value)), - Field::Timestamp(value) => write!(f, "{}", convert_timestamp_to_string(value)), + Field::Timestamp(value) => { + write!(f, "{}", convert_timestamp_to_string(value)) + } Field::Group(ref fields) => write!(f, "{}", fields), Field::ListInternal(ref list) => { let elems = &list.elements; @@ -675,7 +680,8 @@ fn convert_decimal_to_string(decimal: &Decimal) -> String { } num_str.insert_str(negative as usize, "0."); } else { - // No zeroes need to be 
prepended to the unscaled value, simply insert decimal point. + // No zeroes need to be prepended to the unscaled value; simply insert the + // decimal point. num_str.insert((point + negative) as usize, '.'); } @@ -770,7 +776,8 @@ mod tests { let row = Field::convert_int32(&descr, 14611); assert_eq!(row, Field::Date(14611)); - let descr = make_column_descr![PhysicalType::INT32, LogicalType::DECIMAL, 0, 8, 2]; + let descr = + make_column_descr![PhysicalType::INT32, LogicalType::DECIMAL, 0, 8, 2]; let row = Field::convert_int32(&descr, 444); assert_eq!(row, Field::Decimal(Decimal::from_i32(444, 8, 2))); } @@ -785,7 +792,8 @@ mod tests { let row = Field::convert_int64(&descr, 78239823); assert_eq!(row, Field::ULong(78239823)); - let descr = make_column_descr![PhysicalType::INT64, LogicalType::TIMESTAMP_MILLIS]; + let descr = + make_column_descr![PhysicalType::INT64, LogicalType::TIMESTAMP_MILLIS]; let row = Field::convert_int64(&descr, 1541186529153); assert_eq!(row, Field::Timestamp(1541186529153)); @@ -793,7 +801,8 @@ mod tests { let row = Field::convert_int64(&descr, 2222); assert_eq!(row, Field::Long(2222)); - let descr = make_column_descr![PhysicalType::INT64, LogicalType::DECIMAL, 0, 8, 2]; + let descr = + make_column_descr![PhysicalType::INT64, LogicalType::DECIMAL, 0, 8, 2]; let row = Field::convert_int64(&descr, 3333); assert_eq!(row, Field::Decimal(Decimal::from_i64(3333, 8, 2))); } @@ -871,7 +880,8 @@ mod tests { assert_eq!(row, Field::Bytes(value)); // DECIMAL - let descr = make_column_descr![PhysicalType::BYTE_ARRAY, LogicalType::DECIMAL, 0, 8, 2]; + let descr = + make_column_descr![PhysicalType::BYTE_ARRAY, LogicalType::DECIMAL, 0, 8, 2]; let value = ByteArray::from(vec![207, 200]); let row = Field::convert_byte_array(&descr, value.clone()); assert_eq!(row, Field::Decimal(Decimal::from_bytes(value, 8, 2))); diff --git a/rust/parquet/src/record/reader.rs b/rust/parquet/src/record/reader.rs index e1d3c964eca3a..a5dbcb1cb4e14 100644 --- a/rust/parquet/src/record/reader.rs +++ b/rust/parquet/src/record/reader.rs @@ -55,7 +55,11 @@ impl TreeBuilder { } /// Creates new root reader for provided schema and row group. - pub fn build(&self, descr: SchemaDescPtr, row_group_reader: &RowGroupReader) -> Reader { + pub fn build( + &self, + descr: SchemaDescPtr, + row_group_reader: &RowGroupReader, + ) -> Reader { // Prepare lookup table of column path -> original column index // This allows to prune columns and map schema leaf nodes to the column readers let mut paths: HashMap<ColumnPath, usize> = HashMap::new(); @@ -72,7 +76,14 @@ impl TreeBuilder { let mut path = Vec::new(); for field in descr.root_schema().get_fields() { - let reader = self.reader_tree(field.clone(), &mut path, 0, 0, &paths, row_group_reader); + let reader = self.reader_tree( + field.clone(), + &mut path, + 0, + 0, + &paths, + row_group_reader, + ); readers.push(reader); } @@ -82,7 +93,11 @@ impl TreeBuilder { } /// Creates iterator of `Row`s directly from schema descriptor and row group.
- pub fn as_iter(&self, descr: SchemaDescPtr, row_group_reader: &RowGroupReader) -> ReaderIter { + pub fn as_iter( + &self, + descr: SchemaDescPtr, + row_group_reader: &RowGroupReader, + ) -> ReaderIter { let num_records = row_group_reader.metadata().num_rows() as usize; ReaderIter::new(self.build(descr, row_group_reader), num_records) } @@ -126,7 +141,12 @@ impl TreeBuilder { match field.get_basic_info().logical_type() { // List types LogicalType::LIST => { - assert_eq!(field.get_fields().len(), 1, "Invalid list type {:?}", field); + assert_eq!( + field.get_fields().len(), + 1, + "Invalid list type {:?}", + field + ); let repeated_field = field.get_fields()[0].clone(); assert_eq!( @@ -179,7 +199,12 @@ impl TreeBuilder { } // Map types (key-value pairs) LogicalType::MAP | LogicalType::MAP_KEY_VALUE => { - assert_eq!(field.get_fields().len(), 1, "Invalid map type: {:?}", field); + assert_eq!( + field.get_fields().len(), + 1, + "Invalid map type: {:?}", + field + ); assert!( !field.get_fields()[0].is_primitive(), "Invalid map type: {:?}", @@ -237,9 +262,10 @@ impl TreeBuilder { Box::new(value_reader), ) } - // A repeated field that is neither contained by a `LIST`- or `MAP`-annotated - // group nor annotated by `LIST` or `MAP` should be interpreted as a required - // list of required elements where the element type is the type of the field. + // A repeated field that is neither contained by a `LIST`- or + // `MAP`-annotated group nor annotated by `LIST` or `MAP` + // should be interpreted as a required list of required + // elements where the element type is the type of the field. _ if repetition == Repetition::REPEATED => { let required_field = Type::group_type_builder(field.name()) .with_repetition(Repetition::REQUIRED) @@ -302,8 +328,8 @@ pub enum Reader { // Reader for repeated values, e.g. lists, contains type information, definition // level, repetition level and a child reader RepeatedReader(TypePtr, i16, i16, Box<Reader>), - // Reader of key-value pairs, e.g. maps, contains type information, definition level, - // repetition level, child reader for keys and child reader for values + // Reader of key-value pairs, e.g. maps, contains type information, definition + // level, repetition level, child reader for keys and child reader for values KeyValueReader(TypePtr, i16, i16, Box<Reader>, Box<Reader>), } @@ -406,7 +432,10 @@ impl Reader { if reader.repetition() != Repetition::OPTIONAL || reader.current_def_level() > def_level { - fields.push((String::from(reader.field_name()), reader.read_field())); + fields.push(( + String::from(reader.field_name()), + reader.read_field(), + )); } else { reader.advance_columns(); fields.push((String::from(reader.field_name()), Field::Null)); @@ -422,21 +451,29 @@ impl Reader { elements.push(reader.read_field()); } else { reader.advance_columns(); - // If the current definition level is equal to the definition level of this - // repeated type, then the result is an empty list and the repetition level + // If the current definition level is equal to the definition + // level of this repeated type, then the + // result is an empty list and the repetition level // will always be <= rl. break; } - // This covers case when we are out of repetition levels and should close the - // group, or there are no values left to buffer. + // This covers the case when we are out of repetition levels and should + // close the group, or there are no values left to + // buffer.
if !reader.has_next() || reader.current_rep_level() <= rep_level { break; } } Field::ListInternal(make_list(elements)) } - Reader::KeyValueReader(_, def_level, rep_level, ref mut keys, ref mut values) => { + Reader::KeyValueReader( + _, + def_level, + rep_level, + ref mut keys, + ref mut values, + ) => { let mut pairs = Vec::new(); loop { if keys.current_def_level() > def_level { @@ -444,14 +481,16 @@ impl Reader { } else { keys.advance_columns(); values.advance_columns(); - // If the current definition level is equal to the definition level of this - // repeated type, then the result is an empty list and the repetition level + // If the current definition level is equal to the definition + // level of this repeated type, then the + // result is an empty list and the repetition level // will always be <= rl. break; } - // This covers case when we are out of repetition levels and should close the - // group, or there are no values left to buffer. + // This covers the case when we are out of repetition levels and should + // close the group, or there are no values left to + // buffer. if !keys.has_next() || keys.current_rep_level() <= rep_level { break; } @@ -588,8 +627,10 @@ pub struct RowIter<'a> { impl<'a> RowIter<'a> { /// Creates iterator of [`Row`](`::record::api::Row`)s for all row groups in a file. pub fn from_file(proj: Option<Type>, reader: &'a FileReader) -> Result<Self> { - let descr = - Self::get_proj_descr(proj, reader.metadata().file_metadata().schema_descr_ptr())?; + let descr = Self::get_proj_descr( + proj, + reader.metadata().file_metadata().schema_descr_ptr(), + )?; let num_row_groups = reader.num_row_groups(); Ok(Self { @@ -603,13 +644,16 @@ impl<'a> RowIter<'a> { } /// Creates iterator of [`Row`](`::record::api::Row`)s for a specific row group. - pub fn from_row_group(proj: Option<Type>, reader: &'a RowGroupReader) -> Result<Self> { + pub fn from_row_group( + proj: Option<Type>, + reader: &'a RowGroupReader, + ) -> Result<Self> { let descr = Self::get_proj_descr(proj, reader.metadata().schema_descr_ptr())?; let tree_builder = Self::tree_builder(); let row_iter = tree_builder.as_iter(descr.clone(), reader); - // For row group we need to set `current_row_group` >= `num_row_groups`, because we - // only have one row group and can't buffer more. + // For a row group we need to set `current_row_group` >= `num_row_groups`, + // because we only have one row group and can't buffer more. Ok(Self { descr, tree_builder, @@ -630,7 +674,10 @@ impl<'a> RowIter<'a> { /// Helper method to get schema descriptor for projected schema. /// If projection is None, then full schema is returned.
#[inline] - fn get_proj_descr(proj: Option, root_descr: SchemaDescPtr) -> Result { + fn get_proj_descr( + proj: Option, + root_descr: SchemaDescPtr, + ) -> Result { match proj { Some(projection) => { // check if projection is part of file schema @@ -905,11 +952,17 @@ mod tests { list![ group![ ("E".to_string(), Field::Int(10)), - ("F".to_string(), Field::Str("aaa".to_string())) + ( + "F".to_string(), + Field::Str("aaa".to_string()) + ) ], group![ ("E".to_string(), Field::Int(-10)), - ("F".to_string(), Field::Str("bbb".to_string())) + ( + "F".to_string(), + Field::Str("bbb".to_string()) + ) ] ], list![group![ @@ -989,7 +1042,10 @@ mod tests { ], group![ ("E".to_string(), Field::Int(10)), - ("F".to_string(), Field::Str("aaa".to_string())) + ( + "F".to_string(), + Field::Str("aaa".to_string()) + ) ], group![ ("E".to_string(), Field::Null), @@ -997,7 +1053,10 @@ mod tests { ], group![ ("E".to_string(), Field::Int(-10)), - ("F".to_string(), Field::Str("bbb".to_string())) + ( + "F".to_string(), + Field::Str("bbb".to_string()) + ) ], group![ ("E".to_string(), Field::Null), @@ -1007,7 +1066,10 @@ mod tests { list![ group![ ("E".to_string(), Field::Int(11)), - ("F".to_string(), Field::Str("c".to_string())) + ( + "F".to_string(), + Field::Str("c".to_string()) + ) ], Field::Null ], @@ -1031,7 +1093,10 @@ mod tests { ), ( Field::Str("g2".to_string()), - group![("H".to_string(), group![("i".to_string(), list![])])] + group![( + "H".to_string(), + group![("i".to_string(), list![])] + )] ), (Field::Str("g3".to_string()), Field::Null), ( @@ -1165,7 +1230,8 @@ mod tests { } "; let schema = parse_message_type(&schema).unwrap(); - let rows = test_file_reader_rows("nested_maps.snappy.parquet", Some(schema)).unwrap(); + let rows = + test_file_reader_rows("nested_maps.snappy.parquet", Some(schema)).unwrap(); let expected_rows = vec![ row![ ("c".to_string(), Field::Double(1.0)), @@ -1213,7 +1279,8 @@ mod tests { } "; let schema = parse_message_type(&schema).unwrap(); - let rows = test_file_reader_rows("nested_maps.snappy.parquet", Some(schema)).unwrap(); + let rows = + test_file_reader_rows("nested_maps.snappy.parquet", Some(schema)).unwrap(); let expected_rows = vec![ row![( "a".to_string(), @@ -1279,7 +1346,8 @@ mod tests { } "; let schema = parse_message_type(&schema).unwrap(); - let rows = test_file_reader_rows("nested_lists.snappy.parquet", Some(schema)).unwrap(); + let rows = + test_file_reader_rows("nested_lists.snappy.parquet", Some(schema)).unwrap(); let expected_rows = vec![ row![( "a".to_string(), @@ -1456,7 +1524,8 @@ mod tests { fn test_row_group_rows(file_name: &str, schema: Option) -> Result> { let file = get_test_file(file_name); let file_reader: Box = Box::new(SerializedFileReader::new(file)?); - // Check the first row group only, because files will contain only single row group + // Check the first row group only, because files will contain only single row + // group let row_group_reader = file_reader.get_row_group(0).unwrap(); let iter = row_group_reader.get_row_iter(schema)?; Ok(iter.collect()) diff --git a/rust/parquet/src/record/triplet.rs b/rust/parquet/src/record/triplet.rs index 6ec7799ccb03c..9915b18f61704 100644 --- a/rust/parquet/src/record/triplet.rs +++ b/rust/parquet/src/record/triplet.rs @@ -57,30 +57,32 @@ impl TripletIter { /// Creates new triplet for column reader pub fn new(descr: ColumnDescPtr, reader: ColumnReader, batch_size: usize) -> Self { match descr.physical_type() { - PhysicalType::BOOLEAN => { - TripletIter::BoolTripletIter(TypedTripletIter::new(descr, batch_size, 
reader)) - } - PhysicalType::INT32 => { - TripletIter::Int32TripletIter(TypedTripletIter::new(descr, batch_size, reader)) - } - PhysicalType::INT64 => { - TripletIter::Int64TripletIter(TypedTripletIter::new(descr, batch_size, reader)) - } - PhysicalType::INT96 => { - TripletIter::Int96TripletIter(TypedTripletIter::new(descr, batch_size, reader)) - } - PhysicalType::FLOAT => { - TripletIter::FloatTripletIter(TypedTripletIter::new(descr, batch_size, reader)) - } - PhysicalType::DOUBLE => { - TripletIter::DoubleTripletIter(TypedTripletIter::new(descr, batch_size, reader)) - } - PhysicalType::BYTE_ARRAY => { - TripletIter::ByteArrayTripletIter(TypedTripletIter::new(descr, batch_size, reader)) - } - PhysicalType::FIXED_LEN_BYTE_ARRAY => TripletIter::FixedLenByteArrayTripletIter( + PhysicalType::BOOLEAN => TripletIter::BoolTripletIter(TypedTripletIter::new( + descr, batch_size, reader, + )), + PhysicalType::INT32 => TripletIter::Int32TripletIter(TypedTripletIter::new( + descr, batch_size, reader, + )), + PhysicalType::INT64 => TripletIter::Int64TripletIter(TypedTripletIter::new( + descr, batch_size, reader, + )), + PhysicalType::INT96 => TripletIter::Int96TripletIter(TypedTripletIter::new( + descr, batch_size, reader, + )), + PhysicalType::FLOAT => TripletIter::FloatTripletIter(TypedTripletIter::new( + descr, batch_size, reader, + )), + PhysicalType::DOUBLE => TripletIter::DoubleTripletIter( + TypedTripletIter::new(descr, batch_size, reader), + ), + PhysicalType::BYTE_ARRAY => TripletIter::ByteArrayTripletIter( + TypedTripletIter::new(descr, batch_size, reader), + ), + PhysicalType::FIXED_LEN_BYTE_ARRAY => { + TripletIter::FixedLenByteArrayTripletIter(TypedTripletIter::new( + descr, batch_size, reader, + )) + } } } @@ -154,11 +156,15 @@ impl TripletIter { TripletIter::DoubleTripletIter(ref typed) => { Field::convert_double(typed.column_descr(), *typed.current_value()) } - TripletIter::ByteArrayTripletIter(ref typed) => { - Field::convert_byte_array(typed.column_descr(), typed.current_value().clone()) - } + TripletIter::ByteArrayTripletIter(ref typed) => Field::convert_byte_array( + typed.column_descr(), + typed.current_value().clone(), + ), TripletIter::FixedLenByteArrayTripletIter(ref typed) => { - Field::convert_byte_array(typed.column_descr(), typed.current_value().clone()) + Field::convert_byte_array( + typed.column_descr(), + typed.current_value().clone(), + ) } } } @@ -302,8 +308,12 @@ impl TypedTripletIter { }; // Buffer triplets - self.reader - .read_batch(self.batch_size, def_levels, rep_levels, &mut self.values)? + self.reader.read_batch( + self.batch_size, + def_levels, + rep_levels, + &mut self.values, + )? }; // No more values or levels to read @@ -320,9 +330,10 @@ impl TypedTripletIter { self.triplets_left = values_read; } else if values_read < levels_read { // Add spacing for triplets. - // The idea is setting values for positions in def_levels when current definition - // level equals to maximum definition level. Values and levels are guaranteed to - // line up, because of the column reader method. + // The idea is setting values for positions in def_levels when current + // definition level equals the maximum definition level. + // Values and levels are guaranteed to line up, because of + // the column reader method.
// Note: if values_read == 0, then spacing will not be triggered let mut idx = values_read; @@ -360,7 +371,8 @@ mod tests { #[test] #[should_panic(expected = "Expected positive batch size, found: 0")] fn test_triplet_zero_batch_size() { - let column_path = ColumnPath::from(vec!["b_struct".to_string(), "b_c_int".to_string()]); + let column_path = + ColumnPath::from(vec!["b_struct".to_string(), "b_c_int".to_string()]); test_column_in_file( "nulls.snappy.parquet", 0, diff --git a/rust/parquet/src/schema/parser.rs b/rust/parquet/src/schema/parser.rs index 955c6c9830223..c8c2a02e82810 100644 --- a/rust/parquet/src/schema/parser.rs +++ b/rust/parquet/src/schema/parser.rs @@ -150,7 +150,11 @@ fn assert_token(token: Option<&str>, expected: &str) -> Result<()> { } // Utility function to parse i32 or return general error. -fn parse_i32(value: Option<&str>, not_found_msg: &str, parse_fail_msg: &str) -> Result { +fn parse_i32( + value: Option<&str>, + not_found_msg: &str, + parse_fail_msg: &str, +) -> Result { value .ok_or(general_err!(not_found_msg)) .and_then(|v| v.parse::().map_err(|_| general_err!(parse_fail_msg))) @@ -200,7 +204,9 @@ impl<'a> Parser<'a> { .and_then(|v| v.to_uppercase().parse::())?; match self.tokenizer.next() { - Some(group) if group.to_uppercase() == "GROUP" => self.add_group_type(Some(repetition)), + Some(group) if group.to_uppercase() == "GROUP" => { + self.add_group_type(Some(repetition)) + } Some(type_string) => { let physical_type = type_string.to_uppercase().parse::()?; self.add_primitive_type(repetition, physical_type) @@ -424,11 +430,12 @@ mod tests { assert_eq!( res, vec![ - "message", "schema", "{", "required", "int32", "a", ";", "optional", "binary", "c", - "(", "UTF8", ")", ";", "required", "group", "d", "{", "required", "int32", "a", - ";", "optional", "binary", "c", "(", "UTF8", ")", ";", "}", "required", "group", - "e", "(", "LIST", ")", "{", "repeated", "group", "list", "{", "required", "int32", - "element", ";", "}", "}", "}" + "message", "schema", "{", "required", "int32", "a", ";", "optional", + "binary", "c", "(", "UTF8", ")", ";", "required", "group", "d", "{", + "required", "int32", "a", ";", "optional", "binary", "c", "(", "UTF8", + ")", ";", "}", "required", "group", "e", "(", "LIST", ")", "{", + "repeated", "group", "list", "{", "required", "int32", "element", ";", + "}", "}", "}" ] ); } @@ -586,22 +593,28 @@ mod tests { let expected = Type::group_type_builder("root") .with_fields(&mut vec![ Rc::new( - Type::primitive_type_builder("f1", PhysicalType::FIXED_LEN_BYTE_ARRAY) - .with_logical_type(LogicalType::DECIMAL) - .with_length(5) - .with_precision(9) - .with_scale(3) - .build() - .unwrap(), + Type::primitive_type_builder( + "f1", + PhysicalType::FIXED_LEN_BYTE_ARRAY, + ) + .with_logical_type(LogicalType::DECIMAL) + .with_length(5) + .with_precision(9) + .with_scale(3) + .build() + .unwrap(), ), Rc::new( - Type::primitive_type_builder("f2", PhysicalType::FIXED_LEN_BYTE_ARRAY) - .with_logical_type(LogicalType::DECIMAL) - .with_length(16) - .with_precision(38) - .with_scale(18) - .build() - .unwrap(), + Type::primitive_type_builder( + "f2", + PhysicalType::FIXED_LEN_BYTE_ARRAY, + ) + .with_logical_type(LogicalType::DECIMAL) + .with_length(16) + .with_precision(38) + .with_scale(18) + .build() + .unwrap(), ), ]) .build() @@ -645,11 +658,14 @@ mod tests { .with_repetition(Repetition::OPTIONAL) .with_logical_type(LogicalType::LIST) .with_fields(&mut vec![Rc::new( - Type::primitive_type_builder("a2", PhysicalType::BYTE_ARRAY) - 
.with_repetition(Repetition::REPEATED) - .with_logical_type(LogicalType::UTF8) - .build() - .unwrap(), + Type::primitive_type_builder( + "a2", + PhysicalType::BYTE_ARRAY, + ) + .with_repetition(Repetition::REPEATED) + .with_logical_type(LogicalType::UTF8) + .build() + .unwrap(), )]) .build() .unwrap(), diff --git a/rust/parquet/src/schema/printer.rs b/rust/parquet/src/schema/printer.rs index 87c3683d9237d..85ef1cc86d5fc 100644 --- a/rust/parquet/src/schema/printer.rs +++ b/rust/parquet/src/schema/printer.rs @@ -46,7 +46,9 @@ use std::{fmt, io}; use crate::basic::{LogicalType, Type as PhysicalType}; -use crate::file::metadata::{ColumnChunkMetaData, FileMetaData, ParquetMetaData, RowGroupMetaData}; +use crate::file::metadata::{ + ColumnChunkMetaData, FileMetaData, ParquetMetaData, RowGroupMetaData, +}; use crate::schema::types::Type; /// Prints Parquet metadata [`ParquetMetaData`](`::file::metadata::ParquetMetaData`) @@ -204,8 +206,9 @@ impl<'a> Printer<'a> { let logical_type_str = match basic_info.logical_type() { LogicalType::NONE => format!(""), decimal @ LogicalType::DECIMAL => { - // For decimal type we should print precision and scale if they are > 0, e.g. - // DECIMAL(9, 2) - DECIMAL(9) - DECIMAL + // For decimal type we should print precision and scale if they + // are > 0, e.g. DECIMAL(9, 2) - + // DECIMAL(9) - DECIMAL let precision_scale = match (precision, scale) { (p, s) if p > 0 && s > 0 => format!(" ({}, {})", p, s), (p, 0) if p > 0 => format!(" ({})", p), @@ -314,12 +317,13 @@ mod tests { .with_logical_type(LogicalType::UTF8) .with_id(1) .build(); - let f3 = Type::primitive_type_builder("f3", PhysicalType::FIXED_LEN_BYTE_ARRAY) - .with_repetition(Repetition::REPEATED) - .with_logical_type(LogicalType::INTERVAL) - .with_length(12) - .with_id(2) - .build(); + let f3 = + Type::primitive_type_builder("f3", PhysicalType::FIXED_LEN_BYTE_ARRAY) + .with_repetition(Repetition::REPEATED) + .with_logical_type(LogicalType::INTERVAL) + .with_length(12) + .with_id(2) + .build(); let mut struct_fields = Vec::new(); struct_fields.push(Rc::new(f1.unwrap())); struct_fields.push(Rc::new(f2.unwrap())); diff --git a/rust/parquet/src/schema/types.rs b/rust/parquet/src/schema/types.rs index aa314d6100183..adf6f4e456aba 100644 --- a/rust/parquet/src/schema/types.rs +++ b/rust/parquet/src/schema/types.rs @@ -55,7 +55,10 @@ pub enum Type { impl Type { /// Creates primitive type builder with provided field name and physical type. - pub fn primitive_type_builder(name: &str, physical_type: PhysicalType) -> PrimitiveTypeBuilder { + pub fn primitive_type_builder( + name: &str, + physical_type: PhysicalType, + ) -> PrimitiveTypeBuilder { PrimitiveTypeBuilder::new(name, physical_type) } @@ -104,7 +107,8 @@ impl Type { /// This method can be used to check if projected columns are part of the root schema. 
pub fn check_contains(&self, sub_type: &Type) -> bool { // Names match, and repetitions match or not set for both - let basic_match = self.get_basic_info().name() == sub_type.get_basic_info().name() + let basic_match = self.get_basic_info().name() + == sub_type.get_basic_info().name() && (self.is_schema() && sub_type.is_schema() || !self.is_schema() && !sub_type.is_schema() @@ -314,8 +318,9 @@ impl<'a> PrimitiveTypeBuilder<'a> { } } PhysicalType::FIXED_LEN_BYTE_ARRAY => { - let max_precision = - (2f64.powi(8 * self.length - 1) - 1f64).log10().floor() as i32; + let max_precision = (2f64.powi(8 * self.length - 1) - 1f64) + .log10() + .floor() as i32; if self.precision > max_precision { return Err(general_err!( @@ -357,7 +362,9 @@ impl<'a> PrimitiveTypeBuilder<'a> { } } LogicalType::INTERVAL => { - if self.physical_type != PhysicalType::FIXED_LEN_BYTE_ARRAY || self.length != 12 { + if self.physical_type != PhysicalType::FIXED_LEN_BYTE_ARRAY + || self.length != 12 + { return Err(general_err!( "INTERVAL can only annotate FIXED_LEN_BYTE_ARRAY(12)" )); @@ -745,7 +752,8 @@ impl SchemaDescriptor { result.as_ref() } - /// Returns column root [`Type`](`::schema::types::Type`) pointer for a field position. + /// Returns column root [`Type`](`::schema::types::Type`) pointer for a field + /// position. pub fn get_column_root_ptr(&self, i: usize) -> TypePtr { let result = self.column_root_of(i); result.clone() @@ -858,7 +866,10 @@ pub fn from_thrift(elements: &[SchemaElement]) -> Result { /// The first result is the starting index for the next Type after this one. If it is /// equal to `elements.len()`, then this Type is the last one. /// The second result is the result Type. -fn from_thrift_helper(elements: &[SchemaElement], index: usize) -> Result<(usize, TypePtr)> { +fn from_thrift_helper( + elements: &[SchemaElement], + index: usize, +) -> Result<(usize, TypePtr)> { // Whether or not the current node is root (message type). // There is only one message type node in the schema tree. let is_root_node = index == 0; @@ -916,11 +927,11 @@ fn from_thrift_helper(elements: &[SchemaElement], index: usize) -> Result<(usize .with_logical_type(logical_type) .with_fields(&mut fields); if let Some(rep) = repetition { - // Sometimes parquet-cpp and parquet-mr set repetition level REQUIRED or REPEATED - // for root node. + // Sometimes parquet-cpp and parquet-mr set repetition level REQUIRED or + // REPEATED for root node. // - // We only set repetition for group types that are not top-level message type. - // According to parquet-format: + // We only set repetition for group types that are not top-level message + // type. According to parquet-format: // Root of the schema does not have a repetition_type. // All other types must have one. 
if !is_root_node { @@ -1350,7 +1361,8 @@ mod tests { .with_repetition(Repetition::REQUIRED) .with_logical_type(LogicalType::INT_64) .build()?; - let item2 = Type::primitive_type_builder("item2", PhysicalType::BOOLEAN).build()?; + let item2 = + Type::primitive_type_builder("item2", PhysicalType::BOOLEAN).build()?; let item3 = Type::primitive_type_builder("item3", PhysicalType::INT32) .with_repetition(Repetition::REPEATED) .with_logical_type(LogicalType::INT_32) diff --git a/rust/parquet/src/util/bit_packing.rs b/rust/parquet/src/util/bit_packing.rs index 851fb36ea5c98..99628672d549c 100644 --- a/rust/parquet/src/util/bit_packing.rs +++ b/rust/parquet/src/util/bit_packing.rs @@ -26,7 +26,11 @@ // https://github.com/tantivy-search/bitpacking // but the layout it uses for SIMD is different from Parquet. // TODO: support packing as well, which is used for encoding. -pub unsafe fn unpack32(mut in_ptr: *const u32, out_ptr: *mut u32, num_bits: usize) -> *const u32 { +pub unsafe fn unpack32( + mut in_ptr: *const u32, + out_ptr: *mut u32, + num_bits: usize, +) -> *const u32 { in_ptr = match num_bits { 0 => nullunpacker32(in_ptr, out_ptr), 1 => unpack1_32(in_ptr, out_ptr), diff --git a/rust/parquet/src/util/bit_util.rs b/rust/parquet/src/util/bit_util.rs index ae680ecca4735..5e7e7bd0feb2a 100644 --- a/rust/parquet/src/util/bit_util.rs +++ b/rust/parquet/src/util/bit_util.rs @@ -32,7 +32,11 @@ macro_rules! read_num_bytes { assert!($size <= $src.len()); let mut data: $ty = Default::default(); unsafe { - ::std::ptr::copy_nonoverlapping($src.as_ptr(), &mut data as *mut $ty as *mut u8, $size); + ::std::ptr::copy_nonoverlapping( + $src.as_ptr(), + &mut data as *mut $ty as *mut u8, + $size, + ); } data }}; @@ -50,7 +54,13 @@ pub fn convert_to_bytes(val: &T, num_bytes: usize) -> Vec { #[inline] pub fn memcpy(source: &[u8], target: &mut [u8]) { assert!(target.len() >= source.len()); - unsafe { ::std::ptr::copy_nonoverlapping(source.as_ptr(), target.as_mut_ptr(), source.len()) } + unsafe { + ::std::ptr::copy_nonoverlapping( + source.as_ptr(), + target.as_mut_ptr(), + source.len(), + ) + } } #[inline] @@ -255,8 +265,8 @@ impl BitWriter { } /// Returns the internal buffer length. This is the maximum number of bytes that this - /// writer can write. User needs to call `consume` to consume the current buffer before - /// more data can be written. + /// writer can write. User needs to call `consume` to consume the current buffer + /// before more data can be written. #[inline] pub fn buffer_len(&self) -> usize { self.max_bytes @@ -271,7 +281,8 @@ impl BitWriter { assert!(num_bits <= 64); assert_eq!(v.checked_shr(num_bits as u32).unwrap_or(0), 0); // covers case v >> 64 - if self.byte_offset * 8 + self.bit_offset + num_bits > self.max_bytes as usize * 8 { + if self.byte_offset * 8 + self.bit_offset + num_bits > self.max_bytes as usize * 8 + { return false; } @@ -286,8 +297,8 @@ impl BitWriter { self.byte_offset += 8; self.bit_offset -= 64; self.buffered_values = 0; - // Perform checked right shift: v >> offset, where offset < 64, otherwise we shift - // all bits + // Perform checked right shift: v >> offset, where offset < 64, otherwise we + // shift all bits self.buffered_values = v .checked_shr((num_bits - self.bit_offset) as u32) .unwrap_or(0); @@ -321,7 +332,12 @@ impl BitWriter { /// Returns false if there's not enough room left, or the `pos` is not valid. /// True otherwise. 
#[inline] - pub fn put_aligned_offset<T: AsBytes>(&mut self, val: T, num_bytes: usize, offset: usize) -> bool { + pub fn put_aligned_offset<T: AsBytes>( + &mut self, + val: T, + num_bytes: usize, + offset: usize, + ) -> bool { if num_bytes + offset > self.max_bytes { return false; } @@ -432,8 +448,8 @@ impl BitReader { return None; } - let mut v = - trailing_bits(self.buffered_values, self.bit_offset + num_bits) >> self.bit_offset; + let mut v = trailing_bits(self.buffered_values, self.bit_offset + num_bits) + >> self.bit_offset; self.bit_offset += num_bits; if self.bit_offset >= 64 { @@ -491,8 +507,8 @@ impl BitReader { in_ptr = unpack32(in_ptr, out_ptr, num_bits); self.byte_offset += 4 * num_bits; for n in 0..32 { - // We need to copy from smaller size to bigger size to avoid overwritting - // other memory regions. + // We need to copy from smaller size to bigger size to avoid + // overwriting other memory regions. if size_of::<T>() > size_of::<u32>() { ::std::ptr::copy_nonoverlapping( out_buf[n..].as_ptr() as *const u32, diff --git a/rust/parquet/src/util/hash_util.rs b/rust/parquet/src/util/hash_util.rs index b4685fbd004da..b9441f819a46d 100644 --- a/rust/parquet/src/util/hash_util.rs +++ b/rust/parquet/src/util/hash_util.rs @@ -39,7 +39,10 @@ fn murmur_hash2_64a(data: &T, seed: u64) -> u64 { let len = data_bytes.len(); let len_64 = (len / 8) * 8; let data_bytes_64 = unsafe { - ::std::slice::from_raw_parts(&data_bytes[0..len_64] as *const [u8] as *const u64, len / 8) + ::std::slice::from_raw_parts( + &data_bytes[0..len_64] as *const [u8] as *const u64, + len / 8, + ) }; let mut h = seed ^ (MURMUR_PRIME.wrapping_mul(data_bytes.len() as u64)); diff --git a/rust/parquet/src/util/io.rs b/rust/parquet/src/util/io.rs index d667c8e817a91..177cfb9724671 100644 --- a/rust/parquet/src/util/io.rs +++ b/rust/parquet/src/util/io.rs @@ -212,7 +212,8 @@ mod tests { // Read data using file chunk let mut res = vec![0u8; 7]; - let mut chunk = FileSource::new(&file, 0, file.metadata().unwrap().len() as usize); + let mut chunk = + FileSource::new(&file, 0, file.metadata().unwrap().len() as usize); chunk.read(&mut res[..]).unwrap(); assert_eq!(res, vec![b'a', b'b', b'c', b'd', b'e', b'f', b'g']); diff --git a/rust/rustfmt.toml b/rust/rustfmt.toml index 72eeee0af1c53..b692119bbc123 100644 --- a/rust/rustfmt.toml +++ b/rust/rustfmt.toml @@ -15,4 +15,7 @@ # specific language governing permissions and limitations # under the License. -format_doc_comments = true \ No newline at end of file +max_width = 90 +wrap_comments = true +format_doc_comments = true +comment_width = 90 \ No newline at end of file
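
A note on the decimal formatting logic touched in rust/parquet/src/record/api.rs above: convert_decimal_to_string prints the unscaled integer and places the decimal point `scale` digits from the right, padding with zeroes when the point falls at or before the first digit. The following standalone sketch reproduces that rule for an i64 unscaled value; the function name and i64 signature are illustrative, not the crate's API (the crate works on its byte-backed Decimal type).

// Illustrative sketch of the rule in convert_decimal_to_string; the i64
// signature is hypothetical, chosen to keep the example self-contained.
fn decimal_to_string(unscaled: i64, scale: usize) -> String {
    if scale == 0 {
        return unscaled.to_string();
    }
    let negative = unscaled < 0;
    let mut digits = unscaled.abs().to_string();
    if digits.len() <= scale {
        // The point falls before the first digit: pad with zeroes,
        // e.g. 45 with scale 3 becomes "0.045".
        let pad = "0".repeat(scale - digits.len());
        digits = format!("0.{}{}", pad, digits);
    } else {
        // No zeroes need to be prepended; simply insert the decimal point,
        // e.g. 444 with scale 2 becomes "4.44".
        let point = digits.len() - scale;
        digits.insert(point, '.');
    }
    if negative {
        digits.insert(0, '-');
    }
    digits
}

fn main() {
    assert_eq!(decimal_to_string(444, 2), "4.44");
    assert_eq!(decimal_to_string(45, 3), "0.045");
    assert_eq!(decimal_to_string(-12345, 2), "-123.45");
}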
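The comments rewrapped in rust/parquet/src/record/reader.rs encode the two rules that drive Reader::read_field for repeated fields: a definition level equal to the repeated type's own level yields an empty list, and a repetition level at or below the field's level closes the current group. A minimal standalone sketch of those rules, assuming a single repeated int32 field with maximum definition and repetition level 1 (all names here are illustrative, not the crate's API):

// Assemble rows of a repeated int32 field from
// (value, definition level, repetition level) triplets.
fn assemble_rows(triplets: &[(Option<i32>, i16, i16)]) -> Vec<Vec<i32>> {
    let max_def_level = 1i16;
    let mut rows: Vec<Vec<i32>> = Vec::new();
    for &(value, def, rep) in triplets {
        if rep == 0 {
            // Repetition level 0 starts a new record (a new top-level list).
            rows.push(Vec::new());
        }
        if def == max_def_level {
            // Fully defined: a concrete element joins the current list.
            rows.last_mut().unwrap().push(value.unwrap());
        }
        // def below max_def_level leaves the list empty, matching the
        // "result is an empty list" comment in Reader::read_field.
    }
    rows
}

fn main() {
    // Three records: [1, 2], [], [3].
    let triplets = [
        (Some(1), 1, 0),
        (Some(2), 1, 1),
        (None, 0, 0),
        (Some(3), 1, 0),
    ];
    assert_eq!(assemble_rows(&triplets), vec![vec![1, 2], vec![], vec![3]]);
}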
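The "add spacing for triplets" comment in rust/parquet/src/record/triplet.rs is easier to see with a concrete buffer: when the column reader returns fewer values than levels, each dense value is moved to the position whose definition level equals the maximum, and every other slot becomes null. A hedged sketch of that back-to-front spacing, using a hypothetical Option<i32> buffer rather than the crate's internal layout:

// `values` holds `values_read` dense entries at the front; after the call,
// a value sits exactly at each position whose definition level is maximal.
fn space_values(
    values: &mut Vec<Option<i32>>,
    values_read: usize,
    def_levels: &[i16],
    max_def_level: i16,
) {
    values.resize(def_levels.len(), None);
    let mut src = values_read;
    // Walk backwards so no dense value is overwritten before it moves.
    for dst in (0..def_levels.len()).rev() {
        if def_levels[dst] == max_def_level {
            src -= 1;
            values.swap(src, dst);
        } else {
            values[dst] = None;
        }
    }
}

fn main() {
    // Two values read against three levels: the middle position is null.
    let mut values = vec![Some(7), Some(8)];
    space_values(&mut values, 2, &[1, 0, 1], 1);
    assert_eq!(values, vec![Some(7), None, Some(8)]);
}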
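The rust/parquet/src/schema/parser.rs tests above all start from parse_message_type; for orientation, a minimal usage sketch (the schema string is illustrative, mirroring the shapes used in those tests):

use parquet::schema::parser::parse_message_type;

fn main() {
    let message_type = "
    message schema {
      required int32 a;
      optional binary c (UTF8);
      required group e (LIST) {
        repeated group list {
          required int32 element;
        }
      }
    }
    ";
    let schema = parse_message_type(message_type).expect("schema should parse");
    assert_eq!(schema.name(), "schema");
    assert_eq!(schema.get_fields().len(), 3);
}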
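Finally, unpack32 in rust/parquet/src/util/bit_packing.rs dispatches to one hand-unrolled kernel per bit width. A safe, unoptimized reference for what those kernels compute, assuming Parquet's LSB-first packing within little-endian u32 words (illustrative only, not the crate's code):

// Unpack 32 values of `num_bits` each from packed u32 words.
fn unpack32_generic(input: &[u32], num_bits: usize) -> Vec<u32> {
    assert!(num_bits <= 32);
    let mut out = Vec::with_capacity(32);
    for i in 0..32 {
        let mut v = 0u32;
        for b in 0..num_bits {
            let pos = i * num_bits + b;
            let bit = (input[pos / 32] >> (pos % 32)) & 1;
            v |= bit << b;
        }
        out.push(v);
    }
    out
}

fn main() {
    // Pack 0..32 at 5 bits per value (160 bits = five u32 words), then unpack.
    let mut words = [0u32; 5];
    for (i, v) in (0u32..32).enumerate() {
        for b in 0..5 {
            let pos = i * 5 + b;
            words[pos / 32] |= ((v >> b) & 1) << (pos % 32);
        }
    }
    assert_eq!(unpack32_generic(&words, 5), (0u32..32).collect::<Vec<u32>>());
}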