fix(timestamp): add trim for the input date string (GreptimeTeam#2078)
* fix(timestamp): add trim for the input date string

* fix(timestamp): add analyzer rule to trim strings before conversion

* fix: adjust according to CR
etolbakov authored Aug 3, 2023
1 parent 6f1094d commit 1492700
Showing 11 changed files with 218 additions and 0 deletions.
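
As a quick illustration of the behavior described in the commit message, here are the two statements this commit adds to the SQL test suites further down, together with the results recorded in the matching .result files:

select TO_UNIXTIME(' 2023-03-01T06:35:02Z ');   -- expected result: 1677652502 (same as for the trimmed input)
SELECT timestamp ' 2017-07-23 13:10:11 ';       -- expected result: 2017-07-23T13:10:11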
8 changes: 8 additions & 0 deletions src/common/time/src/date.rs
@@ -41,6 +41,7 @@ impl FromStr for Date {
    type Err = Error;

    fn from_str(s: &str) -> Result<Self> {
        let s = s.trim();
        let date = NaiveDate::parse_from_str(s, "%F").context(ParseDateStrSnafu { raw: s })?;
        Ok(Self(date.num_days_from_ce() - UNIX_EPOCH_FROM_CE))
    }
@@ -108,6 +109,13 @@ mod tests {
Date::from_str("1969-01-01").unwrap().to_string()
);

assert_eq!(
"1969-01-01",
Date::from_str(" 1969-01-01 ")
.unwrap()
.to_string()
);

let now = Utc::now().date_naive().format("%F").to_string();
assert_eq!(now, Date::from_str(&now).unwrap().to_string());
}
3 changes: 3 additions & 0 deletions src/common/time/src/datetime.rs
@@ -55,6 +55,7 @@ impl FromStr for DateTime {
    type Err = Error;

    fn from_str(s: &str) -> Result<Self> {
        let s = s.trim();
        let local = Local {};
        let timestamp = if let Ok(d) = NaiveDateTime::parse_from_str(s, DATETIME_FORMAT) {
            match local.from_local_datetime(&d) {
@@ -111,6 +112,8 @@ mod tests {
        let time = "1970-01-01 00:00:00+0800";
        let dt = DateTime::from_str(time).unwrap();
        assert_eq!(time, &dt.to_string());
        let dt = DateTime::from_str(" 1970-01-01 00:00:00+0800 ").unwrap();
        assert_eq!(time, &dt.to_string());
    }

    #[test]
6 changes: 6 additions & 0 deletions src/common/time/src/timestamp.rs
@@ -227,6 +227,7 @@ impl FromStr for Timestamp {
    /// - `2022-09-20 14:16:43.012345` (local timezone, without T)
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // RFC3339 timestamp (with a T)
        let s = s.trim();
        if let Ok(ts) = DateTime::parse_from_rfc3339(s) {
            return Ok(Timestamp::new(ts.timestamp_nanos(), TimeUnit::Nanosecond));
        }
@@ -953,6 +954,11 @@ mod tests {
            Timestamp::new(0, TimeUnit::Second),
            Timestamp::from_str("1970-01-01 08:00:00").unwrap()
        );

        assert_eq!(
            Timestamp::new(0, TimeUnit::Second),
            Timestamp::from_str(" 1970-01-01 08:00:00 ").unwrap()
        );
    }

    #[test]
1 change: 1 addition & 0 deletions src/query/Cargo.toml
@@ -51,6 +51,7 @@ table = { path = "../table" }
tokio.workspace = true

[dev-dependencies]
arrow.workspace = true
approx_eq = "0.1"
catalog = { path = "../catalog", features = ["testing"] }
common-function-macro = { path = "../common/function-macro" }
1 change: 1 addition & 0 deletions src/query/src/optimizer.rs
@@ -13,4 +13,5 @@
// limitations under the License.

pub mod order_hint;
pub mod string_normalization;
pub mod type_conversion;
167 changes: 167 additions & 0 deletions src/query/src/optimizer/string_normalization.rs
@@ -0,0 +1,167 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use arrow_schema::DataType;
use datafusion::config::ConfigOptions;
use datafusion::logical_expr::expr::Cast;
use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRewriter};
use datafusion_common::{Result, ScalarValue};
use datafusion_expr::{Expr, LogicalPlan};
use datafusion_optimizer::analyzer::AnalyzerRule;

/// StringNormalizationRule normalizes(trims) string values in logical plan.
/// Mainly used for timestamp trimming
pub struct StringNormalizationRule;

impl AnalyzerRule for StringNormalizationRule {
    fn analyze(&self, plan: LogicalPlan, _config: &ConfigOptions) -> Result<LogicalPlan> {
        plan.transform(&|plan| {
            let mut converter = StringNormalizationConverter;
            let inputs = plan.inputs().into_iter().cloned().collect::<Vec<_>>();
            let expr = plan
                .expressions()
                .into_iter()
                .map(|e| e.rewrite(&mut converter))
                .collect::<Result<Vec<_>>>()?;
            datafusion_expr::utils::from_plan(&plan, &expr, &inputs).map(Transformed::Yes)
        })
    }

    fn name(&self) -> &str {
        "StringNormalizationRule"
    }
}

struct StringNormalizationConverter;

impl TreeNodeRewriter for StringNormalizationConverter {
    type N = Expr;

    /// remove extra whitespaces from the String value when
    /// there is a CAST from a String to Timestamp.
    /// Otherwise - no modifications applied
    fn mutate(&mut self, expr: Expr) -> Result<Expr> {
        let new_expr = match expr {
            Expr::Cast(Cast { expr, data_type }) => {
                let expr = match data_type {
                    DataType::Timestamp(_, _) => match *expr {
                        Expr::Literal(value) => match value {
                            ScalarValue::Utf8(Some(s)) => trim_utf_expr(s),
                            _ => Expr::Literal(value),
                        },
                        expr => expr,
                    },
                    _ => *expr,
                };
                Expr::Cast(Cast {
                    expr: Box::new(expr),
                    data_type,
                })
            }
            expr => expr,
        };
        Ok(new_expr)
    }
}

fn trim_utf_expr(s: String) -> Expr {
    let parts: Vec<_> = s.split_whitespace().collect();
    let trimmed = parts.join(" ");
    Expr::Literal(ScalarValue::Utf8(Some(trimmed)))
}

#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second};
    use arrow::datatypes::{DataType, SchemaRef};
    use arrow_schema::{Field, Schema, TimeUnit};
    use datafusion::datasource::{provider_as_source, MemTable};
    use datafusion_common::config::ConfigOptions;
    use datafusion_expr::{lit, Cast, Expr, LogicalPlan, LogicalPlanBuilder};
    use datafusion_optimizer::analyzer::AnalyzerRule;

    use crate::optimizer::string_normalization::StringNormalizationRule;

    #[test]
    fn test_normalization_for_string_with_extra_whitespaces_to_timestamp_cast() {
        let timestamp_str_with_whitespaces = " 2017-07-23 13:10:11 ";
        let config = &ConfigOptions::default();
        let projects = vec![
            create_timestamp_cast_project(Nanosecond, timestamp_str_with_whitespaces),
            create_timestamp_cast_project(Microsecond, timestamp_str_with_whitespaces),
            create_timestamp_cast_project(Millisecond, timestamp_str_with_whitespaces),
            create_timestamp_cast_project(Second, timestamp_str_with_whitespaces),
        ];
        for (time_unit, proj) in projects {
            let plan = create_test_plan_with_project(proj);
            let result = StringNormalizationRule.analyze(plan, config).unwrap();
            let expected = format!("Projection: CAST(Utf8(\"2017-07-23 13:10:11\") AS Timestamp({:#?}, None))\n TableScan: t",
                time_unit
            );
            assert_eq!(expected, format!("{:?}", result));
        }
    }

    #[test]
    fn test_normalization_for_non_timestamp_casts() {
        let config = &ConfigOptions::default();
        let proj_int_to_timestamp = vec![Expr::Cast(Cast::new(
            Box::new(lit(158412331400600000_i64)),
            DataType::Timestamp(Nanosecond, None),
        ))];
        let int_to_timestamp_plan = create_test_plan_with_project(proj_int_to_timestamp);
        let result = StringNormalizationRule
            .analyze(int_to_timestamp_plan, config)
            .unwrap();
        let expected = String::from(
            "Projection: CAST(Int64(158412331400600000) AS Timestamp(Nanosecond, None))\n TableScan: t"
        );
        assert_eq!(expected, format!("{:?}", result));

        let proj_string_to_int = vec![Expr::Cast(Cast::new(
            Box::new(lit(" 5 ")),
            DataType::Int32,
        ))];
        let string_to_int_plan = create_test_plan_with_project(proj_string_to_int);
        let result = StringNormalizationRule
            .analyze(string_to_int_plan, &ConfigOptions::default())
            .unwrap();
        let expected = String::from("Projection: CAST(Utf8(\" 5 \") AS Int32)\n TableScan: t");
        assert_eq!(expected, format!("{:?}", result));
    }

    fn create_test_plan_with_project(proj: Vec<Expr>) -> LogicalPlan {
        prepare_test_plan_builder()
            .project(proj)
            .unwrap()
            .build()
            .unwrap()
    }

    fn create_timestamp_cast_project(unit: TimeUnit, timestamp_str: &str) -> (TimeUnit, Vec<Expr>) {
        let proj = vec![Expr::Cast(Cast::new(
            Box::new(lit(timestamp_str)),
            DataType::Timestamp(unit.clone(), None),
        ))];
        (unit, proj)
    }

    fn prepare_test_plan_builder() -> LogicalPlanBuilder {
        let schema = Schema::new(vec![Field::new("f", DataType::Float64, false)]);
        let table = MemTable::try_new(SchemaRef::from(schema), vec![]).unwrap();
        LogicalPlanBuilder::scan("t", provider_as_source(Arc::new(table)), None).unwrap()
    }
}
2 changes: 2 additions & 0 deletions src/query/src/query_engine/state.rs
@@ -45,6 +45,7 @@ use table::TableRef;

use crate::dist_plan::{DistExtensionPlanner, DistPlannerAnalyzer};
use crate::optimizer::order_hint::OrderHintRule;
use crate::optimizer::string_normalization::StringNormalizationRule;
use crate::optimizer::type_conversion::TypeConversionRule;
use crate::query_engine::options::QueryOptions;

@@ -84,6 +85,7 @@ impl QueryEngineState {
            analyzer.rules.insert(0, Arc::new(DistPlannerAnalyzer));
        }
        analyzer.rules.insert(0, Arc::new(TypeConversionRule));
        analyzer.rules.insert(0, Arc::new(StringNormalizationRule));
        let mut optimizer = Optimizer::new();
        optimizer.rules.push(Arc::new(OrderHintRule));

8 changes: 8 additions & 0 deletions tests/cases/standalone/common/select/dummy.result
Expand Up @@ -42,6 +42,14 @@ select TO_UNIXTIME('2023-03-01T06:35:02Z');
| 1677652502 |
+-------------------------------------------+

select TO_UNIXTIME(' 2023-03-01T06:35:02Z ');

+---------------------------------------------------+
| to_unixtime(Utf8(" 2023-03-01T06:35:02Z ")) |
+---------------------------------------------------+
| 1677652502 |
+---------------------------------------------------+

select TO_UNIXTIME(2);

+-----------------------+
2 changes: 2 additions & 0 deletions tests/cases/standalone/common/select/dummy.sql
@@ -12,6 +12,8 @@ select * where "a" = "A";

select TO_UNIXTIME('2023-03-01T06:35:02Z');

select TO_UNIXTIME(' 2023-03-01T06:35:02Z ');

select TO_UNIXTIME(2);

create table test_unixtime(a int, b timestamp time index);
16 changes: 16 additions & 0 deletions tests/cases/standalone/common/timestamp/timestamp.result
@@ -1,3 +1,19 @@
SELECT timestamp ' 2017-07-23 13:10:11 ';

+-----------------------------+
| Utf8("2017-07-23 13:10:11") |
+-----------------------------+
| 2017-07-23T13:10:11 |
+-----------------------------+

SELECT timestamp ' 2017-07-23 13:10:11 ';

+-----------------------------+
| Utf8("2017-07-23 13:10:11") |
+-----------------------------+
| 2017-07-23T13:10:11 |
+-----------------------------+

CREATE TABLE timestamp_with_precision (ts TIMESTAMP(6) TIME INDEX, cnt INT);

Affected Rows: 0
4 changes: 4 additions & 0 deletions tests/cases/standalone/common/timestamp/timestamp.sql
@@ -1,3 +1,7 @@
SELECT timestamp ' 2017-07-23 13:10:11 ';

SELECT timestamp ' 2017-07-23 13:10:11 ';

CREATE TABLE timestamp_with_precision (ts TIMESTAMP(6) TIME INDEX, cnt INT);

INSERT INTO timestamp_with_precision(ts,cnt) VALUES ('2023-04-04 08:00:00.0052+0000', 1);
