fix(timestamp): add trim for the input date string (GreptimeTeam#2078)
* fix(timestamp): add trim for the input date string

* fix(timestamp): add analyzer rule to trim strings before conversion

* fix: adjust according to CR
etolbakov authored Aug 3, 2023
1 parent 6f1094d commit 1492700
Showing 11 changed files with 218 additions and 0 deletions.
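
As a quick illustration of the behavior described in the commit message, here are the two statements this commit adds to the SQL test suites further down, together with the results recorded in the matching .result files:

select TO_UNIXTIME(' 2023-03-01T06:35:02Z ');   -- expected result: 1677652502 (same as for the trimmed input)
SELECT timestamp ' 2017-07-23 13:10:11 ';       -- expected result: 2017-07-23T13:10:11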
8 changes: 8 additions & 0 deletions src/common/time/src/date.rs
@@ -41,6 +41,7 @@ impl FromStr for Date {
    type Err = Error;

    fn from_str(s: &str) -> Result<Self> {
        let s = s.trim();
        let date = NaiveDate::parse_from_str(s, "%F").context(ParseDateStrSnafu { raw: s })?;
        Ok(Self(date.num_days_from_ce() - UNIX_EPOCH_FROM_CE))
    }
@@ -108,6 +109,13 @@ mod tests {
Date::from_str("1969-01-01").unwrap().to_string()
);

assert_eq!(
"1969-01-01",
Date::from_str(" 1969-01-01 ")
.unwrap()
.to_string()
);

let now = Utc::now().date_naive().format("%F").to_string();
assert_eq!(now, Date::from_str(&now).unwrap().to_string());
}
3 changes: 3 additions & 0 deletions src/common/time/src/datetime.rs
@@ -55,6 +55,7 @@ impl FromStr for DateTime {
    type Err = Error;

    fn from_str(s: &str) -> Result<Self> {
        let s = s.trim();
        let local = Local {};
        let timestamp = if let Ok(d) = NaiveDateTime::parse_from_str(s, DATETIME_FORMAT) {
            match local.from_local_datetime(&d) {
@@ -111,6 +112,8 @@ mod tests {
        let time = "1970-01-01 00:00:00+0800";
        let dt = DateTime::from_str(time).unwrap();
        assert_eq!(time, &dt.to_string());
        let dt = DateTime::from_str(" 1970-01-01 00:00:00+0800 ").unwrap();
        assert_eq!(time, &dt.to_string());
    }

    #[test]
6 changes: 6 additions & 0 deletions src/common/time/src/timestamp.rs
@@ -227,6 +227,7 @@ impl FromStr for Timestamp {
    /// - `2022-09-20 14:16:43.012345` (local timezone, without T)
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // RFC3339 timestamp (with a T)
        let s = s.trim();
        if let Ok(ts) = DateTime::parse_from_rfc3339(s) {
            return Ok(Timestamp::new(ts.timestamp_nanos(), TimeUnit::Nanosecond));
        }
@@ -953,6 +954,11 @@ mod tests {
            Timestamp::new(0, TimeUnit::Second),
            Timestamp::from_str("1970-01-01 08:00:00").unwrap()
        );

        assert_eq!(
            Timestamp::new(0, TimeUnit::Second),
            Timestamp::from_str(" 1970-01-01 08:00:00 ").unwrap()
        );
    }

    #[test]
1 change: 1 addition & 0 deletions src/query/Cargo.toml
@@ -51,6 +51,7 @@ table = { path = "../table" }
tokio.workspace = true

[dev-dependencies]
arrow.workspace = true
approx_eq = "0.1"
catalog = { path = "../catalog", features = ["testing"] }
common-function-macro = { path = "../common/function-macro" }
1 change: 1 addition & 0 deletions src/query/src/optimizer.rs
@@ -13,4 +13,5 @@
// limitations under the License.

pub mod order_hint;
pub mod string_normalization;
pub mod type_conversion;
167 changes: 167 additions & 0 deletions src/query/src/optimizer/string_normalization.rs
@@ -0,0 +1,167 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use arrow_schema::DataType;
use datafusion::config::ConfigOptions;
use datafusion::logical_expr::expr::Cast;
use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRewriter};
use datafusion_common::{Result, ScalarValue};
use datafusion_expr::{Expr, LogicalPlan};
use datafusion_optimizer::analyzer::AnalyzerRule;

/// StringNormalizationRule normalizes(trims) string values in logical plan.
/// Mainly used for timestamp trimming
pub struct StringNormalizationRule;

impl AnalyzerRule for StringNormalizationRule {
    fn analyze(&self, plan: LogicalPlan, _config: &ConfigOptions) -> Result<LogicalPlan> {
        plan.transform(&|plan| {
            let mut converter = StringNormalizationConverter;
            let inputs = plan.inputs().into_iter().cloned().collect::<Vec<_>>();
            let expr = plan
                .expressions()
                .into_iter()
                .map(|e| e.rewrite(&mut converter))
                .collect::<Result<Vec<_>>>()?;
            datafusion_expr::utils::from_plan(&plan, &expr, &inputs).map(Transformed::Yes)
        })
    }

    fn name(&self) -> &str {
        "StringNormalizationRule"
    }
}

struct StringNormalizationConverter;

impl TreeNodeRewriter for StringNormalizationConverter {
    type N = Expr;

    /// remove extra whitespaces from the String value when
    /// there is a CAST from a String to Timestamp.
    /// Otherwise - no modifications applied
    fn mutate(&mut self, expr: Expr) -> Result<Expr> {
        let new_expr = match expr {
            Expr::Cast(Cast { expr, data_type }) => {
                let expr = match data_type {
                    DataType::Timestamp(_, _) => match *expr {
                        Expr::Literal(value) => match value {
                            ScalarValue::Utf8(Some(s)) => trim_utf_expr(s),
                            _ => Expr::Literal(value),
                        },
                        expr => expr,
                    },
                    _ => *expr,
                };
                Expr::Cast(Cast {
                    expr: Box::new(expr),
                    data_type,
                })
            }
            expr => expr,
        };
        Ok(new_expr)
    }
}

fn trim_utf_expr(s: String) -> Expr {
    let parts: Vec<_> = s.split_whitespace().collect();
    let trimmed = parts.join(" ");
    Expr::Literal(ScalarValue::Utf8(Some(trimmed)))
}

#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second};
    use arrow::datatypes::{DataType, SchemaRef};
    use arrow_schema::{Field, Schema, TimeUnit};
    use datafusion::datasource::{provider_as_source, MemTable};
    use datafusion_common::config::ConfigOptions;
    use datafusion_expr::{lit, Cast, Expr, LogicalPlan, LogicalPlanBuilder};
    use datafusion_optimizer::analyzer::AnalyzerRule;

    use crate::optimizer::string_normalization::StringNormalizationRule;

    #[test]
    fn test_normalization_for_string_with_extra_whitespaces_to_timestamp_cast() {
        let timestamp_str_with_whitespaces = " 2017-07-23 13:10:11 ";
        let config = &ConfigOptions::default();
        let projects = vec![
            create_timestamp_cast_project(Nanosecond, timestamp_str_with_whitespaces),
            create_timestamp_cast_project(Microsecond, timestamp_str_with_whitespaces),
            create_timestamp_cast_project(Millisecond, timestamp_str_with_whitespaces),
            create_timestamp_cast_project(Second, timestamp_str_with_whitespaces),
        ];
        for (time_unit, proj) in projects {
            let plan = create_test_plan_with_project(proj);
            let result = StringNormalizationRule.analyze(plan, config).unwrap();
            let expected = format!("Projection: CAST(Utf8(\"2017-07-23 13:10:11\") AS Timestamp({:#?}, None))\n TableScan: t",
                time_unit
            );
            assert_eq!(expected, format!("{:?}", result));
        }
    }

    #[test]
    fn test_normalization_for_non_timestamp_casts() {
        let config = &ConfigOptions::default();
        let proj_int_to_timestamp = vec![Expr::Cast(Cast::new(
            Box::new(lit(158412331400600000_i64)),
            DataType::Timestamp(Nanosecond, None),
        ))];
        let int_to_timestamp_plan = create_test_plan_with_project(proj_int_to_timestamp);
        let result = StringNormalizationRule
            .analyze(int_to_timestamp_plan, config)
            .unwrap();
        let expected = String::from(
            "Projection: CAST(Int64(158412331400600000) AS Timestamp(Nanosecond, None))\n TableScan: t"
        );
        assert_eq!(expected, format!("{:?}", result));

        let proj_string_to_int = vec![Expr::Cast(Cast::new(
            Box::new(lit(" 5 ")),
            DataType::Int32,
        ))];
        let string_to_int_plan = create_test_plan_with_project(proj_string_to_int);
        let result = StringNormalizationRule
            .analyze(string_to_int_plan, &ConfigOptions::default())
            .unwrap();
        let expected = String::from("Projection: CAST(Utf8(\" 5 \") AS Int32)\n TableScan: t");
        assert_eq!(expected, format!("{:?}", result));
    }

    fn create_test_plan_with_project(proj: Vec<Expr>) -> LogicalPlan {
        prepare_test_plan_builder()
            .project(proj)
            .unwrap()
            .build()
            .unwrap()
    }

    fn create_timestamp_cast_project(unit: TimeUnit, timestamp_str: &str) -> (TimeUnit, Vec<Expr>) {
        let proj = vec![Expr::Cast(Cast::new(
            Box::new(lit(timestamp_str)),
            DataType::Timestamp(unit.clone(), None),
        ))];
        (unit, proj)
    }

    fn prepare_test_plan_builder() -> LogicalPlanBuilder {
        let schema = Schema::new(vec![Field::new("f", DataType::Float64, false)]);
        let table = MemTable::try_new(SchemaRef::from(schema), vec![]).unwrap();
        LogicalPlanBuilder::scan("t", provider_as_source(Arc::new(table)), None).unwrap()
    }
}
2 changes: 2 additions & 0 deletions src/query/src/query_engine/state.rs
@@ -45,6 +45,7 @@ use table::TableRef;

use crate::dist_plan::{DistExtensionPlanner, DistPlannerAnalyzer};
use crate::optimizer::order_hint::OrderHintRule;
use crate::optimizer::string_normalization::StringNormalizationRule;
use crate::optimizer::type_conversion::TypeConversionRule;
use crate::query_engine::options::QueryOptions;

@@ -84,6 +85,7 @@ impl QueryEngineState {
            analyzer.rules.insert(0, Arc::new(DistPlannerAnalyzer));
        }
        analyzer.rules.insert(0, Arc::new(TypeConversionRule));
        analyzer.rules.insert(0, Arc::new(StringNormalizationRule));
        let mut optimizer = Optimizer::new();
        optimizer.rules.push(Arc::new(OrderHintRule));

8 changes: 8 additions & 0 deletions tests/cases/standalone/common/select/dummy.result
Expand Up @@ -42,6 +42,14 @@ select TO_UNIXTIME('2023-03-01T06:35:02Z');
| 1677652502 |
+-------------------------------------------+

select TO_UNIXTIME(' 2023-03-01T06:35:02Z ');

+---------------------------------------------------+
| to_unixtime(Utf8(" 2023-03-01T06:35:02Z ")) |
+---------------------------------------------------+
| 1677652502 |
+---------------------------------------------------+

select TO_UNIXTIME(2);

+-----------------------+
2 changes: 2 additions & 0 deletions tests/cases/standalone/common/select/dummy.sql
@@ -12,6 +12,8 @@ select * where "a" = "A";

select TO_UNIXTIME('2023-03-01T06:35:02Z');

select TO_UNIXTIME(' 2023-03-01T06:35:02Z ');

select TO_UNIXTIME(2);

create table test_unixtime(a int, b timestamp time index);
16 changes: 16 additions & 0 deletions tests/cases/standalone/common/timestamp/timestamp.result
@@ -1,3 +1,19 @@
SELECT timestamp ' 2017-07-23 13:10:11 ';

+-----------------------------+
| Utf8("2017-07-23 13:10:11") |
+-----------------------------+
| 2017-07-23T13:10:11 |
+-----------------------------+

SELECT timestamp ' 2017-07-23 13:10:11 ';

+-----------------------------+
| Utf8("2017-07-23 13:10:11") |
+-----------------------------+
| 2017-07-23T13:10:11 |
+-----------------------------+

CREATE TABLE timestamp_with_precision (ts TIMESTAMP(6) TIME INDEX, cnt INT);

Affected Rows: 0
4 changes: 4 additions & 0 deletions tests/cases/standalone/common/timestamp/timestamp.sql
@@ -1,3 +1,7 @@
SELECT timestamp ' 2017-07-23 13:10:11 ';

SELECT timestamp ' 2017-07-23 13:10:11 ';

CREATE TABLE timestamp_with_precision (ts TIMESTAMP(6) TIME INDEX, cnt INT);

INSERT INTO timestamp_with_precision(ts,cnt) VALUES ('2023-04-04 08:00:00.0052+0000', 1);
