From 6148257b3eebbad3272005f2e49e0e9615fd8f02 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Fri, 23 Aug 2024 14:40:01 -0500 Subject: [PATCH] JSON tests for corrected date, timestamp, and mixed types (#11388) Signed-off-by: Robert (Bobby) Evans --- .../src/main/python/json_matrix_test.py | 318 ++++++++++++++++-- .../src/test/resources/escaped_strings.json | 16 + .../src/test/resources/mixed_objects.json | 35 ++ .../resources/nested_escaped_strings.json | 55 +++ .../timestamp_formatted_strings.json | 38 +++ .../timestamp_tz_formatted_strings.json | 12 + .../nvidia/spark/rapids/GpuJsonTuple.scala | 7 +- 7 files changed, 449 insertions(+), 32 deletions(-) create mode 100644 integration_tests/src/test/resources/mixed_objects.json create mode 100644 integration_tests/src/test/resources/nested_escaped_strings.json create mode 100644 integration_tests/src/test/resources/timestamp_formatted_strings.json create mode 100644 integration_tests/src/test/resources/timestamp_tz_formatted_strings.json diff --git a/integration_tests/src/main/python/json_matrix_test.py b/integration_tests/src/main/python/json_matrix_test.py index a47771f47f8..c9dec8afac9 100644 --- a/integration_tests/src/main/python/json_matrix_test.py +++ b/integration_tests/src/main/python/json_matrix_test.py @@ -584,7 +584,11 @@ def test_json_tuple_dec_locale_non_aribic(std_input_path): "int_struct_formatted.json", "int_mixed_array_struct_formatted.json", "escaped_strings.json", - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361'))] + "nested_escaped_strings.json", + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), + "mixed_objects.json", + "timestamp_formatted_strings.json", + "timestamp_tz_formatted_strings.json"] @pytest.mark.parametrize('input_file', COMMON_TEST_FILES) @pytest.mark.parametrize('read_func', [read_json_df]) # we have done so many tests already that we don't need both read func. They are the same @@ -670,7 +674,11 @@ def test_from_json_longs(std_input_path, input_file): "int_struct_formatted.json", "int_mixed_array_struct_formatted.json", "escaped_strings.json", - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361'))]) + "nested_escaped_strings.json", + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), + "mixed_objects.json", + "timestamp_formatted_strings.json", + "timestamp_tz_formatted_strings.json"]) @pytest.mark.parametrize('read_func', [read_json_df]) # we have done so many tests already that we don't need both read func. 
They are the same def test_scan_json_decs(std_input_path, read_func, spark_tmp_table_factory, input_file, dt): assert_gpu_and_cpu_are_equal_collect( @@ -695,7 +703,11 @@ def test_scan_json_decs(std_input_path, read_func, spark_tmp_table_factory, inpu "int_struct_formatted.json", "int_mixed_array_struct_formatted.json", "escaped_strings.json", - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361'))]) + "nested_escaped_strings.json", + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), + "mixed_objects.json", + "timestamp_formatted_strings.json", + "timestamp_tz_formatted_strings.json"]) @allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 def test_from_json_decs(std_input_path, input_file, dt): schema = StructType([StructField("data", dt)]) @@ -719,7 +731,11 @@ def test_from_json_decs(std_input_path, input_file, dt): "int_struct_formatted.json", pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(condition=is_spark_400_or_later(), reason='https://github.com/NVIDIA/spark-rapids/issues/11154')), "escaped_strings.json", - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361'))]) + pytest.param("nested_escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10534')), + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), + "mixed_objects.json", + "timestamp_formatted_strings.json", + "timestamp_tz_formatted_strings.json"]) @pytest.mark.parametrize('read_func', [read_json_df]) def test_scan_json_strings(std_input_path, read_func, spark_tmp_table_factory, input_file): assert_gpu_and_cpu_are_equal_collect( @@ -743,7 +759,11 @@ def test_scan_json_strings(std_input_path, read_func, spark_tmp_table_factory, i "int_struct_formatted.json", "int_mixed_array_struct_formatted.json", "escaped_strings.json", - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361'))]) + pytest.param("nested_escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10534')), + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), + "mixed_objects.json", + "timestamp_formatted_strings.json", + "timestamp_tz_formatted_strings.json"]) @allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 def test_from_json_strings(std_input_path, input_file): schema = StructType([StructField("data", StringType())]) @@ -765,12 +785,19 @@ def test_from_json_strings(std_input_path, input_file): "int_array_formatted.json", "int_struct_formatted.json", "int_mixed_array_struct_formatted.json", - pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10196')), - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361'))]) + pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11386')), + pytest.param("nested_escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11387')), + 
pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), + "mixed_objects.json", + "timestamp_formatted_strings.json", + "timestamp_tz_formatted_strings.json"]) @allow_non_gpu(TEXT_INPUT_EXEC) def test_get_json_object_formats(std_input_path, input_file): assert_gpu_and_cpu_are_equal_collect( - lambda spark : read_json_as_text(spark, std_input_path + '/' + input_file, "json").selectExpr("*", '''get_json_object(json, "$.data")''')) + lambda spark : read_json_as_text(spark, std_input_path + '/' + input_file, "json").selectExpr("*", + '''get_json_object(json, "$.data")''', + '''get_json_object(json, '$.id')''', + '''get_json_object(json, '$.name')''')) @pytest.mark.parametrize('input_file', [ "int_formatted.json", @@ -787,11 +814,28 @@ def test_get_json_object_formats(std_input_path, input_file): "int_struct_formatted.json", "int_mixed_array_struct_formatted.json", "escaped_strings.json", - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361'))]) + "nested_escaped_strings.json", + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), + "mixed_objects.json", + "timestamp_formatted_strings.json", + "timestamp_tz_formatted_strings.json"]) @allow_non_gpu(TEXT_INPUT_EXEC) def test_get_json_object_child_formats(std_input_path, input_file): assert_gpu_and_cpu_are_equal_collect( - lambda spark : read_json_as_text(spark, std_input_path + '/' + input_file, "json").selectExpr("*", '''get_json_object(json, "$.data.a")''')) + lambda spark : read_json_as_text(spark, std_input_path + '/' + input_file, "json").selectExpr("*", + '''get_json_object(json, "$.data.a")''', + '''get_json_object(json, '$.tags[0]')''', + '''get_json_object(json, '$.details.address.city')''', + '''get_json_object(json, '$.user.profile.username')''', + '''get_json_object(json, '$.user.skills[0]')''', + '''get_json_object(json, '$.user.projects[1].name')''', + '''get_json_object(json, '$.departments[0].employees[1].name')''', + '''get_json_object(json, '$.departments[1].employees[0].id')''', + '''get_json_object(json, '$.data.numeric')''', + '''get_json_object(json, '$.data.details.timestamp')''', + '''get_json_object(json, '$.data.details.list[1]')''', + '''get_json_object(json, '$.company.departments[1].employees[0].name')''', + '''get_json_object(json, '$.company.departments[0].employees[1].role')''')) @pytest.mark.parametrize('input_file', [ "int_formatted.json", @@ -807,12 +851,20 @@ def test_get_json_object_child_formats(std_input_path, input_file): "int_array_formatted.json", "int_struct_formatted.json", "int_mixed_array_struct_formatted.json", - pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10196')), - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361'))]) + pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11386')), + pytest.param("nested_escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11387')), + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), + "mixed_objects.json", + "timestamp_formatted_strings.json", + "timestamp_tz_formatted_strings.json"]) @allow_non_gpu(TEXT_INPUT_EXEC) def 
test_json_tuple_formats(std_input_path, input_file): assert_gpu_and_cpu_are_equal_collect( - lambda spark : read_json_as_text(spark, std_input_path + '/' + input_file, "json").selectExpr("*", '''json_tuple(json, "data")'''), + lambda spark : read_json_as_text(spark, std_input_path + '/' + input_file, "json").selectExpr("*", + '''json_tuple(json, "data")''').selectExpr("*", + # json_tuple is not the same as get_json_object + '''json_tuple(json, 'id', 'name', 'details.address.city') AS (id, name, city)''').selectExpr("*", + '''json_tuple(json, 'user.profile.username', 'user.skills[0]', 'user.projects[1].name') AS (username, first_skill, second_project_name)'''), conf =_enable_json_tuple_conf) @pytest.mark.parametrize('input_file', COMMON_TEST_FILES) @@ -847,7 +899,11 @@ def test_from_json_bools(std_input_path, input_file): "int_struct_formatted.json", "int_mixed_array_struct_formatted.json", "escaped_strings.json", - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361'))]) + "nested_escaped_strings.json", + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), + "mixed_objects.json", + "timestamp_formatted_strings.json", + "timestamp_tz_formatted_strings.json"]) @pytest.mark.parametrize('read_func', [read_json_df]) def test_scan_json_floats(std_input_path, read_func, spark_tmp_table_factory, input_file): assert_gpu_and_cpu_are_equal_collect( @@ -871,7 +927,11 @@ def test_scan_json_floats(std_input_path, read_func, spark_tmp_table_factory, in "int_struct_formatted.json", "int_mixed_array_struct_formatted.json", "escaped_strings.json", - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361'))]) + "nested_escaped_strings.json", + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), + "mixed_objects.json", + "timestamp_formatted_strings.json", + "timestamp_tz_formatted_strings.json"]) @allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 def test_from_json_floats(std_input_path, input_file): schema = StructType([StructField("data", FloatType())]) @@ -894,7 +954,11 @@ def test_from_json_floats(std_input_path, input_file): "int_struct_formatted.json", "int_mixed_array_struct_formatted.json", "escaped_strings.json", - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361'))]) + "nested_escaped_strings.json", + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), + "mixed_objects.json", + "timestamp_formatted_strings.json", + "timestamp_tz_formatted_strings.json"]) @pytest.mark.parametrize('read_func', [read_json_df]) def test_scan_json_doubles(std_input_path, read_func, spark_tmp_table_factory, input_file): assert_gpu_and_cpu_are_equal_collect( @@ -918,7 +982,11 @@ def test_scan_json_doubles(std_input_path, read_func, spark_tmp_table_factory, i "int_struct_formatted.json", "int_mixed_array_struct_formatted.json", "escaped_strings.json", - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361'))]) + "nested_escaped_strings.json", + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), + 
"mixed_objects.json", + "timestamp_formatted_strings.json", + "timestamp_tz_formatted_strings.json"]) @allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 def test_from_json_doubles(std_input_path, input_file): schema = StructType([StructField("data", DoubleType())]) @@ -926,6 +994,118 @@ def test_from_json_doubles(std_input_path, input_file): lambda spark : read_json_as_text(spark, std_input_path + '/' + input_file, "json").select(f.col('json'), f.from_json(f.col('json'), schema)), conf =_enable_json_to_structs_conf) +@pytest.mark.parametrize('input_file', [ + "int_formatted.json", + "float_formatted.json", + "sci_formatted.json", + pytest.param("int_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9664')), + "float_formatted_strings.json", + "sci_formatted_strings.json", + pytest.param("decimal_locale_formatted_strings.json", marks=pytest.mark.xfail(condition=is_before_spark_330(), reason='https://github.com/NVIDIA/spark-rapids/issues/11390')), + "single_quoted_strings.json", + "boolean_formatted.json", + "int_array_formatted.json", + "int_struct_formatted.json", + "int_mixed_array_struct_formatted.json", + pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9664')), + "nested_escaped_strings.json", + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), + "mixed_objects.json", + pytest.param("timestamp_formatted_strings.json", marks=pytest.mark.xfail(condition=is_before_spark_330(), reason='https://github.com/NVIDIA/spark-rapids/issues/11391')), + pytest.param("timestamp_tz_formatted_strings.json", marks=pytest.mark.xfail(condition=is_before_spark_330(), reason='https://github.com/NVIDIA/spark-rapids/issues/11391'))]) +@pytest.mark.parametrize('read_func', [read_json_df]) +@allow_non_gpu(*non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 +def test_scan_json_corrected_dates(std_input_path, read_func, spark_tmp_table_factory, input_file): + conf = copy_and_update(_enable_all_types_json_scan_conf, {"spark.sql.legacy.timeParserPolicy": "CORRECTED"}) + assert_gpu_and_cpu_are_equal_collect( + read_func(std_input_path + '/' + input_file, + StructType([StructField("data", DateType())]), + spark_tmp_table_factory), + conf=conf) + +@pytest.mark.parametrize('input_file', [ + "int_formatted.json", + "float_formatted.json", + "sci_formatted.json", + pytest.param("int_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9664')), + "float_formatted_strings.json", + "sci_formatted_strings.json", + pytest.param("decimal_locale_formatted_strings.json", marks=pytest.mark.xfail(condition=is_before_spark_330(), reason='https://github.com/NVIDIA/spark-rapids/issues/11390')), + "single_quoted_strings.json", + "boolean_formatted.json", + "int_array_formatted.json", + "int_struct_formatted.json", + "int_mixed_array_struct_formatted.json", + pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9664')), + "nested_escaped_strings.json", + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), + "mixed_objects.json", + pytest.param("timestamp_formatted_strings.json", marks=pytest.mark.xfail(condition=is_before_spark_330(), 
reason='https://github.com/NVIDIA/spark-rapids/issues/11391')), + pytest.param("timestamp_tz_formatted_strings.json", marks=pytest.mark.xfail(condition=is_before_spark_330(), reason='https://github.com/NVIDIA/spark-rapids/issues/11391'))]) +@allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 +def test_from_json_corrected_dates(std_input_path, input_file): + schema = StructType([StructField("data", DateType())]) + conf = copy_and_update(_enable_json_to_structs_conf, {"spark.sql.legacy.timeParserPolicy": "CORRECTED"}) + assert_gpu_and_cpu_are_equal_collect( + lambda spark : read_json_as_text(spark, std_input_path + '/' + input_file, "json").select(f.col('json'), f.from_json(f.col('json'), schema)), + conf = conf) + +@pytest.mark.parametrize('input_file', [ + pytest.param("int_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10535')), + "float_formatted.json", + "sci_formatted.json", + pytest.param("int_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10535')), + "float_formatted_strings.json", + "sci_formatted_strings.json", + pytest.param("decimal_locale_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10535')), + "single_quoted_strings.json", + pytest.param("boolean_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10535')), + "int_array_formatted.json", + "int_struct_formatted.json", + "int_mixed_array_struct_formatted.json", + "escaped_strings.json", + "nested_escaped_strings.json", + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), + "mixed_objects.json", + "timestamp_formatted_strings.json", + pytest.param("timestamp_tz_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/6846'))]) +@pytest.mark.parametrize('read_func', [read_json_df]) +@allow_non_gpu(*non_utc_allow) +def test_scan_json_corrected_timestamps(std_input_path, read_func, spark_tmp_table_factory, input_file): + conf = copy_and_update(_enable_all_types_json_scan_conf, {"spark.sql.legacy.timeParserPolicy": "CORRECTED"}) + assert_gpu_and_cpu_are_equal_collect( + read_func(std_input_path + '/' + input_file, + StructType([StructField("data", TimestampType())]), + spark_tmp_table_factory), + conf=conf) + +@pytest.mark.parametrize('input_file', [ + pytest.param("int_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10535')), + "float_formatted.json", + "sci_formatted.json", + pytest.param("int_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10535')), + "float_formatted_strings.json", + "sci_formatted_strings.json", + pytest.param("decimal_locale_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10535')), + "single_quoted_strings.json", + pytest.param("boolean_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10535')), + "int_array_formatted.json", + "int_struct_formatted.json", + "int_mixed_array_struct_formatted.json", + "escaped_strings.json", + "nested_escaped_strings.json", + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), + "mixed_objects.json", + "timestamp_formatted_strings.json", + 
pytest.param("timestamp_tz_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/6846'))]) +@allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) +def test_from_json_corrected_timestamps(std_input_path, input_file): + schema = StructType([StructField("data", TimestampType())]) + conf = copy_and_update(_enable_json_to_structs_conf, {"spark.sql.legacy.timeParserPolicy": "CORRECTED"}) + assert_gpu_and_cpu_are_equal_collect( + lambda spark : read_json_as_text(spark, std_input_path + '/' + input_file, "json").select(f.col('json'), f.from_json(f.col('json'), schema)), + conf = conf) + @pytest.mark.parametrize('input_file', [ pytest.param("int_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), pytest.param("float_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), @@ -940,7 +1120,11 @@ def test_from_json_doubles(std_input_path, input_file): "int_struct_formatted.json", pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260'))]) + "nested_escaped_strings.json", + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + "mixed_objects.json", + pytest.param("timestamp_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("timestamp_tz_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260'))]) @pytest.mark.parametrize('read_func', [read_json_df]) # we have done so many tests already that we don't need both read func. 
They are the same def test_scan_json_long_arrays(std_input_path, read_func, spark_tmp_table_factory, input_file): assert_gpu_and_cpu_are_equal_collect( @@ -963,7 +1147,11 @@ def test_scan_json_long_arrays(std_input_path, read_func, spark_tmp_table_factor "int_struct_formatted.json", pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260'))]) + "nested_escaped_strings.json", + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + "mixed_objects.json", + pytest.param("timestamp_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("timestamp_tz_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260'))]) @allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 def test_from_json_long_arrays(std_input_path, input_file): schema = StructType([StructField("data", ArrayType(LongType()))]) @@ -985,7 +1173,11 @@ def test_from_json_long_arrays(std_input_path, input_file): "int_struct_formatted.json", pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260'))]) + "nested_escaped_strings.json", + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + "mixed_objects.json", + pytest.param("timestamp_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("timestamp_tz_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260'))]) @pytest.mark.parametrize('read_func', [read_json_df]) # we have done so many tests already that we don't need both read func. 
They are the same def test_scan_json_string_arrays(std_input_path, read_func, spark_tmp_table_factory, input_file): assert_gpu_and_cpu_are_equal_collect( @@ -1008,7 +1200,11 @@ def test_scan_json_string_arrays(std_input_path, read_func, spark_tmp_table_fact "int_struct_formatted.json", pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260'))]) + "nested_escaped_strings.json", + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + "mixed_objects.json", + pytest.param("timestamp_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("timestamp_tz_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260'))]) @allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 def test_from_json_string_arrays(std_input_path, input_file): schema = StructType([StructField("data", ArrayType(StringType()))]) @@ -1030,7 +1226,11 @@ def test_from_json_string_arrays(std_input_path, input_file): pytest.param("int_struct_formatted.json", marks=pytest.mark.xfail(condition=is_before_spark_342(),reason='https://github.com/NVIDIA/spark-rapids/issues/10588')), pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260'))]) + pytest.param("nested_escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + "mixed_objects.json", + pytest.param("timestamp_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("timestamp_tz_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260'))]) @pytest.mark.parametrize('read_func', [read_json_df]) # we have done so many tests already that we don't need both read func. 
They are the same def test_scan_json_long_structs(std_input_path, read_func, spark_tmp_table_factory, input_file): assert_gpu_and_cpu_are_equal_collect( @@ -1053,7 +1253,11 @@ def test_scan_json_long_structs(std_input_path, read_func, spark_tmp_table_facto pytest.param("int_struct_formatted.json", marks=pytest.mark.xfail(condition=is_before_spark_342(),reason='https://github.com/NVIDIA/spark-rapids/issues/10588')), pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260'))]) + pytest.param("nested_escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + "mixed_objects.json", + pytest.param("timestamp_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("timestamp_tz_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260'))]) @allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 def test_from_json_long_structs(std_input_path, input_file): schema = StructType([StructField("data", StructType([StructField("A", LongType()),StructField("B", LongType())]))]) @@ -1075,7 +1279,11 @@ def test_from_json_long_structs(std_input_path, input_file): "int_struct_formatted.json", pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260'))]) + pytest.param("nested_escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + "mixed_objects.json", + pytest.param("timestamp_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("timestamp_tz_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260'))]) @pytest.mark.parametrize('read_func', [read_json_df]) # we have done so many tests already that we don't need both read func. 
They are the same def test_scan_json_string_structs(std_input_path, read_func, spark_tmp_table_factory, input_file): assert_gpu_and_cpu_are_equal_collect( @@ -1098,7 +1306,11 @@ def test_scan_json_string_structs(std_input_path, read_func, spark_tmp_table_fac "int_struct_formatted.json", pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260'))]) + pytest.param("nested_escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + "mixed_objects.json", + pytest.param("timestamp_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("timestamp_tz_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260'))]) @allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 def test_from_json_string_structs(std_input_path, input_file): schema = StructType([StructField("data", StructType([StructField("A", StringType()),StructField("B", StringType())]))]) @@ -1120,8 +1332,11 @@ def test_from_json_string_structs(std_input_path, input_file): pytest.param("int_array_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10573')), # This does not fail on 38,0 "int_struct_formatted.json", pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260'))]) + pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + "mixed_objects.json", + pytest.param("timestamp_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("timestamp_tz_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260'))]) @pytest.mark.parametrize('read_func', [read_json_df]) # we have done so many tests already that we don't need both read func.
They are the same def test_scan_json_dec_arrays(std_input_path, read_func, spark_tmp_table_factory, input_file, dt): assert_gpu_and_cpu_are_equal_collect( @@ -1145,7 +1360,11 @@ def test_scan_json_dec_arrays(std_input_path, read_func, spark_tmp_table_factory "int_struct_formatted.json", pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260'))]) + "nested_escaped_strings.json", + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + "mixed_objects.json", + pytest.param("timestamp_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("timestamp_tz_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260'))]) @allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 def test_from_json_dec_arrays(std_input_path, input_file, dt): schema = StructType([StructField("data", ArrayType(dt))]) @@ -1153,4 +1372,47 @@ def test_from_json_dec_arrays(std_input_path, input_file, dt): lambda spark : read_json_as_text(spark, std_input_path + '/' + input_file, "json").select(f.col('json'), f.from_json(f.col('json'), schema)), conf =_enable_json_to_structs_conf) +@pytest.mark.parametrize('input_file', [ + pytest.param("int_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("float_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("sci_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("int_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("float_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("sci_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("decimal_locale_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("single_quoted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("boolean_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + "int_array_formatted.json", + "int_struct_formatted.json", + pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("nested_escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + "mixed_objects.json", + pytest.param("timestamp_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("timestamp_tz_formatted_strings.json", 
marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260'))]) +@pytest.mark.parametrize('read_func', [read_json_df]) # we have done so many tests already that we don't need both read func. They are the same +def test_scan_json_mixed_struct(std_input_path, read_func, spark_tmp_table_factory, input_file): + assert_gpu_and_cpu_are_equal_collect( + read_func(std_input_path + '/' + input_file, + StructType([StructField("data", StructType([StructField("A", StringType()),StructField("B", StringType())]))]), + spark_tmp_table_factory), + conf=_enable_all_types_json_scan_conf) +@pytest.mark.parametrize('input_file, schema', [ + ("mixed_objects.json", "id INT, name STRING, tags ARRAY<STRING>, details STRUCT<age: INT, address: STRUCT<city: STRING, zip: STRING>>"), + ("mixed_objects.json", "user STRUCT<profile: STRUCT<username: STRING, email: STRING>, skills: ARRAY<STRING>, projects: ARRAY<STRUCT<name: STRING, status: STRING>>>"), + ("mixed_objects.json", "departments ARRAY<STRUCT<name: STRING, employees: ARRAY<STRUCT<id: INT, name: STRING>>>>"), + ("mixed_objects.json", "data STRUCT<numeric: INT, text: STRING, flag: BOOLEAN, details: STRUCT<timestamp: STRING, list: ARRAY<INT>>>"), + ("mixed_objects.json", "data STRUCT<numeric: INT, text: STRING, flag: BOOLEAN, details: STRUCT<timestamp: TIMESTAMP, list: ARRAY<INT>>>"), + pytest.param("mixed_objects.json", "data STRUCT<numeric: DECIMAL(10, 2), text: STRING, flag: BOOLEAN, details: STRUCT<timestamp: TIMESTAMP, list: ARRAY<INT>>>", + marks=pytest.mark.xfail(condition=is_before_spark_330(), reason='https://github.com/NVIDIA/spark-rapids/issues/11390')), + ("mixed_objects.json", "company STRUCT<departments: ARRAY<STRUCT<department_name: STRING, employees: ARRAY<STRUCT<name: STRING, role: STRING>>>>>"), + ]) +@allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 +def test_from_json_mixed_corrected(std_input_path, input_file, schema): + conf = copy_and_update(_enable_json_to_structs_conf, {"spark.sql.legacy.timeParserPolicy": "CORRECTED"}) + assert_gpu_and_cpu_are_equal_collect( + lambda spark : read_json_as_text(spark, std_input_path + '/' + input_file, "json").selectExpr('json', + "from_json(json, '" + schema + "') as parsed"), + conf = conf) diff --git a/integration_tests/src/test/resources/escaped_strings.json b/integration_tests/src/test/resources/escaped_strings.json index 43637e14eae..ea7b0bbe8d2 100644 --- a/integration_tests/src/test/resources/escaped_strings.json +++ b/integration_tests/src/test/resources/escaped_strings.json @@ -38,3 +38,19 @@ {"data": "\u0031\u0034\u002E\u0030\u0031"} {"data": "\'TESTING\'"} {"\u0064\u0061t\u0061": "TEST"} +{"data": "This is a simple string"} +{"data": "This string contains a newline character\n"} +{"data": "This string contains a tab character\t"} +{"data": "This string contains a backslash \\"} +{"data": "This string contains a double quote \""} +{"data": "This string contains a unicode character \u00A9"} +{"data": "This string contains a smiley face \u263A"} +{"data": "This string contains a backspace character\b"} +{"data": "This string contains a form feed character\f"} +{"data": "This string contains a form feed character\u000C"} +{"data": "This string contains a carriage return\r"} +{"data": "This string contains a carriage return\u000D"} +{"data": "This string contains an illegal control character \u0007 (bell)"} +{"data": "This string contains an illegal character \u001F (unit separator)"} +{"data": "This string contains a mix of normal and escaped characters: \n \" \t \u0041"} +{"data": "This string contains an illegal control character directly: \u0001"} diff --git a/integration_tests/src/test/resources/mixed_objects.json b/integration_tests/src/test/resources/mixed_objects.json new file mode 100644 index 00000000000..91225124c6a --- /dev/null +++ b/integration_tests/src/test/resources/mixed_objects.json @@ -0,0 +1,35 @@ +{"id": 1,"name": "John","tags": ["developer", "python"],"details": {"age": 30,"address": {"city": "San Francisco","zip": "94105"}}} +{"user": { "profile": { "username": "jane_doe","email": "jane.doe@example.com"},"skills": ["java",
"spark", "sql"],"projects": [{"name": "Project X", "status": "completed"},{"name": "Project Y", "status": "ongoing"}]}} +{"departments": [{"name": "Engineering","employees": [{"id": 101, "name": "Alice"},{"id": 102, "name": "Bob"}]},{"name": "Sales","employees": [{"id": 201, "name": "Charlie"},{"id": 202, "name": "David"}]}]} +{"data": {"numeric": 123, "text": "example", "flag": true, "details": { "timestamp": "2024-08-22T10:00:00Z","list": [1, 2, 3]}}} +{"company": {"departments": [{"department_name": "HR","employees": [{"name": "Emily", "role": "Recruiter"},{"name": "Frank", "role": "HR Manager"}]},{"department_name": "IT","employees": [{"name": "Grace", "role": "Software Engineer"},{"name": "Hank", "role": "System Admin"}]}]}} +{"id": 2, "name": "Alice", "tags": ["data scientist", "R"], "details": {"age": 28, "address": {"city": "New York", "zip": "10001"}}} +{"user": {"profile": {"username": "bob_smith", "email": "bob.smith@example.com"}, "skills": ["javascript", "node.js"], "projects": [{"name": "Project A", "status": "completed"}, {"name": "Project B", "status": "pending"}]}} +{"departments": [{"name": "Marketing", "employees": [{"id": 301, "name": "Tom"}, {"id": 302, "name": "Jerry"}]}, {"name": "Finance", "employees": [{"id": 401, "name": "Mickey"}, {"id": 402, "name": "Donald"}]}]} +{"data": {"numeric": 456, "text": "sample", "flag": false, "details": {"timestamp": "2024-08-23T11:00:00Z", "list": [4, 5, 6]}}} +{"company": {"departments": [{"department_name": "Sales", "employees": [{"name": "Olivia", "role": "Sales Manager"}, {"name": "Liam", "role": "Sales Associate"}]}, {"department_name": "Engineering", "employees": [{"name": "Noah", "role": "DevOps Engineer"}, {"name": "Emma", "role": "Frontend Developer"}]}]}} +{"id": 3, "name": "Robert", "tags": ["backend", "java"], "details": {"age": 35, "address": {"city": "Seattle", "zip": "98101"}}} +{"user": {"profile": {"username": "carol_jones", "email": "carol.jones@example.com"}, "skills": ["python", "machine learning"], "projects": [{"name": "Project Z", "status": "ongoing"}, {"name": "Project W", "status": "completed"}]}} +{"departments": [{"name": "HR", "employees": [{"id": 501, "name": "Sophia"}, {"id": 502, "name": "Jackson"}]}, {"name": "IT", "employees": [{"id": 601, "name": "Aiden"}, {"id": 602, "name": "Lucas"}]}]} +{"data": {"numeric": 789, "text": "test", "flag": true, "details": {"timestamp": "2024-08-24T12:00:00Z", "list": [7, 8, 9]}}} +{"company": {"departments": [{"department_name": "Customer Support", "employees": [{"name": "Mia", "role": "Support Specialist"}, {"name": "Ethan", "role": "Support Manager"}]}, {"department_name": "Development", "employees": [{"name": "Isabella", "role": "Backend Developer"}, {"name": "James", "role": "Frontend Developer"}]}]}} +{"id": 4, "name": "Emily", "tags": ["UI/UX", "design"], "details": {"age": 29, "address": {"city": "Los Angeles", "zip": "90001"}}} +{"user": {"profile": {"username": "david_clark", "email": "david.clark@example.com"}, "skills": ["sql", "data analysis"], "projects": [{"name": "Project M", "status": "completed"}, {"name": "Project N", "status": "pending"}]}} +{"departments": [{"name": "Operations", "employees": [{"id": 701, "name": "Ella"}, {"id": 702, "name": "Liam"}]}, {"name": "Legal", "employees": [{"id": 801, "name": "Ava"}, {"id": 802, "name": "William"}]}]} +{"data": {"numeric": 321, "text": "data", "flag": false, "details": {"timestamp": "2024-08-25T13:00:00Z", "list": [3, 2, 1]}}} +{"company": {"departments": [{"department_name": "Product", "employees": 
[{"name": "Jack", "role": "Product Manager"}, {"name": "Sophia", "role": "Product Designer"}]}, {"department_name": "Marketing", "employees": [{"name": "Oliver", "role": "Content Writer"}, {"name": "Charlotte", "role": "SEO Specialist"}]}]}} +{"id": 5, "name": "Michael", "tags": ["full-stack", "ruby"], "details": {"age": 32, "address": {"city": "Austin", "zip": "73301"}}} +{"user": {"profile": {"username": "lisa_white", "email": "lisa.white@example.com"}, "skills": ["php", "web development"], "projects": [{"name": "Project O", "status": "completed"}, {"name": "Project P", "status": "ongoing"}]}} +{"departments": [{"name": "Research", "employees": [{"id": 901, "name": "Benjamin"}, {"id": 902, "name": "Mia"}]}, {"name": "Training", "employees": [{"id": 1001, "name": "Zoe"}, {"id": 1002, "name": "Ryan"}]}]} +{"data": {"numeric": 654, "text": "example", "flag": true, "details": {"timestamp": "2024-08-26T14:00:00Z", "list": [6, 5, 4]}}} +{"company": {"departments": [{"department_name": "Finance", "employees": [{"name": "Lucas", "role": "Financial Analyst"}, {"name": "Emma", "role": "Finance Director"}]}, {"department_name": "Legal", "employees": [{"name": "Liam", "role": "Legal Counsel"}, {"name": "Olivia", "role": "Paralegal"}]}]}} +{"id": 6, "name": "Sophia", "tags": ["cloud", "AWS"], "details": {"age": 31, "address": {"city": "San Diego", "zip": "92101"}}} +{"user": {"profile": {"username": "aaron_lee", "email": "aaron.lee@example.com"}, "skills": ["c++", "system programming"], "projects": [{"name": "Project Q", "status": "ongoing"}, {"name": "Project R", "status": "completed"}]}} +{"departments": [{"name": "Design", "employees": [{"id": 1101, "name": "Ella"}, {"id": 1102, "name": "Jack"}]}, {"name": "Strategy", "employees": [{"id": 1201, "name": "Mason"}, {"id": 1202, "name": "Ava"}]}]} +{"data": {"numeric": 987, "text": "test", "flag": false, "details": {"timestamp": "2024-08-27T15:00:00Z", "list": [9, 8, 7]}}} +{"company": {"departments": [{"department_name": "Sales", "employees": [{"name": "Aiden", "role": "Sales Director"}, {"name": "Emily", "role": "Sales Representative"}]}, {"department_name": "Development", "employees": [{"name": "James", "role": "Lead Developer"}, {"name": "Mia", "role": "Junior Developer"}]}]}} +{"id": 7, "name": "David", "tags": ["embedded systems", "IoT"], "details": {"age": 34, "address": {"city": "Boston", "zip": "02101"}}} +{"user": {"profile": {"username": "nina_garcia", "email": "nina.garcia@example.com"}, "skills": ["ruby", "backend"], "projects": [{"name": "Project S", "status": "completed"}, {"name": "Project T", "status": "pending"}]}} +{"departments": [{"name": "Customer Support", "employees": [{"id": 1301, "name": "Daniel"}, {"id": 1302, "name": "Sophia"}]}, {"name": "Administration", "employees": [{"id": 1401, "name": "Olivia"}, {"id": 1402, "name": "Ethan"}]}]} +{"data": {"numeric": 111, "text": "sample", "flag": true, "details": {"timestamp": "2024-08-28T16:00:00Z", "list": [1, 1, 1]}}} +{"company": {"departments": [{"department_name": "Operations", "employees": [{"name": "Ryan", "role": "Operations Manager"}, {"name": "Emma", "role": "Operations Analyst"}]}, {"department_name": "Product", "employees": [{"name": "Olivia", "role": "Product Owner"}, {"name": "Mason", "role": "Product Designer"}]}]}} diff --git a/integration_tests/src/test/resources/nested_escaped_strings.json b/integration_tests/src/test/resources/nested_escaped_strings.json new file mode 100644 index 00000000000..bd67e16aeb0 --- /dev/null +++ 
b/integration_tests/src/test/resources/nested_escaped_strings.json @@ -0,0 +1,55 @@ +{"data": {"a": "ABCDEFGHIJKLMNOPQRSTUVWXYZ"}} +{"data": {"a": "\a"}} +{"data": {"a": "\b"}} +{"data": {"a": "\c"}} +{"data": {"a": "\d"}} +{"data": {"a": "\e"}} +{"data": {"a": "\f"}} +{"data": {"a": "\g"}} +{"data": {"a": "\h"}} +{"data": {"a": "\i"}} +{"data": {"a": "\j"}} +{"data": {"a": "\k"}} +{"data": {"a": "\l"}} +{"data": {"a": "\m"}} +{"data": {"a": "\n"}} +{"data": {"a": "\o"}} +{"data": {"a": "\p"}} +{"data": {"a": "\q"}} +{"data": {"a": "\r"}} +{"data": {"a": "\s"}} +{"data": {"a": "\t"}} +{"data": {"a": "\u"}} +{"data": {"a": "\v"}} +{"data": {"a": "\w"}} +{"data": {"a": "\x"}} +{"data": {"a": "\y"}} +{"data": {"a": "\z"}} +{"data": {"a": "\\"}} +{"data": {"a": "\""}} +{"data": {"a": "\'"}} +{"data": {"a": "\u0000"}} +{"data": {"a": "\u0001"}} +{"data": {"a": "\u0002"}} +{"data": {"a": "\u0003"}} +{"data": {"a": "\u0004"}} +{"data": {"a": "This\ris\nA\ttest\u0009to\u000Asee\u000awhat\u000Bhappens"}} +{"data": {"a": "\u0031\u0034"}} +{"data": {"a": "\u0031\u0034\u002E\u0030\u0031"}} +{"data": {"a": "\'TESTING\'"}} +{"data": {"a": "This is a simple string"}} +{"data": {"a": "This string contains a newline character\n"}} +{"data": {"a": "This string contains a tab character\t"}} +{"data": {"a": "This string contains a backslash \\"}} +{"data": {"a": "This string contains a double quote \""}} +{"data": {"a": "This string contains a unicode character \u00A9"}} +{"data": {"a": "This string contains a smiley face \u263A"}} +{"data": {"a": "This string contains a backspace character\b"}} +{"data": {"a": "This string contains a form feed character\f"}} +{"data": {"a": "This string contains a form feed character\u000C"}} +{"data": {"a": "This string contains a carriage return\r"}} +{"data": {"a": "This string contains a carriage return\u000D"}} +{"data": {"a": "This string contains an illegal control character \u0007 (bell)"}} +{"data": {"a": "This string contains an illegal character \u001F (unit separator)"}} +{"data": {"a": "This string contains a mix of normal and escaped characters: \n \" \t \u0041"}} +{"data": {"a": "This string contains an illegal control character directly: \u0001"}} diff --git a/integration_tests/src/test/resources/timestamp_formatted_strings.json b/integration_tests/src/test/resources/timestamp_formatted_strings.json new file mode 100644 index 00000000000..d6399b86a0c --- /dev/null +++ b/integration_tests/src/test/resources/timestamp_formatted_strings.json @@ -0,0 +1,38 @@ +{"data": "2024-08-22"} +{"data": "2023-02-28T14:45:00Z"} +{"data": "2023-02-28T14:45:00.123Z"} +{"data": "2024-02-29T23:59:59.999Z"} +{"data": "not-a-date"} +{"data": "2024/08/22"} +{"data": "2023-02-28 14:45:00"} +{"data": "2023-02-28 14:45:00.123"} +{"data": "2020-02-25" } +{"data": "2020-02-25 14:46" } +{"data": "2020-02-25T14:46" } +{"data": "2020-02-25 14:46:00" } +{"data": "2020-02-25T14:46:00" } +{"data": "2020-02-25T14:46:00 " } +{"data": "2020-02-25 14:46:00.123" } +{"data": "2020-02-25T14:46:00.123" } +{"data": " 2020-02-25T14:46:00.123" } +{"data": "2020-02-25 14:46:00.123456" } +{"data": "2020-02-25T14:46:00.123456" } +{"data": "1900-01-01"} +{"data": "1969-12-31"} +{"data": "1970-01-01"} +{"data": "0001-01-01"} +{"data": "0999-12-31"} +{"data": "1899-12-31"} +{"data": "2023-02-28T14:45:00Z"} +{"data": "1969-07-20T20:17:40Z"} +{"data": "0001-01-01T00:00:00Z"} +{"data": "9999-12-31T23:59:59.999Z"} +{"data": "1960-04-15T12:30:45.123Z"} +{"data": "1945-05-08T00:01:00Z"} +{"data": 
"1970-01-01T00:00:00Z"} +{"data": "1865-04-09T12:00:00Z"} +{"data": "1815-06-18T10:30:00Z"} +{"data": "1582-10-15"} +{"data": "1899-12-31T23:59:59.999Z"} +{"data": "0000-12-31T23:59:59.999Z"} +{"data": "22-08-2024"} diff --git a/integration_tests/src/test/resources/timestamp_tz_formatted_strings.json b/integration_tests/src/test/resources/timestamp_tz_formatted_strings.json new file mode 100644 index 00000000000..8d36aa4a39b --- /dev/null +++ b/integration_tests/src/test/resources/timestamp_tz_formatted_strings.json @@ -0,0 +1,12 @@ +{"data": "2024-08-22T14:45:00-05:00"} +{"data": "2023-02-28T09:30:00+01:00"} +{"data": "1969-07-20T20:17:40-04:00"} +{"data": "1970-01-01T00:00:00+00:00"} +{"data": "1999-12-31T23:59:59-08:00"} +{"data": "2012-06-30T23:59:60+09:00"} +{"data": "1945-05-08T02:01:00+02:00"} +{"data": "2024-08-22T14:45:00+10:00"} +{"data": "2023-02-28T14:45:00-03:30"} +{"data": "2021-12-31T23:59:59+05:30"} +{"data": "2023-02-28T14:45:00.123-05:00"} + diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuJsonTuple.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuJsonTuple.scala index e8fbd7e5e61..3b7767117fb 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuJsonTuple.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuJsonTuple.scala @@ -24,7 +24,7 @@ import com.nvidia.spark.rapids.shims.ShimExpression import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.expressions.Expression -import org.apache.spark.sql.types.{DataType, StringType, StructField, StructType} +import org.apache.spark.sql.types.{StringType, StructField, StructType} import org.apache.spark.sql.vectorized.ColumnarBatch case class GpuJsonTuple(children: Seq[Expression]) extends GpuGenerator @@ -57,7 +57,6 @@ case class GpuJsonTuple(children: Seq[Expression]) extends GpuGenerator withRetry(inputBatches, splitSpillableInHalfByRows) { attempt => withResource(attempt.getColumnarBatch()) { inputBatch => val json = inputBatch.column(generatorOffset).asInstanceOf[GpuColumnVector].getBase - val schema = Array.fill[DataType](fieldExpressions.length)(StringType) val fieldInstructions = fieldExpressions.map { field => withResourceIfAllowed(field.columnarEvalAny(inputBatch)) { @@ -72,8 +71,8 @@ case class GpuJsonTuple(children: Seq[Expression]) extends GpuGenerator withResource(fieldInstructions.safeMap(field => JSONUtils.getJsonObject(json, field))) { resultCols => - val generatorCols = resultCols.safeMap(_.incRefCount).zip(schema).safeMap { - case (col, dataType) => GpuColumnVector.from(col, dataType) + val generatorCols = resultCols.safeMap(_.incRefCount).safeMap { + col => GpuColumnVector.from(col, StringType) } val nonGeneratorCols = (0 until generatorOffset).safeMap { i => inputBatch.column(i).asInstanceOf[GpuColumnVector].incRefCount