Skip to content

Commit

Permalink
JSON tests for corrected date, timestamp, and mixed types (#11388)
Browse files Browse the repository at this point in the history
Signed-off-by: Robert (Bobby) Evans <[email protected]>
  • Loading branch information
revans2 authored Aug 23, 2024
1 parent d53de06 commit 6148257
Show file tree
Hide file tree
Showing 7 changed files with 449 additions and 32 deletions.
318 changes: 290 additions & 28 deletions integration_tests/src/main/python/json_matrix_test.py

Large diffs are not rendered by default.

16 changes: 16 additions & 0 deletions integration_tests/src/test/resources/escaped_strings.json
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,19 @@
{"data": "\u0031\u0034\u002E\u0030\u0031"}
{"data": "\'TESTING\'"}
{"\u0064\u0061t\u0061": "TEST"}
{"data": "This is a simple string"}
{"data": "This string contains a newline character\n"}
{"data": "This string contains a tab character\t"}
{"data": "This string contains a backslash \\"}
{"data": "This string contains a double quote \""}
{"data": "This string contains a unicode character \u00A9"}
{"data": "This string contains a smiley face \u263A"}
{"data": "This string contains a backspace character\b"}
{"data": "This string contains a form feed character\f"}
{"data": "This string contains a form feed character\u000C"}
{"data": "This string contains a carriage return\r"}
{"data": "This string contains a carriage return\u000D"}
{"data": "This string contains an illegal control character \u0007 (bell)"}
{"data": "This string contains an illegal character \u001F (unit separator)"}
{"data": "This string contains a mix of normal and escaped characters: \n \" \t \u0041"}
{"data": "This string contains an illegal control character directly: \u0001"}
35 changes: 35 additions & 0 deletions integration_tests/src/test/resources/mixed_objects.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
{"id": 1,"name": "John","tags": ["developer", "python"],"details": {"age": 30,"address": {"city": "San Francisco","zip": "94105"}}}
{"user": { "profile": { "username": "jane_doe","email": "[email protected]"},"skills": ["java", "spark", "sql"],"projects": [{"name": "Project X", "status": "completed"},{"name": "Project Y", "status": "ongoing"}]}}
{"departments": [{"name": "Engineering","employees": [{"id": 101, "name": "Alice"},{"id": 102, "name": "Bob"}]},{"name": "Sales","employees": [{"id": 201, "name": "Charlie"},{"id": 202, "name": "David"}]}]}
{"data": {"numeric": 123, "text": "example", "flag": true, "details": { "timestamp": "2024-08-22T10:00:00Z","list": [1, 2, 3]}}}
{"company": {"departments": [{"department_name": "HR","employees": [{"name": "Emily", "role": "Recruiter"},{"name": "Frank", "role": "HR Manager"}]},{"department_name": "IT","employees": [{"name": "Grace", "role": "Software Engineer"},{"name": "Hank", "role": "System Admin"}]}]}}
{"id": 2, "name": "Alice", "tags": ["data scientist", "R"], "details": {"age": 28, "address": {"city": "New York", "zip": "10001"}}}
{"user": {"profile": {"username": "bob_smith", "email": "[email protected]"}, "skills": ["javascript", "node.js"], "projects": [{"name": "Project A", "status": "completed"}, {"name": "Project B", "status": "pending"}]}}
{"departments": [{"name": "Marketing", "employees": [{"id": 301, "name": "Tom"}, {"id": 302, "name": "Jerry"}]}, {"name": "Finance", "employees": [{"id": 401, "name": "Mickey"}, {"id": 402, "name": "Donald"}]}]}
{"data": {"numeric": 456, "text": "sample", "flag": false, "details": {"timestamp": "2024-08-23T11:00:00Z", "list": [4, 5, 6]}}}
{"company": {"departments": [{"department_name": "Sales", "employees": [{"name": "Olivia", "role": "Sales Manager"}, {"name": "Liam", "role": "Sales Associate"}]}, {"department_name": "Engineering", "employees": [{"name": "Noah", "role": "DevOps Engineer"}, {"name": "Emma", "role": "Frontend Developer"}]}]}}
{"id": 3, "name": "Robert", "tags": ["backend", "java"], "details": {"age": 35, "address": {"city": "Seattle", "zip": "98101"}}}
{"user": {"profile": {"username": "carol_jones", "email": "[email protected]"}, "skills": ["python", "machine learning"], "projects": [{"name": "Project Z", "status": "ongoing"}, {"name": "Project W", "status": "completed"}]}}
{"departments": [{"name": "HR", "employees": [{"id": 501, "name": "Sophia"}, {"id": 502, "name": "Jackson"}]}, {"name": "IT", "employees": [{"id": 601, "name": "Aiden"}, {"id": 602, "name": "Lucas"}]}]}
{"data": {"numeric": 789, "text": "test", "flag": true, "details": {"timestamp": "2024-08-24T12:00:00Z", "list": [7, 8, 9]}}}
{"company": {"departments": [{"department_name": "Customer Support", "employees": [{"name": "Mia", "role": "Support Specialist"}, {"name": "Ethan", "role": "Support Manager"}]}, {"department_name": "Development", "employees": [{"name": "Isabella", "role": "Backend Developer"}, {"name": "James", "role": "Frontend Developer"}]}]}}
{"id": 4, "name": "Emily", "tags": ["UI/UX", "design"], "details": {"age": 29, "address": {"city": "Los Angeles", "zip": "90001"}}}
{"user": {"profile": {"username": "david_clark", "email": "[email protected]"}, "skills": ["sql", "data analysis"], "projects": [{"name": "Project M", "status": "completed"}, {"name": "Project N", "status": "pending"}]}}
{"departments": [{"name": "Operations", "employees": [{"id": 701, "name": "Ella"}, {"id": 702, "name": "Liam"}]}, {"name": "Legal", "employees": [{"id": 801, "name": "Ava"}, {"id": 802, "name": "William"}]}]}
{"data": {"numeric": 321, "text": "data", "flag": false, "details": {"timestamp": "2024-08-25T13:00:00Z", "list": [3, 2, 1]}}}
{"company": {"departments": [{"department_name": "Product", "employees": [{"name": "Jack", "role": "Product Manager"}, {"name": "Sophia", "role": "Product Designer"}]}, {"department_name": "Marketing", "employees": [{"name": "Oliver", "role": "Content Writer"}, {"name": "Charlotte", "role": "SEO Specialist"}]}]}}
{"id": 5, "name": "Michael", "tags": ["full-stack", "ruby"], "details": {"age": 32, "address": {"city": "Austin", "zip": "73301"}}}
{"user": {"profile": {"username": "lisa_white", "email": "[email protected]"}, "skills": ["php", "web development"], "projects": [{"name": "Project O", "status": "completed"}, {"name": "Project P", "status": "ongoing"}]}}
{"departments": [{"name": "Research", "employees": [{"id": 901, "name": "Benjamin"}, {"id": 902, "name": "Mia"}]}, {"name": "Training", "employees": [{"id": 1001, "name": "Zoe"}, {"id": 1002, "name": "Ryan"}]}]}
{"data": {"numeric": 654, "text": "example", "flag": true, "details": {"timestamp": "2024-08-26T14:00:00Z", "list": [6, 5, 4]}}}
{"company": {"departments": [{"department_name": "Finance", "employees": [{"name": "Lucas", "role": "Financial Analyst"}, {"name": "Emma", "role": "Finance Director"}]}, {"department_name": "Legal", "employees": [{"name": "Liam", "role": "Legal Counsel"}, {"name": "Olivia", "role": "Paralegal"}]}]}}
{"id": 6, "name": "Sophia", "tags": ["cloud", "AWS"], "details": {"age": 31, "address": {"city": "San Diego", "zip": "92101"}}}
{"user": {"profile": {"username": "aaron_lee", "email": "[email protected]"}, "skills": ["c++", "system programming"], "projects": [{"name": "Project Q", "status": "ongoing"}, {"name": "Project R", "status": "completed"}]}}
{"departments": [{"name": "Design", "employees": [{"id": 1101, "name": "Ella"}, {"id": 1102, "name": "Jack"}]}, {"name": "Strategy", "employees": [{"id": 1201, "name": "Mason"}, {"id": 1202, "name": "Ava"}]}]}
{"data": {"numeric": 987, "text": "test", "flag": false, "details": {"timestamp": "2024-08-27T15:00:00Z", "list": [9, 8, 7]}}}
{"company": {"departments": [{"department_name": "Sales", "employees": [{"name": "Aiden", "role": "Sales Director"}, {"name": "Emily", "role": "Sales Representative"}]}, {"department_name": "Development", "employees": [{"name": "James", "role": "Lead Developer"}, {"name": "Mia", "role": "Junior Developer"}]}]}}
{"id": 7, "name": "David", "tags": ["embedded systems", "IoT"], "details": {"age": 34, "address": {"city": "Boston", "zip": "02101"}}}
{"user": {"profile": {"username": "nina_garcia", "email": "[email protected]"}, "skills": ["ruby", "backend"], "projects": [{"name": "Project S", "status": "completed"}, {"name": "Project T", "status": "pending"}]}}
{"departments": [{"name": "Customer Support", "employees": [{"id": 1301, "name": "Daniel"}, {"id": 1302, "name": "Sophia"}]}, {"name": "Administration", "employees": [{"id": 1401, "name": "Olivia"}, {"id": 1402, "name": "Ethan"}]}]}
{"data": {"numeric": 111, "text": "sample", "flag": true, "details": {"timestamp": "2024-08-28T16:00:00Z", "list": [1, 1, 1]}}}
{"company": {"departments": [{"department_name": "Operations", "employees": [{"name": "Ryan", "role": "Operations Manager"}, {"name": "Emma", "role": "Operations Analyst"}]}, {"department_name": "Product", "employees": [{"name": "Olivia", "role": "Product Owner"}, {"name": "Mason", "role": "Product Designer"}]}]}}
55 changes: 55 additions & 0 deletions integration_tests/src/test/resources/nested_escaped_strings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
{"data": {"a": "ABCDEFGHIJKLMNOPQRSTUVWXYZ"}}
{"data": {"a": "\a"}}
{"data": {"a": "\b"}}
{"data": {"a": "\c"}}
{"data": {"a": "\d"}}
{"data": {"a": "\e"}}
{"data": {"a": "\f"}}
{"data": {"a": "\g"}}
{"data": {"a": "\h"}}
{"data": {"a": "\i"}}
{"data": {"a": "\j"}}
{"data": {"a": "\k"}}
{"data": {"a": "\l"}}
{"data": {"a": "\m"}}
{"data": {"a": "\n"}}
{"data": {"a": "\o"}}
{"data": {"a": "\p"}}
{"data": {"a": "\q"}}
{"data": {"a": "\r"}}
{"data": {"a": "\s"}}
{"data": {"a": "\t"}}
{"data": {"a": "\u"}}
{"data": {"a": "\v"}}
{"data": {"a": "\w"}}
{"data": {"a": "\x"}}
{"data": {"a": "\y"}}
{"data": {"a": "\z"}}
{"data": {"a": "\\"}}
{"data": {"a": "\""}}
{"data": {"a": "\'"}}
{"data": {"a": "\u0000"}}
{"data": {"a": "\u0001"}}
{"data": {"a": "\u0002"}}
{"data": {"a": "\u0003"}}
{"data": {"a": "\u0004"}}
{"data": {"a": "This\ris\nA\ttest\u0009to\u000Asee\u000awhat\u000Bhappens"}}
{"data": {"a": "\u0031\u0034"}}
{"data": {"a": "\u0031\u0034\u002E\u0030\u0031"}}
{"data": {"a": "\'TESTING\'"}}
{"data": {"a": "This is a simple string"}}
{"data": {"a": "This string contains a newline character\n"}}
{"data": {"a": "This string contains a tab character\t"}}
{"data": {"a": "This string contains a backslash \\"}}
{"data": {"a": "This string contains a double quote \""}}
{"data": {"a": "This string contains a unicode character \u00A9"}}
{"data": {"a": "This string contains a smiley face \u263A"}}
{"data": {"a": "This string contains a backspace character\b"}}
{"data": {"a": "This string contains a form feed character\f"}}
{"data": {"a": "This string contains a form feed character\u000C"}}
{"data": {"a": "This string contains a carriage return\r"}}
{"data": {"a": "This string contains a carriage return\u000D"}}
{"data": {"a": "This string contains an illegal control character \u0007 (bell)"}}
{"data": {"a": "This string contains an illegal character \u001F (unit separator)"}}
{"data": {"a": "This string contains a mix of normal and escaped characters: \n \" \t \u0041"}}
{"data": {"a": "This string contains an illegal control character directly: \u0001"}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{"data": "2024-08-22"}
{"data": "2023-02-28T14:45:00Z"}
{"data": "2023-02-28T14:45:00.123Z"}
{"data": "2024-02-29T23:59:59.999Z"}
{"data": "not-a-date"}
{"data": "2024/08/22"}
{"data": "2023-02-28 14:45:00"}
{"data": "2023-02-28 14:45:00.123"}
{"data": "2020-02-25" }
{"data": "2020-02-25 14:46" }
{"data": "2020-02-25T14:46" }
{"data": "2020-02-25 14:46:00" }
{"data": "2020-02-25T14:46:00" }
{"data": "2020-02-25T14:46:00 " }
{"data": "2020-02-25 14:46:00.123" }
{"data": "2020-02-25T14:46:00.123" }
{"data": " 2020-02-25T14:46:00.123" }
{"data": "2020-02-25 14:46:00.123456" }
{"data": "2020-02-25T14:46:00.123456" }
{"data": "1900-01-01"}
{"data": "1969-12-31"}
{"data": "1970-01-01"}
{"data": "0001-01-01"}
{"data": "0999-12-31"}
{"data": "1899-12-31"}
{"data": "2023-02-28T14:45:00Z"}
{"data": "1969-07-20T20:17:40Z"}
{"data": "0001-01-01T00:00:00Z"}
{"data": "9999-12-31T23:59:59.999Z"}
{"data": "1960-04-15T12:30:45.123Z"}
{"data": "1945-05-08T00:01:00Z"}
{"data": "1970-01-01T00:00:00Z"}
{"data": "1865-04-09T12:00:00Z"}
{"data": "1815-06-18T10:30:00Z"}
{"data": "1582-10-15"}
{"data": "1899-12-31T23:59:59.999Z"}
{"data": "0000-12-31T23:59:59.999Z"}
{"data": "22-08-2024"}
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{"data": "2024-08-22T14:45:00-05:00"}
{"data": "2023-02-28T09:30:00+01:00"}
{"data": "1969-07-20T20:17:40-04:00"}
{"data": "1970-01-01T00:00:00+00:00"}
{"data": "1999-12-31T23:59:59-08:00"}
{"data": "2012-06-30T23:59:60+09:00"}
{"data": "1945-05-08T02:01:00+02:00"}
{"data": "2024-08-22T14:45:00+10:00"}
{"data": "2023-02-28T14:45:00-03:30"}
{"data": "2021-12-31T23:59:59+05:30"}
{"data": "2023-02-28T14:45:00.123-05:00"}

Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import com.nvidia.spark.rapids.shims.ShimExpression

import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
import org.apache.spark.sql.catalyst.expressions.Expression
-import org.apache.spark.sql.types.{DataType, StringType, StructField, StructType}
+import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.vectorized.ColumnarBatch

case class GpuJsonTuple(children: Seq[Expression]) extends GpuGenerator
Expand Down Expand Up @@ -57,7 +57,6 @@ case class GpuJsonTuple(children: Seq[Expression]) extends GpuGenerator
withRetry(inputBatches, splitSpillableInHalfByRows) { attempt =>
withResource(attempt.getColumnarBatch()) { inputBatch =>
val json = inputBatch.column(generatorOffset).asInstanceOf[GpuColumnVector].getBase
-val schema = Array.fill[DataType](fieldExpressions.length)(StringType)

val fieldInstructions = fieldExpressions.map { field =>
withResourceIfAllowed(field.columnarEvalAny(inputBatch)) {
Expand All @@ -72,8 +71,8 @@ case class GpuJsonTuple(children: Seq[Expression]) extends GpuGenerator

withResource(fieldInstructions.safeMap(field => JSONUtils.getJsonObject(json, field))) {
resultCols =>
-val generatorCols = resultCols.safeMap(_.incRefCount).zip(schema).safeMap {
-case (col, dataType) => GpuColumnVector.from(col, dataType)
+val generatorCols = resultCols.safeMap(_.incRefCount).safeMap {
+col => GpuColumnVector.from(col, StringType)
}
val nonGeneratorCols = (0 until generatorOffset).safeMap { i =>
inputBatch.column(i).asInstanceOf[GpuColumnVector].incRefCount
Expand Down

0 comments on commit 6148257

Please sign in to comment.