Skip to content

Commit

Permalink
fix(ingest/json-schema): handle property inheritance in unions (datah…
Browse files Browse the repository at this point in the history
  • Loading branch information
hsheth2 authored May 31, 2023
1 parent b42f518 commit a29b576
Show file tree
Hide file tree
Showing 2 changed files with 156 additions and 3 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import json
import logging
import unittest
import unittest.mock
from hashlib import md5
from typing import Any, Callable, Dict, Iterable, List, Optional, Type

Expand Down Expand Up @@ -455,7 +455,25 @@ def _field_from_complex_type(
(union_category, union_category_schema) = [
(k, v) for k, v in union_category_map.items() if v
][0]
if not field_path.has_field_name() and len(union_category_schema) == 1:
# Special case: If this is a top-level field AND there is only one type in the
# union, we collapse down the union to avoid extra nesting.
union_schema = union_category_schema[0]
merged_union_schema = (
JsonSchemaTranslator._retain_parent_schema_props_in_union(
union_schema=union_schema, parent_schema=schema
)
)
yield from JsonSchemaTranslator.get_fields(
JsonSchemaTranslator._get_type_from_schema(merged_union_schema),
merged_union_schema,
required=required,
base_field_path=field_path,
)
return # this one is done
if field_path.has_field_name():
# The frontend expects the top-level field to be a record, so we only
# include the UnionTypeClass if we're not at the top level.
yield SchemaField(
fieldPath=field_path.expand_type("union", schema).as_string(),
type=type_override or SchemaFieldDataTypeClass(UnionTypeClass()),
Expand All @@ -481,16 +499,41 @@ def _field_from_complex_type(
union_field_path._set_parent_type_if_not_exists(
DataHubType(type=UnionTypeClass, nested_type=union_type)
)
merged_union_schema = (
JsonSchemaTranslator._retain_parent_schema_props_in_union(
union_schema=union_schema, parent_schema=schema
)
)
yield from JsonSchemaTranslator.get_fields(
JsonSchemaTranslator._get_type_from_schema(union_schema),
union_schema,
JsonSchemaTranslator._get_type_from_schema(merged_union_schema),
merged_union_schema,
required=required,
base_field_path=union_field_path,
specific_type=union_type,
)
else:
raise Exception(f"Unhandled type {datahub_field_type}")

@staticmethod
def _retain_parent_schema_props_in_union(
union_schema: Dict, parent_schema: Dict
) -> Dict:
"""Merge the "properties" and the "required" fields from the parent schema into the child union schema."""

union_schema = union_schema.copy()
if "properties" in parent_schema:
union_schema["properties"] = {
**parent_schema["properties"],
**union_schema.get("properties", {}),
}
if "required" in parent_schema:
union_schema["required"] = [
*parent_schema["required"],
*union_schema.get("required", []),
]

return union_schema

@staticmethod
def get_type_mapping(json_type: str) -> Type:
return JsonSchemaTranslator.field_type_mapping.get(json_type, NullTypeClass)
Expand Down
110 changes: 110 additions & 0 deletions metadata-ingestion/tests/unit/schema/test_json_schema_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -710,3 +710,113 @@ def test_required_field():
assert fields[1].nullable is True
assert json.loads(fields[1].jsonProps or "{}")["required"] is True
assert json.loads(fields[0].jsonProps or "{}")["required"] is False


def test_anyof_with_properties():
# We expect the event / timestamp fields to be included in both branches of the anyOf.

schema = {
"$id": "test",
"$schema": "https://json-schema.org/draft/2020-12/schema",
"additionalProperties": False,
"anyOf": [{"required": ["anonymousId"]}, {"required": ["userId"]}],
"properties": {
"anonymousId": {
"description": "A temporary user id, used before a user logs in.",
"format": "uuid",
"type": "string",
},
"userId": {
"description": "Unique user id.",
"type": "string",
},
"event": {"description": "Unique name of the event.", "type": "string"},
"timestamp": {
"description": "Timestamp of when the message itself took place.",
"type": "string",
},
},
"required": ["event"],
"type": "object",
}

fields = list(JsonSchemaTranslator.get_fields_from_schema(schema))
expected_field_paths: List[str] = [
"[version=2.0].[type=union].[type=union_0].[type=string(uuid)].anonymousId",
"[version=2.0].[type=union].[type=union_0].[type=string].userId",
"[version=2.0].[type=union].[type=union_0].[type=string].event",
"[version=2.0].[type=union].[type=union_0].[type=string].timestamp",
"[version=2.0].[type=union].[type=union_1].[type=string(uuid)].anonymousId",
"[version=2.0].[type=union].[type=union_1].[type=string].userId",
"[version=2.0].[type=union].[type=union_1].[type=string].event",
"[version=2.0].[type=union].[type=union_1].[type=string].timestamp",
]
assert_field_paths_match(fields, expected_field_paths)
assert_fields_are_valid(fields)

# In the first one, the anonymousId is required, but the userId is not.
assert json.loads(fields[0].jsonProps or "{}")["required"] is True
assert json.loads(fields[1].jsonProps or "{}")["required"] is False

# In the second one, the userId is required, but the anonymousId is not.
assert json.loads(fields[4].jsonProps or "{}")["required"] is False
assert json.loads(fields[5].jsonProps or "{}")["required"] is True

# The event field is required in both branches.
assert json.loads(fields[2].jsonProps or "{}")["required"] is True
assert json.loads(fields[6].jsonProps or "{}")["required"] is True

# The timestamp field is not required in either branch.
assert json.loads(fields[3].jsonProps or "{}")["required"] is False
assert json.loads(fields[7].jsonProps or "{}")["required"] is False


def test_top_level_trival_allof():
schema = {
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "event-wrapper",
"type": "object",
"allOf": [
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "event",
"properties": {
"userId": {
"description": "Unique user id.",
"type": "string",
},
"event": {
"description": "Unique name of the event.",
"type": "string",
},
"timestamp": {
"description": "Timestamp of when the message itself took place.",
"type": "string",
},
},
"required": ["event"],
"type": "object",
"additionalProperties": False,
},
],
"properties": {
"extra-top-level-property": {
"type": "string",
},
},
}

fields = list(JsonSchemaTranslator.get_fields_from_schema(schema))
expected_field_paths: List[str] = [
"[version=2.0].[type=object].[type=string].extra-top-level-property",
"[version=2.0].[type=object].[type=string].userId",
"[version=2.0].[type=object].[type=string].event",
"[version=2.0].[type=object].[type=string].timestamp",
]
assert_field_paths_match(fields, expected_field_paths)
assert_fields_are_valid(fields)

assert json.loads(fields[0].jsonProps or "{}")["required"] is False
assert json.loads(fields[1].jsonProps or "{}")["required"] is False
assert json.loads(fields[2].jsonProps or "{}")["required"] is True
assert json.loads(fields[3].jsonProps or "{}")["required"] is False

0 comments on commit a29b576

Please sign in to comment.