Commit

docs(model): auto-generated docs and hand-written docs for the metadata model (datahub-project#4189)
swaroopjagadish authored Feb 18, 2022
1 parent db8c215 commit eaf7b02
Showing 27 changed files with 1,173 additions and 65 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/documentation.yml
@@ -19,6 +19,11 @@ jobs:
uses: actions/setup-java@v1
with:
java-version: 1.8
- uses: actions/setup-python@v2
with:
python-version: 3.9.9
- name: Install Python dependencies
run: ./metadata-ingestion/scripts/install_deps.sh
- name: Build Docs
run: |
./gradlew --info docs-website:build
2 changes: 1 addition & 1 deletion docs-website/build.gradle
@@ -63,7 +63,7 @@ task generateGraphQLDocumentation(type: YarnTask, dependsOn: [yarnInstall, gener
args = ['docusaurus', 'docs:generate:graphql']
}

task yarnGenerate(type: YarnTask, dependsOn: [yarnInstall, generateGraphQLDocumentation] ) {
task yarnGenerate(type: YarnTask, dependsOn: [yarnInstall, generateGraphQLDocumentation, ':metadata-ingestion:modelDocGen'] ) {
inputs.files(projectMdFiles)
outputs.cacheIf { true }
args = ['run', 'generate']
13 changes: 11 additions & 2 deletions docs-website/generateDocsDir.ts
@@ -62,6 +62,14 @@ function list_markdown_files(): string[] {
.toString()
.trim()
.split("\n");
let all_generated_markdown_files = execSync(
"cd .. && ls docs/generated/metamodel/**/*.md"
)
.toString()
.trim()
.split("\n");
all_markdown_files = [...all_markdown_files, ...all_generated_markdown_files];

if (!process.env.CI) {
// If not in CI, we also include "untracked" files.
const untracked_files = execSync(
@@ -70,7 +78,7 @@ function list_markdown_files(): string[] {
.toString()
.trim()
.split("\n")
.filter((filepath) => filepath);
.filter((filepath) => !all_generated_markdown_files.includes(filepath));

if (untracked_files.length > 0) {
console.log(`Including untracked files in docs list: ${untracked_files}`);
@@ -88,6 +96,7 @@ function list_markdown_files(): string[] {
// Keep main docs for kubernetes, but skip the inner docs.
/^datahub-kubernetes\//,
// Various other docs/directories to ignore.
/^metadata-models\/docs\//, // these are used to generate docs, so we don't want to consider them here
/^metadata-ingestion-examples\//,
/^docker\/(?!README|datahub-upgrade|airflow\/local_airflow)/, // Drop all but a few docker docs.
/^docs\/rfc\/templates\/000-template\.md$/,
@@ -492,7 +501,7 @@ function write_markdown_file(
}

// Error if a doc is not accounted for in a sidebar.
const autogenerated_sidebar_directories = [];
const autogenerated_sidebar_directories = ["docs/generated/metamodel"];
for (const filepath of markdown_files) {
if (
autogenerated_sidebar_directories.some((dir) => filepath.startsWith(dir))
8 changes: 8 additions & 0 deletions docs-website/sidebars.js
@@ -112,6 +112,14 @@ module.exports = {
"Metadata Modeling": [
"docs/modeling/metadata-model",
"docs/modeling/extending-the-metadata-model",
{
Entities: [
{
type: "autogenerated",
dirName: "docs/generated/metamodel/entities", // '.' means the current docs folder
},
],
},
// TODO: change the titles of these, removing the "What is..." portion from the sidebar"
// "docs/what/entity",
// "docs/what/aspect",
41 changes: 17 additions & 24 deletions docs/modeling/metadata-model.md
@@ -36,12 +36,14 @@ Here is an example graph consisting of 3 types of entity (CorpUser, Chart, Dashb

DataHub's "core" Entity types model the Data Assets that comprise the Modern Data Stack. They include

1. **Data Platform**: A type of Data "Platform". That is, an external system that is involved in processing, storing, or visualizing Data Assets. Examples include MySQL, Snowflake, Redshift, and S3.
2. **Dataset**: A collection of data. Tables, Views, Streams, Document Collections, and Files are all modeled as "Datasets" on DataHub. Datasets can have tags, owners, links, glossary terms, and descriptions attached to them. They can also have specific sub-types, such as "View", "Collection", "Stream", "Explore", and more. Examples include Postgres Tables, MongoDB Collections, or S3 files.
3. **Chart**: A single data visualization derived from a Dataset. A single Chart can be a part of multiple Dashboards. Charts can have tags, owners, links, glossary terms, and descriptions attached to them. Examples include a Superset or Looker Chart.
4. **Dashboard**: A collection of Charts for visualization. Dashboards can have tags, owners, links, glossary terms, and descriptions attached to them. Examples include a Superset or Mode Dashboard.
5. **Data Job** (Task): An executable job that processes data assets, where "processing" implies consuming data, producing data, or both. Data Jobs can have tags, owners, links, glossary terms, and descriptions attached to them. They must belong to a single Data Flow. Examples include an Airflow Task.
6. **Data Flow** (Pipeline): An executable collection of Data Jobs with dependencies among them, or a DAG. Data Flows can have tags, owners, links, glossary terms, and descriptions attached to them. Examples include an Airflow DAG.
1. **[Data Platform](docs/generated/metamodel/entities/dataPlatform.md)**: A type of Data "Platform". That is, an external system that is involved in processing, storing, or visualizing Data Assets. Examples include MySQL, Snowflake, Redshift, and S3.
2. **[Dataset](docs/generated/metamodel/entities/dataset.md)**: A collection of data. Tables, Views, Streams, Document Collections, and Files are all modeled as "Datasets" on DataHub. Datasets can have tags, owners, links, glossary terms, and descriptions attached to them. They can also have specific sub-types, such as "View", "Collection", "Stream", "Explore", and more. Examples include Postgres Tables, MongoDB Collections, or S3 files.
3. **[Chart](docs/generated/metamodel/entities/chart.md)**: A single data visualization derived from a Dataset. A single Chart can be a part of multiple Dashboards. Charts can have tags, owners, links, glossary terms, and descriptions attached to them. Examples include a Superset or Looker Chart.
4. **[Dashboard](docs/generated/metamodel/entities/dashboard.md)**: A collection of Charts for visualization. Dashboards can have tags, owners, links, glossary terms, and descriptions attached to them. Examples include a Superset or Mode Dashboard.
5. **[Data Job](docs/generated/metamodel/entities/dataJob.md)** (Task): An executable job that processes data assets, where "processing" implies consuming data, producing data, or both. Data Jobs can have tags, owners, links, glossary terms, and descriptions attached to them. They must belong to a single Data Flow. Examples include an Airflow Task.
6. **[Data Flow](docs/generated/metamodel/entities/dataFlow.md)** (Pipeline): An executable collection of Data Jobs with dependencies among them, or a DAG. Data Flows can have tags, owners, links, glossary terms, and descriptions attached to them. Examples include an Airflow DAG.

See the **Metadata Modeling/Entities** section on the left to explore the entire model.
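
Each of these entities is addressed by a URN. As a minimal illustrative sketch (the platform, name, and id values below are made up; only `make_dataset_urn` is taken from the Python emitter helpers used elsewhere in this change), URNs for a few core entities look like this:

```python
from datahub.emitter.mce_builder import make_dataset_urn

# Dataset URNs can be built with a helper:
# urn:li:dataset:(urn:li:dataPlatform:hive,realestate_db.sales,PROD)
dataset_urn = make_dataset_urn(platform="hive", name="realestate_db.sales", env="PROD")

# Literal URN shapes for some other core entities (illustrative values):
data_platform_urn = "urn:li:dataPlatform:hive"
chart_urn = "urn:li:chart:(looker,my_chart_id)"
dashboard_urn = "urn:li:dashboard:(looker,my_dashboard_id)"
data_flow_urn = "urn:li:dataFlow:(airflow,my_dag,prod)"
data_job_urn = "urn:li:dataJob:(urn:li:dataFlow:(airflow,my_dag,prod),my_task_id)"
```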

## The Entity Registry

@@ -73,29 +75,20 @@ to the YAML configuration, instead of creating new Snapshot / Aspect files.
To explore the current DataHub metadata model, you can inspect this high-level picture, which shows the different entities and the edges that represent relationships between them.
![Metadata Model Graph](../imgs/datahub-metadata-model.png)

To navigate the aspect model for specific entities and explore relationships using the `foreign-key` concept, you can view them in our demo environment.
To navigate the aspect model for specific entities and explore relationships using the `foreign-key` concept, you can view them in our demo environment or navigate the auto-generated docs in the **Metadata Modeling/Entities** section on the left.

For example, here are helpful links to the most popular entities in DataHub's metadata model:
* Dataset: [Profile](https://demo.datahubproject.io/dataset/urn:li:dataset:(urn:li:dataPlatform:datahub,Dataset,PROD)/Schema?is_lineage_mode=false) [Documentation](https://demo.datahubproject.io/dataset/urn:li:dataset:(urn:li:dataPlatform:datahub,Dataset,PROD)/Documentation?is_lineage_mode=false)
* Dashboard: [Profile](https://demo.datahubproject.io/dataset/urn:li:dataset:(urn:li:dataPlatform:datahub,Dashboard,PROD)/Schema?is_lineage_mode=false) [Documentation](https://demo.datahubproject.io/dataset/urn:li:dataset:(urn:li:dataPlatform:datahub,Dashboard,PROD)/Documentation?is_lineage_mode=false)
* User (a.k.a CorpUser): [Profile](https://demo.datahubproject.io/dataset/urn:li:dataset:(urn:li:dataPlatform:datahub,Corpuser,PROD)/Schema?is_lineage_mode=false) [Documentation](https://demo.datahubproject.io/dataset/urn:li:dataset:(urn:li:dataPlatform:datahub,Corpuser,PROD)/Documentation?is_lineage_mode=false)
* Pipeline (a.k.a DataFlow): [Profile](https://demo.datahubproject.io/dataset/urn:li:dataset:(urn:li:dataPlatform:datahub,DataFlow,PROD)/Schema?is_lineage_mode=false) [Documentation](https://demo.datahubproject.io/dataset/urn:li:dataset:(urn:li:dataPlatform:datahub,DataFlow,PROD)/Documentation?is_lineage_mode=false)
* Feature Table (a.k.a. MLFeatureTable): [Profile](https://demo.datahubproject.io/dataset/urn:li:dataset:(urn:li:dataPlatform:datahub,MlFeatureTable,PROD)/Schema?is_lineage_mode=false) [Documentation](https://demo.datahubproject.io/dataset/urn:li:dataset:(urn:li:dataPlatform:datahub,MlFeatureTable,PROD)/Documentation?is_lineage_mode=false)
* For the full list of entities in the metadata model, browse them [here](https://demo.datahubproject.io/browse/dataset/prod/datahub/entities)

During metadata ingestion, these entities are represented using [metadata events](../what/mxe.md).
* [Dataset](docs/generated/metamodel/entities/dataset.md): [Profile](https://demo.datahubproject.io/dataset/urn:li:dataset:(urn:li:dataPlatform:datahub,Dataset,PROD)/Schema?is_lineage_mode=false) [Documentation](https://demo.datahubproject.io/dataset/urn:li:dataset:(urn:li:dataPlatform:datahub,Dataset,PROD)/Documentation?is_lineage_mode=false)
* [Dashboard](docs/generated/metamodel/entities/dashboard.md): [Profile](https://demo.datahubproject.io/dataset/urn:li:dataset:(urn:li:dataPlatform:datahub,Dashboard,PROD)/Schema?is_lineage_mode=false) [Documentation](https://demo.datahubproject.io/dataset/urn:li:dataset:(urn:li:dataPlatform:datahub,Dashboard,PROD)/Documentation?is_lineage_mode=false)
* [User (a.k.a CorpUser)](docs/generated/metamodel/entities/corpuser.md): [Profile](https://demo.datahubproject.io/dataset/urn:li:dataset:(urn:li:dataPlatform:datahub,Corpuser,PROD)/Schema?is_lineage_mode=false) [Documentation](https://demo.datahubproject.io/dataset/urn:li:dataset:(urn:li:dataPlatform:datahub,Corpuser,PROD)/Documentation?is_lineage_mode=false)
* [Pipeline (a.k.a DataFlow)](docs/generated/metamodel/entities/dataFlow.md): [Profile](https://demo.datahubproject.io/dataset/urn:li:dataset:(urn:li:dataPlatform:datahub,DataFlow,PROD)/Schema?is_lineage_mode=false) [Documentation](https://demo.datahubproject.io/dataset/urn:li:dataset:(urn:li:dataPlatform:datahub,DataFlow,PROD)/Documentation?is_lineage_mode=false)
* [Feature Table (a.k.a. MLFeatureTable)](docs/generated/metamodel/entities/mlFeatureTable.md): [Profile](https://demo.datahubproject.io/dataset/urn:li:dataset:(urn:li:dataPlatform:datahub,MlFeatureTable,PROD)/Schema?is_lineage_mode=false) [Documentation](https://demo.datahubproject.io/dataset/urn:li:dataset:(urn:li:dataPlatform:datahub,MlFeatureTable,PROD)/Documentation?is_lineage_mode=false)
* For the full list of entities in the metadata model, browse them [here](https://demo.datahubproject.io/browse/dataset/prod/datahub/entities) or use the **Metadata Modeling/Entities** section on the left.

### Generating documentation for the Metadata Model

The metadata model documentation can be generated and uploaded into a running DataHub instance using the command below.

```console
./gradlew :metadata-ingestion:modelDocUpload
```

**_NOTE_**: This will upload the model documentation to the DataHub instance specified by the environment variable `$DATAHUB_SERVER` (http://localhost:8080 by default).

It will also generate a few files under `metadata-ingestion/generated/docs` such as a dot file called `metadata_graph.dot` that you can use to visualize the relationships among the entities.
- This website: metadata model documentation is generated as part of `./gradlew :docs-website:yarnBuild`, which delegates model doc generation to the `modelDocGen` task in the `metadata-ingestion` module.
- Uploading documentation to a running DataHub instance: the metadata model documentation can be generated and uploaded with `./gradlew :metadata-ingestion:modelDocUpload`. **_NOTE_**: this uploads the model documentation to the DataHub instance specified by the environment variable `$DATAHUB_SERVER` (http://localhost:8080 by default).
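
For example, a minimal sketch of the two invocations (assuming a local DataHub instance; the inline environment variable is illustrative):

```console
# generate the entity docs only; output lands under docs/generated
./gradlew :metadata-ingestion:modelDocGen

# generate and upload to the instance at $DATAHUB_SERVER (http://localhost:8080 by default)
DATAHUB_SERVER=http://localhost:8080 ./gradlew :metadata-ingestion:modelDocUpload
```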

## Querying the Metadata Graph

@@ -5,11 +5,14 @@
import lombok.AllArgsConstructor;
import lombok.NoArgsConstructor;
import lombok.Value;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;



@Value
@NoArgsConstructor(force = true, access = AccessLevel.PRIVATE)
@AllArgsConstructor
@JsonIgnoreProperties(ignoreUnknown = true)
public class Entity {
String name;
String doc;
@@ -2,6 +2,7 @@ id: test-registry
entities:
- name: dataset
keyAspect: datasetKey
category: core
aspects:
- datasetProperties
- schemaMetadata
14 changes: 10 additions & 4 deletions metadata-ingestion/build.gradle
@@ -43,10 +43,15 @@ task installDev(type: Exec, dependsOn: [install]) {
"${venv_name}/bin/pip install -e .[dev] && touch ${venv_name}/.build_install_dev_sentinel"
}

task modelDocGen(type: Exec, dependsOn: [codegen, installDev]) {
inputs.files(project.fileTree(dir: "../metadata-events/mxe-schemas/src/", include: "**/*.avsc"))
outputs.dir('generated/docs')
commandLine 'bash', '-c', "source ${venv_name}/bin/activate && ./scripts/modeldocgen.sh"
task modelDocGen(type: Exec, dependsOn: [codegen]) {
inputs.files(
file('scripts/modeldocgen.py'),
project.fileTree(dir: "../metadata-models/docs/entities/", include: "**/*.md"),
project.fileTree(dir: "examples/", include: "**/*.py"),
project.fileTree(dir: "../metadata-events/mxe-schemas/src/", include: "**/*.avsc")
)
outputs.dir('../docs/generated')
commandLine 'bash', '-c', "source ${venv_name}/bin/activate && ./scripts/modeldocgen.sh"
}

task modelDocUpload(type: Exec, dependsOn: [modelDocGen]) {
@@ -127,5 +132,6 @@ clean {
delete 'build'
delete 'dist'
delete 'src/datahub/metadata'
delete '../docs/generated'
}
clean.dependsOn cleanPythonCache
@@ -30,9 +30,7 @@ def assertionUrn(info: AssertionInfo) -> str:
return builder.make_assertion_urn(assertionId)


def emitAssertionResult(
assertionResult: AssertionResult, datasetUrn: str
) -> None:
def emitAssertionResult(assertionResult: AssertionResult, datasetUrn: str) -> None:

dataset_assertionResult_mcp = MetadataChangeProposalWrapper(
entityType="dataset",
110 changes: 110 additions & 0 deletions metadata-ingestion/examples/library/dataset_add_column_tag.py
@@ -0,0 +1,110 @@
import logging
import time

from datahub.emitter.mce_builder import make_dataset_urn, make_tag_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper

# read-modify-write requires access to the DataHubGraph (RestEmitter is not enough)
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph

# Imports for metadata model classes
from datahub.metadata.schema_classes import (
AuditStampClass,
ChangeTypeClass,
EditableSchemaFieldInfoClass,
EditableSchemaMetadataClass,
GlobalTagsClass,
TagAssociationClass,
)

log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


def get_simple_field_path_from_v2_field_path(field_path: str) -> str:
"""A helper function to extract simple . path notation from the v2 field path"""
if field_path.startswith("[version=2.0]"):
# this is a v2 field path
tokens = [
t
for t in field_path.split(".")
if not (t.startswith("[") or t.endswith("]"))
]
path = ".".join(tokens)
return path
else:
# not a v2, we assume this is a simple path
return field_path
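# Illustrative usage (hypothetical v2 field path, not from this commit):
#   get_simple_field_path_from_v2_field_path(
#       "[version=2.0].[type=struct].address.[type=string].zipcode"
#   )
#   -> "address.zipcode"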


# Inputs -> the column, dataset and the tag to set
column = "address.zipcode"
dataset_urn = make_dataset_urn(platform="hive", name="realestate_db.sales", env="PROD")
tag_to_add = make_tag_urn("location")


# First we get the current editable schema metadata
gms_endpoint = "http://localhost:8080"
graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))


current_editable_schema_metadata = graph.get_aspect(
entity_urn=dataset_urn,
aspect="editableSchemaMetadata",
aspect_type=EditableSchemaMetadataClass,
)


# Some pre-built objects to help all the conditional pathways
tag_association_to_add = TagAssociationClass(tag=tag_to_add)
tags_aspect_to_set = GlobalTagsClass(tags=[tag_association_to_add])
field_info_to_set = EditableSchemaFieldInfoClass(
fieldPath=column, globalTags=tags_aspect_to_set
)


need_write = False
field_match = False
if current_editable_schema_metadata:
for fieldInfo in current_editable_schema_metadata.editableSchemaFieldInfo:
if get_simple_field_path_from_v2_field_path(fieldInfo.fieldPath) == column:
# we have some editable schema metadata for this field
field_match = True
if fieldInfo.globalTags:
if tag_to_add not in [x.tag for x in fieldInfo.globalTags.tags]:
# this tag is not present
fieldInfo.globalTags.tags.append(tag_association_to_add)
need_write = True
else:
fieldInfo.globalTags = tags_aspect_to_set
need_write = True

if not field_match:
# this field isn't present in the editable schema metadata aspect, add it
field_info = field_info_to_set
current_editable_schema_metadata.editableSchemaFieldInfo.append(field_info)
need_write = True

else:
# create a brand new editable schema metadata aspect
now = int(time.time() * 1000) # milliseconds since epoch
current_timestamp = AuditStampClass(time=now, actor="urn:li:corpuser:ingestion")
current_editable_schema_metadata = EditableSchemaMetadataClass(
editableSchemaFieldInfo=[field_info_to_set],
created=current_timestamp,
)
need_write = True

if need_write:
event: MetadataChangeProposalWrapper = MetadataChangeProposalWrapper(
entityType="dataset",
changeType=ChangeTypeClass.UPSERT,
entityUrn=dataset_urn,
aspectName="editableSchemaMetadata",
aspect=current_editable_schema_metadata,
)
graph.emit(event)
log.info(f"Tag {tag_to_add} added to column {column} of dataset {dataset_urn}")

else:
log.info(f"Tag {tag_to_add} already attached to column {column}, omitting write")