# inst/database/schema.yml

# By default first element is primary key, unless `primary_key` is
# given (may be multiple columns)
#
# By default fields are marked not null unless `nullable: true`

# This is just a standalone table with limited schema information.  I
# don't know what more we really need here.
orderly_schema:
  columns:
    # First column, so the primary key by this file's convention.
    - schema_version:
        type: TEXT
    - orderly_version:
        type: TEXT
    - created:
        type: TIMESTAMP

orderly_schema_tables:
  columns:
    # Single column; as the first column it is the primary key.
    - name:
        type: TEXT

# It might be more useful to put more information against this, but
# for now it doesn't really exist because all that information might
# vary with report version.  However, this could be used in order to
# support renames (VIMC-881), allowing us to override the previously
# used report name.
report:
  columns:
    # Primary key (first column).
    - name:
        type: TEXT
    # Optional reference to a row in report_version.
    - latest:
        type: TEXT
        nullable: true
        fk: report_version.id

# The core table
report_version:
  columns:
    # Primary key (first column).
    - id:
        type: TEXT
    # References report.name.
    - report:
        fk: report.name
    - date:
        type: TIMESTAMP
    # Optional text fields.
    - displayname:
        type: TEXT
        nullable: true
    - description:
        type: TEXT
        nullable: true
    - connection:
        type: BOOLEAN
    - published:
        type: BOOLEAN
    - elapsed:
        type: DOUBLE PRECISION
    # The git fields are all nullable.
    - git_sha:
        type: TEXT
        nullable: true
    - git_branch:
        type: TEXT
        nullable: true
    - git_clean:
        type: BOOLEAN
        nullable: true
    # NOTE: fields listed in orderly_config.yml will also be
    # included here.

# Custom fields - all coerced into text.
report_version_custom_fields:
  columns:
    # Primary key (first column).
    - id:
        type: SERIAL
    - report_version:
        fk: report_version.id
    # References custom_fields.id.
    - key:
        fk: custom_fields.id
    - value:
        type: TEXT

custom_fields:
  columns:
    # Primary key (first column); referenced by
    # report_version_custom_fields.key.
    - id:
        type: TEXT
    - description:
        type: TEXT
        nullable: true

# The extracted data
#
# I think that it would be nice to record the number of rows and
# columns here but that's going to require some additional work as
# that's not been recorded into any of the previous versions of the
# data.  So we'll have to get that into the rds/yml files and that
# requires some sort of data migration.  In an effort to simplify
# this, let's hold off for now.
data:
  columns:
    # Primary key (first column); referenced by
    # report_version_data.hash.
    - hash:
        type: TEXT
    - size_csv:
        type: BIGINT
    - size_rds:
        type: BIGINT

# Every time we see a new file it'll end up here.  This would be a
# useful starting place for deduplicating the orderly archive in a
# fairly safe way.  I don't think that we want to (necessarily) try
# and index all files but certainly key inputs and outputs.
file:
  columns:
    # Primary key (first column); referenced by file_input.file_hash
    # and file_artefact.file_hash.
    - hash:
        type: TEXT
    - size:
        type: BIGINT

# Enum table for types of file use
file_purpose:
  columns:
    # Primary key (first column).
    - name:
        type: TEXT
  # Fixed set of rows for this table, one per permitted purpose.
  values:
    - name: source
    - name: script
    - name: resource
    - name: orderly_yml
    - name: global
    - name: readme

# A realisation of a file requires a filename.  These are scoped by
# the report version and by the use within the report.
file_input:
  columns:
    # Primary key (first column).
    - id:
        type: SERIAL
    - report_version:
        fk: report_version.id
    - file_hash:
        fk: file.hash
    - filename:
        type: TEXT
    # In theory the purpose ('use') would be derivable from where the
    # other end of the key goes, but that would be very awkward to
    # work out, so it is recorded explicitly.
    - file_purpose:
        fk: file_purpose.name
    # This would be great to have but seems overkill
    # - mime: {type: TEXT}

# global files might have been renamed so we track their original
# names here:
file_input_global:
  columns:
    # Primary key (first column).
    - id:
        type: SERIAL
    # References file_input.id.
    - file_input:
        fk: file_input.id
    - filename:
        type: TEXT

# Link views into the report
report_version_view:
  columns:
    # Primary key (first column).
    - id:
        type: SERIAL
    - report_version:
        fk: report_version.id
    - name:
        type: TEXT
    - database:
        type: TEXT
    - query:
        type: TEXT

# This is the input to get the data
report_version_data:
  columns:
    # Primary key (first column).
    - id:
        type: SERIAL
    - report_version:
        fk: report_version.id
    - name:
        type: TEXT
    - database:
        type: TEXT
    - query:
        type: TEXT
    # References data.hash.
    - hash:
        fk: data.hash

# What versions of packages were loaded?
report_version_package:
  columns:
    # Primary key (first column).
    - id:
        type: SERIAL
    - report_version:
        fk: report_version.id
    - package_name:
        type: TEXT
    - package_version:
        type: TEXT

# Database instances the report version pulled data from
report_version_instance:
  columns:
    # Primary key (first column).
    - id:
        type: SERIAL
    - report_version:
        fk: report_version.id
    - type:
        type: TEXT
    - instance:
        type: TEXT

# Values here will be populated by orderly as we keep a list in the
# package and we should not repeat that definition.
artefact_format:
  columns:
    # Primary key (first column); referenced by
    # report_version_artefact.format.
    - name:
        type: TEXT

report_version_artefact:
  columns:
    # Primary key (first column).
    - id:
        type: SERIAL
    - report_version:
        fk: report_version.id
    # References artefact_format.name.
    - format:
        fk: artefact_format.name
    - description:
        type: TEXT
    - order:
        type: INTEGER

# This is _close_ to the same structure as file_input
file_artefact:
  columns:
    # Primary key (first column); referenced by depends.use.
    - id:
        type: SERIAL
    # References report_version_artefact.id.
    - artefact:
        fk: report_version_artefact.id
    - file_hash:
        fk: file.hash
    - filename:
        type: TEXT


# We could go more detailed here and get the report name and id but
# that would duplicate information so I'm not doing that here.  (The
# information is redundant in the yml/rds but that's ok I feel).
depends:
  columns:
    # Primary key (first column).
    - id:
        type: SERIAL
    - report_version:
        fk: report_version.id
    # References a row in file_artefact.
    - use:
        fk: file_artefact.id
    - as:
        type: TEXT
    - is_pinned:
        type: BOOLEAN
        comment: >-
          Was the requested id a specific id,
          rather than "latest"?
    - is_latest:
        type: BOOLEAN
        comment: >-
          Was this the latest version when run?
          Not necessarily the same as "NOT is_pinned"
          as a report might have been pinned to the
          most recent report when it was run.

# Changelog
changelog_label:
  columns:
    # Primary key (first column); referenced by changelog.label.
    - id:
        type: TEXT
    - public:
        type: BOOLEAN

changelog:
  columns:
    # Primary key (first column).
    - id:
        type: TEXT
    - report_version:
        fk: report_version.id
    # Optional second reference to report_version.id.
    - report_version_public:
        fk: report_version.id
        nullable: true
    # References changelog_label.id.
    - label:
        type: TEXT
        fk: changelog_label.id
    - value:
        type: TEXT
    - from_file:
        type: BOOLEAN
    - ordering:
        type: INTEGER

parameters_type:
  columns:
    # Primary key (first column); referenced by parameters.type.
    - name:
        type: TEXT
  # Fixed set of rows for this table, one per permitted type.
  values:
    - name: text
    - name: number
    - name: boolean

parameters:
  columns:
    # Primary key (first column).
    - id: {type: SERIAL}
    - report_version: {fk: report_version.id}
    - name: {type: TEXT}
    # Restricted to the names listed in parameters_type.
    - type: {fk: parameters_type.name}
    - value:
        type: TEXT
        comment: >-
          The value is always stored as text but might represent a
          number or a boolean.  If so then we store the text
          representation of the number or boolean.  Like JSON (and R
          in the way that these are being used) we don't distinguish
          between integers and floating point numbers, instead using
          just "number" for both.  When storing booleans, we store
          lower case "true"/"false", like in JSON, and not like R.

tag:
  columns:
    # Primary key (first column); referenced by
    # report_version_tag.tag.
    - id:
        type: TEXT

report_version_tag:
  columns:
    # Primary key (first column).
    - id:
        type: SERIAL
    - report_version:
        fk: report_version.id
    # References tag.id.
    - tag:
        fk: tag.id

report_batch:
  columns:
    # Primary key (first column); referenced by
    # report_version_batch.report_batch.
    - id:
        type: TEXT

report_version_batch:
  columns:
    # No explicit `primary_key`, so by this file's convention the
    # first column (report_version) is the primary key.
    - report_version:
        fk: report_version.id
    - report_batch:
        fk: report_batch.id

workflow:
  columns:
    # Primary key (first column).
    - id:
        type: TEXT

report_version_workflow:
  columns:
    # Link table between report_version and workflow.  Both columns
    # are declared as foreign keys, in the same way as
    # report_version_batch, rather than as free-standing TEXT.
    # No explicit `primary_key`, so by this file's convention the
    # first column (report_version) is the primary key.
    # NOTE(review): adding the fk declarations changes the generated
    # schema; confirm whether a schema version bump is required.
    - report_version: {fk: report_version.id}
    - workflow_id: {fk: workflow.id}