Merge branch 'develop' into 7932-zip-download-limit

niomictomi · Jul 21, 2021 · 95b9223 · 95b9223
2 parents b0b9425 + 5557145
commit 95b9223
Show file tree

Hide file tree

Showing 10 changed files with 248 additions and 102 deletions.
diff --git a/doc/release-notes/7548-stored-procedure-update.md b/doc/release-notes/7548-stored-procedure-update.md
@@ -0,0 +1,15 @@
+### Upgrade Notes
+
+**If your installation relies on the database-side stored procedure for generating sequential numeric identifiers:**
+
+*(Note: You can skip the following paragraph if your installation uses the default-style, randomly-generated,
+six-character alphanumeric identifiers for your datasets!)*
+
+The underlying database framework has been modified in this release, to make it easier for installations 
+to create custom procedures for generating identifier strings that suit their needs. Your current configuration will 
+be automatically updated by the database upgrade (Flyway) script incorporated in the release. No manual configuration 
+changes should be necessary. However, after the upgrade, we recommend that you confirm that your installation can still 
+create new datasets, and that they are still assigned sequential numeric identifiers. In the unlikely event that this 
+is no longer working, please re-create the stored procedure following the steps described in the documentation for the 
+`:IdentifierGenerationStyle` setting in the *Configuration* section of the Installation Guide for this release (v5.6). 
+(Running the script supplied there will NOT overwrite the position on the sequence you are currently using!)
diff --git a/doc/sphinx-guides/source/_static/util/createsequence.sql b/doc/sphinx-guides/source/_static/util/createsequence.sql
@@ -1,14 +1,14 @@
--- A script for creating a numeric identifier sequence, and an external 
--- stored procedure, for accessing the sequence from inside the application, 
--- in a non-hacky, JPA way. 
+-- A script for creating a numeric identifier sequence, and an external
+-- stored procedure, for accessing the sequence from inside the application,
+-- in a non-hacky, JPA way.
 
 -- NOTE:
 
 -- 1. The database user name "dvnapp" is hard-coded here - it may
 -- need to be changed to match your database user name;
-  
+
 -- 2. In the code below, the sequence starts with 1, but it can be adjusted by
--- changing the MINVALUE as needed. 
+-- changing the MINVALUE as needed.
 
 CREATE SEQUENCE datasetidentifier_seq
   INCREMENT 1
@@ -22,12 +22,12 @@ ALTER TABLE datasetidentifier_seq OWNER TO "dvnapp";
 -- And now create a PostgreSQL FUNCTION, for JPA to
 -- access as a NamedStoredProcedure:
 
-CREATE OR REPLACE FUNCTION generateIdentifierAsSequentialNumber(
-    OUT identifier int)
-  RETURNS int AS
-$BODY$
+-- Returns the next value of datasetidentifier_seq, converted to varchar.
+-- Declared VOLATILE because nextval() advances the sequence on every call;
+-- a function with side effects must not be labelled IMMUTABLE or STABLE,
+-- otherwise the planner is allowed to pre-evaluate or reuse its result.
+CREATE OR REPLACE FUNCTION generateIdentifierFromStoredProcedure()
+RETURNS varchar AS $$
+DECLARE
+  identifier varchar;
 BEGIN
-    select nextval('datasetidentifier_seq') into identifier;
+  identifier := nextval('datasetidentifier_seq')::varchar;
+  RETURN identifier;
 END;
-$BODY$
-  LANGUAGE plpgsql;
+$$ LANGUAGE plpgsql VOLATILE;
diff --git a/doc/sphinx-guides/source/_static/util/identifier_from_timestamp.sql b/doc/sphinx-guides/source/_static/util/identifier_from_timestamp.sql
@@ -0,0 +1,46 @@
+-- A script for creating, through a database stored procedure, sequential
+-- 8 character identifiers from a base36 representation of the current timestamp.
+
+-- Encode the magnitude of "digits" as lowercase base36 text (0-9, a-z).
+-- The sign is discarded, so -35 and 35 both encode to 'z'; zero encodes to '0'.
+-- When min_width > 0, shorter results are left-padded with '0' up to
+-- min_width characters; longer results are never truncated.
+-- A NULL "digits" skips the encoding, so only the padding (if any) is returned.
+CREATE OR REPLACE FUNCTION base36_encode(
+  IN digits bigint, IN min_width int = 0)
+RETURNS varchar AS $$
+DECLARE
+    alphabet CONSTANT varchar := '0123456789abcdefghijklmnopqrstuvwxyz';
+    ret varchar := '';
+    val bigint := digits;
+BEGIN
+    -- The loop below emits nothing for zero, so produce its single digit here.
+    IF val = 0 THEN
+        ret := '0';
+    END IF;
+
+    -- Peel off the least significant base36 digit on each pass. Integer "/"
+    -- truncates toward zero and "%" takes the sign of the dividend, so taking
+    -- abs() of the remainder handles negative input without negating val
+    -- (negating the smallest bigint would raise an out-of-range error).
+    WHILE val != 0 LOOP
+        ret := substr(alphabet, (abs(val % 36) + 1)::int, 1) || ret;
+        val := val / 36;
+    END LOOP;
+
+    -- lpad() truncates strings longer than the target, hence the length guard.
+    IF min_width > 0 AND char_length(ret) < min_width THEN
+        ret := lpad(ret, min_width, '0');
+    END IF;
+
+    RETURN ret;
+END;
+$$ LANGUAGE plpgsql IMMUTABLE;
+
+
+-- Builds an identifier from the current time: milliseconds since the Unix
+-- epoch, encoded in base36 by base36_encode() (8 characters for timestamps
+-- up to the year 2059).
+-- Declared VOLATILE because the result changes from call to call; an
+-- IMMUTABLE function may be evaluated once and have its result reused.
+-- clock_timestamp() is used rather than now() because now() is frozen at the
+-- start of the transaction, which would hand out the same identifier to every
+-- call made within one transaction.
+-- NOTE: two calls within the same millisecond still yield the same value.
+CREATE OR REPLACE FUNCTION generateIdentifierFromStoredProcedure()
+RETURNS varchar AS $$
+DECLARE
+    curr_time_msec bigint;
+    identifier varchar;
+BEGIN
+    curr_time_msec := extract(epoch from clock_timestamp())*1000;
+    identifier := base36_encode(curr_time_msec);
+    RETURN identifier;
+END;
+$$ LANGUAGE plpgsql VOLATILE;
diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst
@@ -529,24 +529,25 @@ Lastly, go ahead and restart your Payara server. With Dataverse deployed and the
 S3 Storage Options
 ##################
 
-===========================================  ==================  =========================================================================  =============
-JVM Option                                   Value               Description                                                                Default value
-===========================================  ==================  =========================================================================  =============
-dataverse.files.storage-driver-id            <id>                Enable <id> as the default storage driver.                                 ``file``
-dataverse.files.<id>.bucket-name             <?>                 The bucket name. See above.                                                (none)
-dataverse.files.<id>.download-redirect       ``true``/``false``  Enable direct download or proxy through Dataverse.                         ``false``
-dataverse.files.<id>.upload-redirect         ``true``/``false``  Enable direct upload of files added to a dataset  to the S3 store.         ``false``
-dataverse.files.<id>.ingestsizelimit         <size in bytes>     Maximum size of directupload files that should be ingested                 (none)
-dataverse.files.<id>.url-expiration-minutes  <?>                 If direct uploads/downloads: time until links expire. Optional.            60
-dataverse.files.<id>.min-part-size           <?>                 Multipart direct uploads will occur for files larger than this. Optional.  ``1024**3``
-dataverse.files.<id>.custom-endpoint-url     <?>                 Use custom S3 endpoint. Needs URL either with or without protocol.         (none)
-dataverse.files.<id>.custom-endpoint-region  <?>                 Only used when using custom endpoint. Optional.                            ``dataverse``
-dataverse.files.<id>.proxy-url               <?>                 URL of a proxy protecting the S3 store. Optional.                          (none)
-dataverse.files.<id>.path-style-access       ``true``/``false``  Use path style buckets instead of subdomains. Optional.                    ``false``
-dataverse.files.<id>.payload-signing         ``true``/``false``  Enable payload signing. Optional                                           ``false``
-dataverse.files.<id>.chunked-encoding        ``true``/``false``  Disable chunked encoding. Optional                                         ``true``
-dataverse.files.<id>.connection-pool-size    <?>                 The maximum number of open connections to the S3 server                    ``256``
-===========================================  ==================  =========================================================================  =============
+===========================================  ==================  ==========================================================================  =============
+JVM Option                                   Value               Description                                                                 Default value
+===========================================  ==================  ==========================================================================  =============
+dataverse.files.storage-driver-id            <id>                Enable <id> as the default storage driver.                                  ``file``
+dataverse.files.<id>.bucket-name             <?>                 The bucket name. See above.                                                 (none)
+dataverse.files.<id>.download-redirect       ``true``/``false``  Enable direct download or proxy through Dataverse.                          ``false``
+dataverse.files.<id>.upload-redirect         ``true``/``false``  Enable direct upload of files added to a dataset  to the S3 store.          ``false``
+dataverse.files.<id>.ingestsizelimit         <size in bytes>     Maximum size of directupload files that should be ingested                  (none)
+dataverse.files.<id>.url-expiration-minutes  <?>                 If direct uploads/downloads: time until links expire. Optional.             60
+dataverse.files.<id>.min-part-size           <?>                 Multipart direct uploads will occur for files larger than this. Optional.   ``1024**3``
+dataverse.files.<id>.custom-endpoint-url     <?>                 Use custom S3 endpoint. Needs URL either with or without protocol.          (none)
+dataverse.files.<id>.custom-endpoint-region  <?>                 Only used when using custom endpoint. Optional.                             ``dataverse``
+dataverse.files.<id>.profile                 <?>                 Allows the use of AWS profiles for storage spanning multiple AWS accounts.  (none)
+dataverse.files.<id>.proxy-url               <?>                 URL of a proxy protecting the S3 store. Optional.                           (none)
+dataverse.files.<id>.path-style-access       ``true``/``false``  Use path style buckets instead of subdomains. Optional.                     ``false``
+dataverse.files.<id>.payload-signing         ``true``/``false``  Enable payload signing. Optional                                            ``false``
+dataverse.files.<id>.chunked-encoding        ``true``/``false``  Disable chunked encoding. Optional                                          ``true``
+dataverse.files.<id>.connection-pool-size    <?>                 The maximum number of open connections to the S3 server                     ``256``
+===========================================  ==================  ==========================================================================  =============
 
 Reported Working S3-Compatible Storage
 ######################################
@@ -1477,49 +1478,96 @@ Out of the box, the DOI shoulder is set to "FK2/" but this is for testing only!
 :IdentifierGenerationStyle
 ++++++++++++++++++++++++++
 
-By default, the Dataverse Software generates a random 6 character string, pre-pended by the Shoulder if set, to use as the identifier
-for a Dataset. Set this to ``sequentialNumber`` to use sequential numeric values
-instead (again pre-pended by the Shoulder if set). (the assumed default setting is ``randomString``).
-In addition to this setting, a database sequence must be created in the database.
-We provide the script below (downloadable :download:`here </_static/util/createsequence.sql>`).
-You may need to make some changes to suit your system setup, see the comments for more information:
+By default, the Dataverse Software generates a random 6 character string,
+pre-pended by the Shoulder if set, to use as the identifier for a Dataset.
+Set this to ``storedProcGenerated`` to instead generate a custom *unique*
+identifier (again pre-pended by the Shoulder if set) through a database
+stored procedure or function (the assumed default setting is ``randomString``).
+In addition to this setting, a stored procedure or function must be created in
+the database.
+
+As a first example, the script below (downloadable
+:download:`here </_static/util/createsequence.sql>`) produces
+sequential numerical values. You may need to make some changes to suit your
+system setup, see the comments for more information:
 
 .. literalinclude:: ../_static/util/createsequence.sql
+   :language: plpgsql
+
+As a second example, the script below (downloadable
+:download:`here </_static/util/identifier_from_timestamp.sql>`) produces
+sequential 8 character identifiers from a base36 representation of the current
+timestamp.
+
+.. literalinclude:: ../_static/util/identifier_from_timestamp.sql
+   :language: plpgsql
 
-Note that the SQL above is Postgres-specific. If necessary, it can be reimplemented
-in any other SQL flavor - the standard JPA code in the application simply expects
-the database to have a saved function ("stored procedure") named ``generateIdentifierAsSequentialNumber``
-with the single return argument ``identifier``.
+Note that the SQL in these example scripts is Postgres-specific.
+If necessary, it can be reimplemented in any other SQL flavor - the standard
+JPA code in the application simply expects the database to have a saved
+function ("stored procedure") named ``generateIdentifierFromStoredProcedure()``
+returning a single ``varchar`` value.
 
-Please note that ``:IdentifierGenerationStyle`` also plays a role for the "identifier" for files. See the section on ``:DataFilePIDFormat`` below for more details.
+Please note that ``:IdentifierGenerationStyle`` also plays a role for the
+"identifier" for files. See the section on :ref:`:DataFilePIDFormat` below for
+more details.
 
 .. _:DataFilePIDFormat:
 
 :DataFilePIDFormat
 ++++++++++++++++++
 
-This setting controls the way that the "identifier" component of a file's persistent identifier (PID) relates to the PID of its "parent" dataset.
-
-By default the identifier for a file is dependent on its parent dataset. For example, if the identifier of a dataset is "TJCLKP", the identifier for a file within that dataset will consist of the parent dataset's identifier followed by a slash ("/"), followed by a random 6 character string, yielding "TJCLKP/MLGWJO". Identifiers in this format are what you should expect if you leave ``:DataFilePIDFormat`` undefined or set it to ``DEPENDENT`` and have not changed the ``:IdentifierGenerationStyle`` setting from its default.
-
-Alternatively, the identifier for File PIDs can be configured to be independent of Dataset PIDs using the setting "``INDEPENDENT``". In this case, file PIDs will not contain the PIDs of their parent datasets, and their PIDs will be generated the exact same way that datasets' PIDs are, based on the ``:IdentifierGenerationStyle`` setting described above (random 6 character strings or sequential numbers, pre-pended by any shoulder).
-
-The chart below shows examples from each possible combination of parameters from the two settings. ``:IdentifierGenerationStyle`` can be either ``randomString`` (the default) or ``sequentialNumber`` and ``:DataFilePIDFormat`` can be either ``DEPENDENT`` (the default) or ``INDEPENDENT``. In the examples below the "identifier" for the dataset is "TJCLKP" for "randomString" and "100001" for "sequentialNumber".
-
-+-----------------+---------------+------------------+
-|                 | randomString  | sequentialNumber |
-|                 |               |                  |
-+=================+===============+==================+
-| **DEPENDENT**   | TJCLKP/MLGWJO | 100001/1         |
-+-----------------+---------------+------------------+
-| **INDEPENDENT** | MLGWJO        | 100002           |
-+-----------------+---------------+------------------+
-
-As seen above, in cases where ``:IdentifierGenerationStyle`` is set to *sequentialNumber* and ``:DataFilePIDFormat`` is set to *DEPENDENT*, each file within a dataset will be assigned a number *within* that dataset starting with "1".
-
-Otherwise, if ``:DataFilePIDFormat`` is set to *INDEPENDENT*, then each file will be assigned a PID with the next number in the overall sequence, regardless of what dataset it is in. If the file is created after a dataset with the PID 100001, then the file will be assigned the PID 100002. This option is functional, but it is not a recommended use case.
-
-Note that in either case, when using the ``sequentialNumber`` option, datasets and files share the same database sequence that was created as part of the setup described in ``:IdentifierGenerationStyle`` above.
+This setting controls the way that the "identifier" component of a file's
+persistent identifier (PID) relates to the PID of its "parent" dataset.
+
+By default the identifier for a file is dependent on its parent dataset.
+For example, if the identifier of a dataset is "TJCLKP", the identifier for
+a file within that dataset will consist of the parent dataset's identifier
+followed by a slash ("/"), followed by a random 6 character string,
+yielding "TJCLKP/MLGWJO". Identifiers in this format are what you should
+expect if you leave ``:DataFilePIDFormat`` undefined or set it to
+``DEPENDENT`` and have not changed the ``:IdentifierGenerationStyle``
+setting from its default.
+
+Alternatively, the identifier for File PIDs can be configured to be
+independent of Dataset PIDs using the setting ``INDEPENDENT``.
+In this case, file PIDs will not contain the PIDs of their parent datasets,
+and their PIDs will be generated the exact same way that datasets' PIDs are,
+based on the ``:IdentifierGenerationStyle`` setting described above
+(random 6 character strings or custom unique identifiers through a stored
+procedure, pre-pended by any shoulder).
+
+The chart below shows examples from each possible combination of parameters
+from the two settings. ``:IdentifierGenerationStyle`` can be either
+``randomString`` (the default) or ``storedProcGenerated`` and
+``:DataFilePIDFormat`` can be either ``DEPENDENT`` (the default) or
+``INDEPENDENT``. In the examples below the "identifier" for the dataset is
+"TJCLKP" for ``randomString`` and "100001" for ``storedProcGenerated`` (when
+using sequential numerical values, as described in
+:ref:`:IdentifierGenerationStyle` above), or "krby26qt" for
+``storedProcGenerated`` (when using base36 timestamps, as described in
+:ref:`:IdentifierGenerationStyle` above).
+
++-----------------+---------------+----------------------+---------------------+
+|                 | randomString  | storedProcGenerated  | storedProcGenerated |
+|                 |               |                      |                     |
+|                 |               | (sequential numbers) | (base36 timestamps) |
++=================+===============+======================+=====================+
+| **DEPENDENT**   | TJCLKP/MLGWJO | 100001/1             | krby26qt/1          |
++-----------------+---------------+----------------------+---------------------+
+| **INDEPENDENT** | MLGWJO        | 100002               | krby27pz            |
++-----------------+---------------+----------------------+---------------------+
+
+As seen above, in cases where ``:IdentifierGenerationStyle`` is set to
+``storedProcGenerated`` and ``:DataFilePIDFormat`` is set to ``DEPENDENT``,
+each file within a dataset will be assigned a number *within* that dataset
+starting with "1".
+
+Otherwise, if ``:DataFilePIDFormat`` is set to ``INDEPENDENT``, each file
+within the dataset is assigned a new PID, which is the next available
+identifier provided by the database stored procedure. In our example:
+"100002" when using sequential numbers or "krby27pz" when using base36
+timestamps.
 
 .. _:FilePIDsEnabled:
 
@@ -2257,4 +2305,4 @@ In the DDI metadata exports, the default behavior is to always add the repositor
 A comma-separated list of field type names that should be 'withheld' when dataset access occurs via a Private Url with Anonymized Access (e.g. to support anonymized review). 
 A suggested minimum includes author, datasetContact, and contributor, but additional fields such as depositor, grantNumber, and publication might also need to be included.
 
-``curl -X PUT -d 'author, datasetContact, contributor, depositor, grantNumber, publication' http://localhost:8080/api/admin/settings/:AnonymizedFieldTypeNames`` 
+``curl -X PUT -d 'author, datasetContact, contributor, depositor, grantNumber, publication' http://localhost:8080/api/admin/settings/:AnonymizedFieldTypeNames``