Skip to content

Commit

Permalink
Merge pull request IQSS#8067 from QualitativeDataRepository/IQSS/8066…
Browse files Browse the repository at this point in the history
…-bag_updates

IQSS/8066 bag updates
  • Loading branch information
kcondon authored Aug 24, 2021
2 parents d132d90 + 634c6f7 commit 8e29651
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 24 deletions.
3 changes: 3 additions & 0 deletions doc/release-notes/8066-bag_updates.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
### Bag update to support file paths

The original Bag generation code stored all dataset files directly under the /data directory. Dataverse later added a directory path for files, and then began allowing files with different paths to share the same name. Archival Bags now use the directory path from Dataverse to avoid name collisions within the /data directory. Prior to this update, a Bag generated from a Dataset containing multiple files with the same name would include only one of those files; warnings were written to the log, but the Bag was still generated.
53 changes: 29 additions & 24 deletions src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.ResourceBundle;
import java.util.Set;
import java.util.TreeSet;
import java.util.Map.Entry;
Expand All @@ -46,13 +45,11 @@
import org.apache.commons.compress.archivers.zip.ZipFile;
import org.apache.commons.compress.parallel.InputStreamSupplier;
import org.apache.commons.compress.utils.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.text.WordUtils;
import org.apache.commons.text.WordUtils;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
Expand Down Expand Up @@ -199,7 +196,7 @@ public static void println(String s) {
* @return success true/false
*/
public boolean generateBag(OutputStream outputStream) throws Exception {
logger.info("Generating: Bag to the Future!");


File tmp = File.createTempFile("qdr-scatter-dirs", "tmp");
dirs = ScatterZipOutputStream.fileBased(tmp);
Expand All @@ -208,6 +205,8 @@ public boolean generateBag(OutputStream outputStream) throws Exception {

bagID = aggregation.get("@id").getAsString() + "v."
+ aggregation.get(JsonLDTerm.schemaOrg("version").getLabel()).getAsString();

logger.info("Generating Bag: " + bagID);
try {
// Create valid filename from identifier and extend path with
// two levels of hash-based subdirs to help distribute files
Expand Down Expand Up @@ -525,11 +524,16 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce
String childTitle = child.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString();
if (titles.contains(childTitle)) {
logger.warning("**** Multiple items with the same title in: " + currentPath);
logger.warning("**** Will cause failure in hash and size validation.");
logger.warning("**** Will cause failure in hash and size validation in: " + bagID);
} else {
titles.add(childTitle);
}
String childPath = currentPath + childTitle;
JsonElement directoryLabel = child.get(JsonLDTerm.DVCore("directoryLabel").getLabel());
if(directoryLabel!=null) {
childPath=currentPath + directoryLabel.getAsString() + "/" + childTitle;
}


String childHash = null;
if (child.has(JsonLDTerm.checksum.getLabel())) {
Expand All @@ -538,19 +542,19 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce
if (hashtype == null) {
//If one wasn't set as a default, pick up what the first child with one uses
hashtype = childHashType;
}
if (hashtype != null && !hashtype.equals(childHashType)) {
logger.warning("Multiple hash values in use - will calculate " + hashtype.toString()
+ " hashes for " + childTitle);
} else {
childHash = child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@value").getAsString();
if (checksumMap.containsValue(childHash)) {
// Something else has this hash
logger.warning("Duplicate/Collision: " + child.get("@id").getAsString() + " has SHA1 Hash: "
+ childHash);
}
checksumMap.put(childPath, childHash);
}
}
if (hashtype != null && !hashtype.equals(childHashType)) {
logger.warning("Multiple hash values in use - will calculate " + hashtype.toString()
+ " hashes for " + childTitle);
} else {
childHash = child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@value").getAsString();
if (checksumMap.containsValue(childHash)) {
// Something else has this hash
logger.warning("Duplicate/Collision: " + child.get("@id").getAsString() + " has SHA1 Hash: "
+ childHash + " in: " + bagID);
}
checksumMap.put(childPath, childHash);
}
}
if ((hashtype == null) | ignorehashes) {
// Pick sha512 when ignoring hashes or none exist
Expand Down Expand Up @@ -860,11 +864,12 @@ private String generateInfoFile() {
}

/**
* Kludge - handle when a single string is sent as an array of 1 string and, for
* cases where multiple values are sent when only one is expected, create a
* concatenated string so that information is not lost.
* Kludge - compound values (e.g. for descriptions) are sent as an array of
* objects containing key/values whereas a single value is sent as one object.
* For cases where multiple values are sent, create a concatenated string so
* that information is not lost.
*
* @param jsonObject
* @param jsonElement
* - the root json object
* @param key
* - the key to find a value(s) for
Expand All @@ -883,7 +888,7 @@ String getSingleValue(JsonElement jsonElement, String key) {
stringArray.add(iter.next().getAsJsonObject().getAsJsonPrimitive(key).getAsString());
}
if (stringArray.size() > 1) {
val = StringUtils.join(stringArray.toArray(), ",");
val = String.join(",", stringArray);
} else {
val = stringArray.get(0);
}
Expand Down

0 comments on commit 8e29651

Please sign in to comment.