Skip to content

Commit

Permalink
New framework for file type recognition: new version of Jhove; update…
Browse files Browse the repository at this point in the history
…d file utility methods; mime.types file to be supplied with the application.
  • Loading branch information
landreev committed Apr 8, 2014
1 parent a212a2d commit 313d3a9
Show file tree
Hide file tree
Showing 14 changed files with 1,150 additions and 2 deletions.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Minimal POM for the JHOVE handler artifact. It declares only the Maven
  coordinates edu.harvard.hul.ois.jhove:jhove-handler:1.11.0; no dependencies,
  packaging, or build settings are specified here.
  NOTE(review): presumably accompanies a jar checked in alongside it so the
  artifact can be resolved without a public repository; confirm.
-->
<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd" xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<modelVersion>4.0.0</modelVersion>
<groupId>edu.harvard.hul.ois.jhove</groupId>
<artifactId>jhove-handler</artifactId>
<version>1.11.0</version>
</project>
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Minimal POM for the JHOVE module artifact. It declares only the Maven
  coordinates edu.harvard.hul.ois.jhove:jhove-module:1.11.0; no dependencies,
  packaging, or build settings are specified here.
  NOTE(review): presumably accompanies a jar checked in alongside it so the
  artifact can be resolved without a public repository; confirm.
-->
<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd" xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<modelVersion>4.0.0</modelVersion>
<groupId>edu.harvard.hul.ois.jhove</groupId>
<artifactId>jhove-module</artifactId>
<version>1.11.0</version>
</project>
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Minimal POM for the core JHOVE artifact. It declares only the Maven
  coordinates edu.harvard.hul.ois.jhove:jhove:1.11.0; no dependencies,
  packaging, or build settings are specified here.
  NOTE(review): presumably accompanies a jar checked in alongside it so the
  artifact can be resolved without a public repository; confirm.
-->
<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd" xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<modelVersion>4.0.0</modelVersion>
<groupId>edu.harvard.hul.ois.jhove</groupId>
<artifactId>jhove</artifactId>
<version>1.11.0</version>
</project>
16 changes: 16 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,21 @@
<artifactId>poi-examples</artifactId>
<version>3.10-FINAL</version>
</dependency>
<dependency>
<groupId>edu.harvard.hul.ois.jhove</groupId>
<artifactId>jhove</artifactId>
<version>1.11.0</version>
</dependency>
<dependency>
<groupId>edu.harvard.hul.ois.jhove</groupId>
<artifactId>jhove-module</artifactId>
<version>1.11.0</version>
</dependency>
<dependency>
<groupId>edu.harvard.hul.ois.jhove</groupId>
<artifactId>jhove-handler</artifactId>
<version>1.11.0</version>
</dependency>
</dependencies>

<build>
Expand All @@ -162,6 +177,7 @@
<directory>src/main/java</directory>
<includes>
<include>*.properties</include>
<include>**/mime.types</include>
<include>**/*.R</include>
</includes>
</resource>
Expand Down
41 changes: 41 additions & 0 deletions src/main/java/META-INF/mime.types
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# MIME type map supplied with the application.
# Each entry is a MIME type followed by the whitespace-separated file
# extensions mapped to it. Extensions are listed in both lower and upper case
# (plus mixed-case variants for Rdata).
# NOTE(review): presumably loaded through javax.activation's
# MimetypesFileTypeMap, which looks for META-INF/mime.types; confirm against
# the file utility code that consumes this map.
# Common document formats
application/pdf pdf PDF
application/msword doc DOC
application/vnd.ms-excel xls XLS xlc XLC xll XLL xlm XLM xlw XLW
text/plain txt TXT
# Common statistical data formats
text/tab-separated-values tab TAB tsv TSV
text/xml xml XML
text/csv csv CSV
text/x-fixed-field dat DAT asc ASC
application/x-rlang-transport Rdata RData rdata RDATA
# NOTE(review): "type" is not a registered top-level MIME type, so the next
# entry looks like a typo for text/x-r-syntax. Check which string the rest of
# the application (display/facet property files, constants) expects before
# changing it.
type/x-r-syntax r R
application/x-stata dta DTA
text/x-stata-syntax do DO
application/x-spss-sav sav SAV
application/x-spss-por por POR
text/x-spss-syntax sps SPS
application/x-sas-transport xpt XPT cport CPORT v5x V5X v6x V6X v7x V7X
application/x-sas-system sas7bdat SAS7BDAT sd1 SD1 sd2 SD2 sd7 SD7 ssd01 SSD01 ssd SSD ssd04 SSD04
text/x-sas-syntax sas SAS
# Common image formats
image/gif gif GIF
image/jpeg jpeg JPEG jpg JPG jpe JPE
image/bmp bmp BMP
image/x-portable-bitmap pbm PBM
image/x-portable-graymap pgm PGM
image/png png PNG
image/x-portable-anymap pnm PNM
image/x-portable-pixmap ppm PPM
image/cmu-raster ras RAS
image/x-rgb rgb RGB
image/tiff tif TIF tiff TIFF
image/x-xbitmap xbm XBM
image/x-xpixmap xpm XPM
image/x-xwindowdump xwd XWD
# Common archive formats
application/zip zip ZIP
application/x-gzip gz GZ
application/x-tar tar TAR
# Rdata
# NOTE(review): the entry below lists no extensions, so it appears to map
# nothing by itself, and Rdata extensions are already mapped to
# application/x-rlang-transport above. The intent of this entry is unclear;
# confirm.
application/octet-stream
1 change: 1 addition & 0 deletions src/main/java/MimeTypeDisplay.properties
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ application/pdf=Adobe PDF
application/msword=MS Word
application/vnd.ms-excel=MS Excel
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet=MS Excel (XLSX)
application/vnd.openxmlformats-officedocument.wordprocessingml.document=MS Word (docx)
application/zip=ZIP Archive
text/plain=Plain Text
text/xml=XML
Expand Down
1 change: 1 addition & 0 deletions src/main/java/MimeTypeFacets.properties
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ application/pdf=document
application/msword=document
application/vnd.ms-excel=document
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet=document
application/vnd.openxmlformats-officedocument.wordprocessingml.document=document
# Text:
text/plain=text
text/xml=text
Expand Down
45 changes: 43 additions & 2 deletions src/main/java/edu/harvard/iq/dataverse/DatasetPage.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import edu.harvard.iq.dataverse.util.MD5Checksum;
import edu.harvard.iq.dataverse.datavariable.VariableServiceBean;
import edu.harvard.iq.dataverse.ingest.IngestServiceBean;
import edu.harvard.iq.dataverse.util.FileUtil;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
Expand Down Expand Up @@ -535,7 +536,23 @@ public void handleDropBoxUpload(ActionEvent e) {
fmd.setDatasetVersion(editVersion);
dataset.getFiles().add(dFile);


// When downloading files from Dropbox, we don't get the benefit of
// having the browser recognize the MIME type of the file, so we
// have to rely on our own utilities (Jhove, etc.) to try to determine
// what it is.

String fileType = null;
try {
fileType = FileUtil.determineFileType(Paths.get(getFilesTempDirectory(), dFile.getFileSystemName()).toFile(), dFile.getName());
Logger.getLogger(DatasetPage.class.getName()).log(Level.FINE, "File utility recognized the file as "+fileType);
if (fileType != null && !fileType.equals("")) {
dFile.setContentType(fileType);
}
}
catch (IOException ex) {
Logger.getLogger(DatasetPage.class.getName()).log(Level.WARNING, "Failed to run the file utility mime type check on file " + dFile.getName());
}

newFiles.add(dFile);
}
}
Expand All @@ -562,12 +579,36 @@ public void handleFileUpload(FileUploadEvent event) {
if (getFilesTempDirectory() != null) {
try {

Logger.getLogger(DatasetPage.class.getName()).log(Level.INFO, "Will attempt to save the file as: " + getFilesTempDirectory() + "/" + dFile.getFileSystemName());
Logger.getLogger(DatasetPage.class.getName()).log(Level.FINE, "Will attempt to save the file as: " + getFilesTempDirectory() + "/" + dFile.getFileSystemName());
Files.copy(uFile.getInputstream(), Paths.get(getFilesTempDirectory(), dFile.getFileSystemName()), StandardCopyOption.REPLACE_EXISTING);
} catch (IOException ioex) {
Logger.getLogger(DatasetPage.class.getName()).log(Level.WARNING, "Failed to save the file " + dFile.getFileSystemName());
return;
}
}

// Try our own utilities (Jhove, etc.) to determine the file type of the
// uploaded file. The browser may already have recognized the type
// correctly, so our result is only used when the browser's type is
// missing, empty, or the generic application/octet-stream.

String fileType = null;
try {
fileType = FileUtil.determineFileType(Paths.get(getFilesTempDirectory(), dFile.getFileSystemName()).toFile(), dFile.getName());
Logger.getLogger(DatasetPage.class.getName()).log(Level.FINE, "File utility recognized the file as "+fileType);
if (fileType != null && !fileType.equals("")) {
// let's look at the browser's guess regarding the mime type
// of the file:
String bgType = dFile.getContentType();
Logger.getLogger(DatasetPage.class.getName()).log(Level.FINE, "Browser recognized the file as "+bgType);

if (bgType == null || bgType.equals("") || bgType.equalsIgnoreCase("application/octet-stream")) {
dFile.setContentType(fileType);
}
}
} catch (IOException ex) {
Logger.getLogger(DatasetPage.class.getName()).log(Level.WARNING, "Failed to run the file utility mime type check on file " + dFile.getName());
}

newFiles.add(dFile);

}
Expand Down
Loading

0 comments on commit 313d3a9

Please sign in to comment.