Skip to content

Commit

Permalink
SAK-49498 improve mime handling by upgrading Tika parsers (sakaiproje…
Browse files Browse the repository at this point in the history
  • Loading branch information
ottenhoff authored Feb 13, 2024
1 parent 3271968 commit 50af3b4
Show file tree
Hide file tree
Showing 9 changed files with 82 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@ application/x-futuresplash=spl
application/x-gtar=gtar
application/x-gzip=gz tgz
application/x-hdf=hdf
application/x-ipynb+json=ipynb
application/x-javascript=js
application/x-jmp-data=jmp
application/x-kword=kwd kwt
Expand Down
6 changes: 5 additions & 1 deletion kernel/kernel-impl/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<artifactId>tika-parsers-standard-package</artifactId>
</dependency>
<dependency>
<groupId>com.vdurmont</groupId>
Expand Down Expand Up @@ -136,6 +136,10 @@
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpmime</artifactId>
</dependency>
<dependency>
<groupId>com.opencsv</groupId>
<artifactId>opencsv</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@
import org.apache.tika.detect.Detector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;
Expand Down Expand Up @@ -5964,7 +5965,7 @@ public void commitResource(ContentResourceEdit edit, int priority) throws OverQu

final Metadata metadata = new Metadata();
//This might be better left unset, since it would advise the detector
metadata.set(Metadata.RESOURCE_NAME_KEY, edit.getId());
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, edit.getId());
metadata.set(Metadata.CONTENT_TYPE, currentContentType);
String newmatch = "";
//If we are ignoring the content for this extension, don't give it any data
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,14 @@
import java.io.InputStream;
import java.net.URLDecoder;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Iterator;
import java.sql.Connection;
import java.sql.Statement;

import java.security.MessageDigest;
import java.util.Map;

import lombok.extern.slf4j.Slf4j;

Expand Down Expand Up @@ -277,9 +279,18 @@ public void testMimeDetection() throws Exception {
//Next is really an excel file with no extension
//Last is a html snippet with correct extension

List <String> fileNames = Arrays.asList("testEXCEL.mp3","testWORD.doc","testHTML.html","testEXCEL","LSNBLDR-359-snippet.html", "testCSS.css", "testHTMLbody.html","jquery-1.6.1.min.js");
List <String> expectedMimes = Arrays.asList("application/vnd.ms-excel","application/msword","text/html","application/vnd.ms-excel","text/html", "text/css", "text/html","application/javascript");

final Map<String, String> fileNamesToMimes = new HashMap<>() {{
put("testCSV.csv", "text/csv");
put("testNotebook.ipynb", "application/x-ipynb+json");
put("testEXCEL.mp3", "application/vnd.ms-excel");
put("testWORD.doc", "application/msword");
put("testHTML.html", "text/html");
put("testEXCEL", "application/vnd.ms-excel");
put("LSNBLDR-359-snippet.html", "text/html");
put("testCSS.css", "text/css");
put("testHTMLbody.html", "text/html");
put("jquery-1.6.1.min.js", "application/javascript");
}};
//Set the mime magic to be true
ServerConfigurationService serv = getService(ServerConfigurationService.class);
serv.registerConfigItem(BasicConfigItem.makeConfigItem("content.useMimeMagic","true",ServerConfigurationService.UNKNOWN));
Expand All @@ -296,9 +307,8 @@ public void testMimeDetection() throws Exception {
ContentResource cr;
InputStream stream;
//Insert all resources to CHS
for (int i=0;i<fileNames.size();i++) {
//Add in a slash for CHS
String fileName = fileNames.get(i);
for (Map.Entry<String, String> entry : fileNamesToMimes.entrySet()) {
String fileName = entry.getKey();
//Stored in CHS it needs a slash
String CHSfileName = "/"+fileName;
log.debug("Loading up file: {}", fileName);
Expand All @@ -310,8 +320,8 @@ public void testMimeDetection() throws Exception {
ch.addResource(CHSfileName, "", stream, props ,0);
//Now get it back and check the mime type
cr = ch.getResource(CHSfileName);
log.debug("Expecting mime:{} and got {}", expectedMimes.get(i), cr.getContentType());
Assert.assertEquals(cr.getContentType(), expectedMimes.get(i));
log.debug("Expecting mime:{} and got {}", entry.getValue(), cr.getContentType());
Assert.assertEquals(cr.getContentType(), entry.getValue());
stream.close();
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Username,First,Last
Username,Mike,McLaughlin
Username,Chase,Green
Username,Duane,Sandoval
Username,Allen,Daniel
Username,Jeff,Simon
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Untitled Notebook\n",
"\n",
"This is an initial placeholder notebook. Feel free to edit and rename as well as create your own notebooks, to use Google Cloud Datalab."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Code cell ready to be run...\n",
"print('Hello!')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
4 changes: 2 additions & 2 deletions master/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@
<sakai.spring.security.version>5.5.8</sakai.spring.security.version>
<sakai.spring.hateoas.version>1.5.6</sakai.spring.hateoas.version>
<sakai.spring.plugin.version>2.0.0.RELEASE</sakai.spring.plugin.version>
<sakai.tika.version>1.28.5</sakai.tika.version>
<sakai.tika.version>3.0.0-BETA</sakai.tika.version>
<sakai.tomcat.version>9.0.85</sakai.tomcat.version>
<sakai.thymeleaf.version>3.0.15.RELEASE</sakai.thymeleaf.version>
<sakai.velocity.version>1.6.4</sakai.velocity.version>
Expand Down Expand Up @@ -1171,7 +1171,7 @@
since it is used in multiple places it is declared here so its done right -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<artifactId>tika-parsers-standard-package</artifactId>
<version>${sakai.tika.version}</version>
<exclusions>
<exclusion>
Expand Down
2 changes: 1 addition & 1 deletion microsoft-integration/collaborative-documents/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<artifactId>tika-parsers-standard-package</artifactId>
<exclusions>
<exclusion>
<groupId>org.apache.poi</groupId>
Expand Down
2 changes: 1 addition & 1 deletion search/search-impl/impl/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<artifactId>tika-parsers-standard-package</artifactId>
</dependency>
<dependency>
<groupId>commons-codec</groupId>
Expand Down

0 comments on commit 50af3b4

Please sign in to comment.