OAK-5415 - Refactor Binary text extraction logic from LuceneIndexEditor
git-svn-id: https://svn.apache.org/repos/asf/jackrabbit/oak/trunk@1777676 13f79535-47bb-0310-9956-ffa450edef68
chetanmeh committed Jan 6, 2017
1 parent f854062 commit 2b57d45
Showing 5 changed files with 362 additions and 279 deletions.
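
Only two of the five changed files are reproduced in this excerpt. The gist of the refactoring: the Tika-based binary text extraction code is removed from LuceneIndexEditor, which now delegates to an extractor obtained via context.getTextExtractor(), and ExtractedTextCache is widened to public so the relocated code can keep using it. Hedged sketches of the cache protocol and of the inferred delegation target follow the respective hunks below.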
ExtractedTextCache.java
@@ -38,7 +38,7 @@

 import static org.apache.jackrabbit.oak.commons.PathUtils.concat;

-class ExtractedTextCache {
+public class ExtractedTextCache {
     private static final String EMPTY_STRING = "";
     private static final Logger log = LoggerFactory.getLogger(ExtractedTextCache.class);
     private volatile PreExtractedTextProvider extractedTextProvider;
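
The widened visibility matters because the moved extraction code keeps using this cache. A minimal sketch of its get/put protocol, mirroring the calls in the removed parseStringValue/parseStringValue0 further below; variable names are illustrative and extractText is a hypothetical helper, not part of this commit:

    // Sketch only: mirrors the cache calls visible in the removed editor code.
    String text = extractedTextCache.get(path, propertyName, blob, reindex);
    if (text == null) {
        try {
            String result = extractText(blob); // hypothetical helper wrapping the Tika parse
            extractedTextCache.put(blob, new ExtractedText(ExtractionResult.SUCCESS, result));
            text = result;
        } catch (Exception e) {
            extractedTextCache.put(blob, ExtractedText.ERROR); // failures are cached too
            text = TEXT_EXTRACTION_ERROR;
        }
    }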
LuceneIndexEditor.java
@@ -27,26 +27,19 @@
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Sets;
-import com.google.common.io.CountingInputStream;
-import org.apache.jackrabbit.JcrConstants;
-import org.apache.jackrabbit.oak.api.Blob;
 import org.apache.jackrabbit.oak.api.CommitFailedException;
 import org.apache.jackrabbit.oak.api.PropertyState;
 import org.apache.jackrabbit.oak.api.Type;
 import org.apache.jackrabbit.oak.commons.PathUtils;
-import org.apache.jackrabbit.oak.commons.io.LazyInputStream;
 import org.apache.jackrabbit.oak.plugins.index.IndexEditor;
 import org.apache.jackrabbit.oak.plugins.index.PathFilter;
-import org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText;
-import org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText.ExtractionResult;
 import org.apache.jackrabbit.oak.plugins.index.lucene.Aggregate.Matcher;
 import org.apache.jackrabbit.oak.plugins.index.lucene.util.FunctionIndexProcessor;
 import org.apache.jackrabbit.oak.plugins.index.lucene.writer.LuceneIndexWriter;
 import org.apache.jackrabbit.oak.plugins.memory.EmptyNodeState;
 import org.apache.jackrabbit.oak.plugins.memory.StringPropertyState;
 import org.apache.jackrabbit.oak.spi.commit.Editor;
 import org.apache.jackrabbit.oak.spi.state.NodeState;
-import org.apache.jackrabbit.oak.util.BlobByteSource;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.DoubleDocValuesField;
 import org.apache.lucene.document.DoubleField;
@@ -57,13 +50,9 @@
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField;
 import org.apache.lucene.util.BytesRef;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.WriteOutContentHandler;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import static org.apache.jackrabbit.JcrConstants.JCR_DATA;
 import static org.apache.jackrabbit.oak.commons.PathUtils.concat;
 import static org.apache.jackrabbit.oak.commons.PathUtils.getName;
 import static org.apache.jackrabbit.oak.plugins.index.lucene.FieldFactory.*;
@@ -80,9 +69,7 @@ public class LuceneIndexEditor implements IndexEditor, Aggregate.AggregateRoot {
     private static final Logger log =
         LoggerFactory.getLogger(LuceneIndexEditor.class);

-    private static final long SMALL_BINARY = Long.getLong("oak.lucene.smallBinary", 16 * 1024);
-
-    static final String TEXT_EXTRACTION_ERROR = "TextExtractionError";
+    public static final String TEXT_EXTRACTION_ERROR = "TextExtractionError";

     private final LuceneIndexEditorContext context;

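Both visibility widenings in this commit (ExtractedTextCache above and TEXT_EXTRACTION_ERROR here) follow the same pattern: presumably the extraction logic keeps referencing them after moving out of this class.
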
@@ -586,40 +573,12 @@ private static boolean isVisible(String name) {

     private List<Field> newBinary(
             PropertyState property, NodeState state, String nodePath, String path) {
-        List<Field> fields = new ArrayList<Field>();
-        Metadata metadata = new Metadata();
-
-        //jcr:mimeType is mandatory for a binary to be indexed
-        String type = state.getString(JcrConstants.JCR_MIMETYPE);
-
-        if (type == null || !isSupportedMediaType(type)) {
-            log.trace(
-                "[{}] Ignoring binary content for node {} due to unsupported (or null) jcr:mimeType [{}]",
-                getIndexName(), path, type);
-            return fields;
-        }
-
-        metadata.set(Metadata.CONTENT_TYPE, type);
-        if (JCR_DATA.equals(property.getName())) {
-            String encoding = state.getString(JcrConstants.JCR_ENCODING);
-            if (encoding != null) { // not mandatory
-                metadata.set(Metadata.CONTENT_ENCODING, encoding);
-            }
-        }
-
+        if (!context.isAsyncIndexing()){
+            //Skip text extraction for sync indexing
+            return Collections.emptyList();
+        }
+
-        for (Blob v : property.getValue(Type.BINARIES)) {
-            String value = parseStringValue(v, metadata, path, property.getName());
-            if (value == null){
-                continue;
-            }
-
-            if (nodePath != null){
-                fields.add(newFulltextField(nodePath, value, true));
-            } else {
-                fields.add(newFulltextField(value, true));
-            }
-        }
-        return fields;
+        return context.getTextExtractor().newBinary(property, state, nodePath, path);
     }

     private boolean augmentCustomFields(final String path, final List<Field> fields,
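
The shape of the delegation target can be inferred from the call site above. Since the new file is not part of this excerpt, the class name, constructor, and placeholder body below are assumptions; only the newBinary signature is implied by the call:

    // Hypothetical sketch of the class the extraction logic moved into.
    public class BinaryTextExtractor {
        private final ExtractedTextCache extractedTextCache;

        public BinaryTextExtractor(ExtractedTextCache extractedTextCache) {
            this.extractedTextCache = extractedTextCache;
        }

        public List<Field> newBinary(PropertyState property, NodeState state,
                                     String nodePath, String path) {
            // jcr:mimeType check, Tika parsing, cache lookup and stats recording
            // presumably live here now (see the removed editor methods below)
            return Collections.emptyList(); // placeholder body in this sketch
        }
    }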
@@ -941,81 +900,6 @@ private PathFilter.Result getPathFilterResult(String childNodeName) {
         return context.getDefinition().getPathFilter().filter(concat(getPath(), childNodeName));
     }

-    private boolean isSupportedMediaType(String type) {
-        return context.isSupportedMediaType(type);
-    }
-
-    private String parseStringValue(Blob v, Metadata metadata, String path, String propertyName) {
-        if (!context.isAsyncIndexing()){
-            //Skip text extraction for sync indexing
-            return null;
-        }
-        String text = context.getExtractedTextCache().get(path, propertyName, v, context.isReindex());
-        if (text == null){
-            text = parseStringValue0(v, metadata, path);
-        }
-        return text;
-    }
-
-    private String parseStringValue0(Blob v, Metadata metadata, String path) {
-        WriteOutContentHandler handler = new WriteOutContentHandler(context.getDefinition().getMaxExtractLength());
-        long start = System.currentTimeMillis();
-        long bytesRead = 0;
-        long length = v.length();
-        if (log.isDebugEnabled()) {
-            log.debug("Extracting {}, {} bytes, id {}", path, length, v.getContentIdentity());
-        }
-        String oldThreadName = null;
-        if (length > SMALL_BINARY) {
-            Thread t = Thread.currentThread();
-            oldThreadName = t.getName();
-            t.setName(oldThreadName + ": Extracting " + path + ", " + length + " bytes");
-        }
-        try {
-            CountingInputStream stream = new CountingInputStream(new LazyInputStream(new BlobByteSource(v)));
-            try {
-                context.getParser().parse(stream, handler, metadata, new ParseContext());
-            } finally {
-                bytesRead = stream.getCount();
-                stream.close();
-            }
-        } catch (LinkageError e) {
-            // Capture and ignore errors caused by extraction libraries
-            // not being present. This is equivalent to disabling
-            // selected media types in configuration, so we can simply
-            // ignore these errors.
-        } catch (Throwable t) {
-            // Capture and report any other full text extraction problems.
-            // The special STOP exception is used for normal termination.
-            if (!handler.isWriteLimitReached(t)) {
-                log.debug(
-                    "[{}] Failed to extract text from a binary property: {}."
-                        + " This is a fairly common case, and nothing to"
-                        + " worry about. The stack trace is included to"
-                        + " help improve the text extraction feature.",
-                    getIndexName(), path, t);
-                context.getExtractedTextCache().put(v, ExtractedText.ERROR);
-                return TEXT_EXTRACTION_ERROR;
-            }
-        } finally {
-            if (oldThreadName != null) {
-                Thread.currentThread().setName(oldThreadName);
-            }
-        }
-        String result = handler.toString();
-        if (bytesRead > 0) {
-            long time = System.currentTimeMillis() - start;
-            int len = result.length();
-            context.recordTextExtractionStats(time, bytesRead, len);
-            if (log.isDebugEnabled()) {
-                log.debug("Extracting {} took {} ms, {} bytes read, {} text size",
-                    path, time, bytesRead, len);
-            }
-        }
-        context.getExtractedTextCache().put(v, new ExtractedText(ExtractionResult.SUCCESS, result));
-        return result;
-    }
-
     private String getIndexName() {
         return context.getDefinition().getIndexName();
     }
(Diff for the remaining three changed files not shown in this excerpt.)
