From 2517504f3bb731371b011551bebda8bdaca8a4f4 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Thu, 2 Aug 2018 14:03:12 +0200 Subject: [PATCH] Add a filter by content option (#585) You can filter out documents you would like to index by adding one or more regular expression that match the extracted content. Documents which are not matching will be simply ignored and not indexed. If you define the following `fs.filters` property in your `~/.fscrawler/test/_settings.json` file: ```json { "name" : "test", "fs": { "filters": [ ".*foo.*", "^4\\d{3}([\\ \\-]?)\\d{4}\\1\\d{4}\\1\\d{4}$" ] } } ``` With this example, only documents which contains the word `foo` and a VISA credit card number with the form like `4012888888881881`, `4012 8888 8888 1881` or `4012-8888-8888-1881` will be indexed. Closes #463. --- .../elasticsearch/crawler/fs/FsParser.java | 14 ++- docs/source/admin/fs/local-fs.rst | 31 ++++++ .../crawler/fs/framework/FsCrawlerUtil.java | 35 ++++++ .../integration/FsCrawlerTestFiltersIT.java | 57 ++++++++++ .../samples/test_filter_one_term/foo.txt | 1 + .../samples/test_filter_one_term/sample.txt | 1 + .../visa-number-with-foo.txt | 8 ++ .../test_filter_one_term/visa-number.txt | 8 ++ .../samples/test_filter_visa_pattern/foo.txt | 1 + .../test_filter_visa_pattern/sample.txt | 1 + .../visa-number-with-foo.txt | 8 ++ .../test_filter_visa_pattern/visa-number.txt | 8 ++ .../test_filter_visa_pattern_plus_foo/foo.txt | 1 + .../sample.txt | 1 + .../visa-number-with-foo.txt | 8 ++ .../visa-number.txt | 8 ++ .../elasticsearch/crawler/fs/settings/Fs.java | 105 ++++++++++-------- .../crawler/fs/settings/Ocr.java | 27 +++++ .../fs/settings/FsMatchContentTest.java | 83 ++++++++++++++ .../fs/settings/FsSettingsParserTest.java | 1 + 20 files changed, 355 insertions(+), 52 deletions(-) create mode 100644 integration-tests/src/test/java/fr/pilato/elasticsearch/crawler/fs/test/integration/FsCrawlerTestFiltersIT.java create mode 100644 
integration-tests/src/test/resources-binary/samples/test_filter_one_term/foo.txt create mode 100644 integration-tests/src/test/resources-binary/samples/test_filter_one_term/sample.txt create mode 100644 integration-tests/src/test/resources-binary/samples/test_filter_one_term/visa-number-with-foo.txt create mode 100644 integration-tests/src/test/resources-binary/samples/test_filter_one_term/visa-number.txt create mode 100644 integration-tests/src/test/resources-binary/samples/test_filter_visa_pattern/foo.txt create mode 100644 integration-tests/src/test/resources-binary/samples/test_filter_visa_pattern/sample.txt create mode 100644 integration-tests/src/test/resources-binary/samples/test_filter_visa_pattern/visa-number-with-foo.txt create mode 100644 integration-tests/src/test/resources-binary/samples/test_filter_visa_pattern/visa-number.txt create mode 100644 integration-tests/src/test/resources-binary/samples/test_filter_visa_pattern_plus_foo/foo.txt create mode 100644 integration-tests/src/test/resources-binary/samples/test_filter_visa_pattern_plus_foo/sample.txt create mode 100644 integration-tests/src/test/resources-binary/samples/test_filter_visa_pattern_plus_foo/visa-number-with-foo.txt create mode 100644 integration-tests/src/test/resources-binary/samples/test_filter_visa_pattern_plus_foo/visa-number.txt create mode 100644 settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsMatchContentTest.java diff --git a/core/src/main/java/fr/pilato/elasticsearch/crawler/fs/FsParser.java b/core/src/main/java/fr/pilato/elasticsearch/crawler/fs/FsParser.java index 7a6b2b13d..8705c3b6e 100644 --- a/core/src/main/java/fr/pilato/elasticsearch/crawler/fs/FsParser.java +++ b/core/src/main/java/fr/pilato/elasticsearch/crawler/fs/FsParser.java @@ -63,7 +63,6 @@ import java.util.stream.Collectors; import static fr.pilato.elasticsearch.crawler.fs.framework.FsCrawlerUtil.computeVirtualPathName; -import static 
fr.pilato.elasticsearch.crawler.fs.framework.FsCrawlerUtil.isExcluded; import static fr.pilato.elasticsearch.crawler.fs.framework.FsCrawlerUtil.isFileSizeUnderLimit; import static fr.pilato.elasticsearch.crawler.fs.framework.FsCrawlerUtil.isIndexable; import static fr.pilato.elasticsearch.crawler.fs.framework.FsCrawlerUtil.localDateTimeToDate; @@ -485,10 +484,15 @@ private void indexFile(FileAbstractModel fileAbstractModel, ScanStatistic stats, } // We index the data structure - esIndex(esClientManager.bulkProcessorDoc(), fsSettings.getElasticsearch().getIndex(), - generateIdFromFilename(filename, dirname), - DocParser.toJson(doc), - fsSettings.getElasticsearch().getPipeline()); + if (isIndexable(doc.getContent(), fsSettings.getFs().getFilters())) { + esIndex(esClientManager.bulkProcessorDoc(), fsSettings.getElasticsearch().getIndex(), + generateIdFromFilename(filename, dirname), + DocParser.toJson(doc), + fsSettings.getElasticsearch().getPipeline()); + } else { + logger.debug("We ignore file [{}] because it does not match all the patterns {}", filename, + fsSettings.getFs().getFilters()); + } } else { if (fsSettings.getFs().isJsonSupport()) { // We index the json content directly diff --git a/docs/source/admin/fs/local-fs.rst b/docs/source/admin/fs/local-fs.rst index de667173e..f17c38599 100644 --- a/docs/source/admin/fs/local-fs.rst +++ b/docs/source/admin/fs/local-fs.rst @@ -16,6 +16,8 @@ Here is a list of Local FS settings (under ``fs.`` prefix)`: +----------------------------+-----------------------+---------------------------------+ | ``fs.excludes`` | ``["~*"]`` | `Includes and excludes`_ | +----------------------------+-----------------------+---------------------------------+ +| ``fs.filters`` | ``null`` | `Filter content`_ | ++----------------------------+-----------------------+---------------------------------+ | ``fs.json_support`` | ``false`` | `Indexing JSon docs`_ | 
+----------------------------+-----------------------+---------------------------------+ | ``fs.xml_support`` | ``false`` | `Indexing XML docs`_ | @@ -174,6 +176,35 @@ If you define the following ``fs.excludes`` property in your Then all files but the ones in ``/folderB/subfolderA``, ``/folderB/subfolderB`` and ``/folderB/subfolderC`` will be indexed. +Filter content +^^^^^^^^^^^^^^ + +.. versionadded:: 2.5 + +You can filter the documents you would like to index by adding one or more +regular expressions that match the extracted content. +Documents which do not match will simply be ignored and not indexed. + +If you define the following ``fs.filters`` property in your +``~/.fscrawler/test/_settings.json`` file: + +.. code:: json + + { + "name" : "test", + "fs": { + "filters": [ + ".*foo.*", + "^4\\d{3}([\\ \\-]?)\\d{4}\\1\\d{4}\\1\\d{4}$" + ] + } + } + +With this example, only documents which contain the word ``foo`` and a VISA credit card number +with a form like ``4012888888881881``, ``4012 8888 8888 1881`` or ``4012-8888-8888-1881`` +will be indexed. 
+ + Indexing JSon docs ^^^^^^^^^^^^^^^^^^ diff --git a/framework/src/main/java/fr/pilato/elasticsearch/crawler/fs/framework/FsCrawlerUtil.java b/framework/src/main/java/fr/pilato/elasticsearch/crawler/fs/framework/FsCrawlerUtil.java index 950f6b781..213e58365 100644 --- a/framework/src/main/java/fr/pilato/elasticsearch/crawler/fs/framework/FsCrawlerUtil.java +++ b/framework/src/main/java/fr/pilato/elasticsearch/crawler/fs/framework/FsCrawlerUtil.java @@ -56,6 +56,7 @@ import java.util.Properties; import java.util.Set; import java.util.TimeZone; +import java.util.regex.Pattern; public class FsCrawlerUtil { public static final String INDEX_SUFFIX_FOLDER = "_folder"; @@ -210,6 +211,40 @@ public static boolean isIncluded(String filename, List includes) { return false; } + /** + * We check if we can index the content or skip it + * + * @param content Content to parse + * @param filters regular expressions that all need to match if we want to index. If empty + * we consider it always matches. 
+ */ + public static boolean isIndexable(String content, List filters) { + logger.debug("content = [{}], filters = {}", content, filters); + + if (isNullOrEmpty(content)) { + logger.trace("Null or empty content always matches."); + return true; + } + + if (filters == null || filters.isEmpty()) { + logger.trace("No pattern always matches."); + return true; + } + + for (String filter : filters) { + Pattern pattern = Pattern.compile(filter, Pattern.MULTILINE | Pattern.UNIX_LINES); + logger.trace("Testing filter [{}]", filter); + if (!pattern.matcher(content).find()) { + logger.trace("Filter [{}] is not matching.", filter); + return false; + } else { + logger.trace("Filter [{}] is matching.", filter); + } + } + + return true; + } + public static String computeVirtualPathName(String rootPath, String realPath) { String result = "/"; if (realPath != null && realPath.length() > rootPath.length()) { diff --git a/integration-tests/src/test/java/fr/pilato/elasticsearch/crawler/fs/test/integration/FsCrawlerTestFiltersIT.java b/integration-tests/src/test/java/fr/pilato/elasticsearch/crawler/fs/test/integration/FsCrawlerTestFiltersIT.java new file mode 100644 index 000000000..1c857a468 --- /dev/null +++ b/integration-tests/src/test/java/fr/pilato/elasticsearch/crawler/fs/test/integration/FsCrawlerTestFiltersIT.java @@ -0,0 +1,57 @@ +/* + * Licensed to David Pilato (the "Author") under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Author licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package fr.pilato.elasticsearch.crawler.fs.test.integration; + +import fr.pilato.elasticsearch.crawler.fs.settings.Fs; +import org.elasticsearch.action.search.SearchRequest; +import org.junit.Test; + +/** + * Test filters crawler settings + */ +public class FsCrawlerTestFiltersIT extends AbstractFsCrawlerITCase { + @Test + public void test_filter_one_term() throws Exception { + Fs fs = startCrawlerDefinition() + .addFilter(".*foo.*") + .build(); + startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null); + countTestHelper(new SearchRequest(getCrawlerName()), 2L, null); + } + + @Test + public void test_filter_visa_pattern() throws Exception { + Fs fs = startCrawlerDefinition() + .addFilter("^4\\d{3}([\\ \\-]?)\\d{4}\\1\\d{4}\\1\\d{4}$") + .build(); + startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null); + countTestHelper(new SearchRequest(getCrawlerName()), 2L, null); + } + + @Test + public void test_filter_visa_pattern_plus_foo() throws Exception { + Fs fs = startCrawlerDefinition() + .addFilter("^4\\d{3}([\\ \\-]?)\\d{4}\\1\\d{4}\\1\\d{4}$") + .addFilter(".*foo.*") + .build(); + startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null); + countTestHelper(new SearchRequest(getCrawlerName()), 1L, null); + } +} diff --git a/integration-tests/src/test/resources-binary/samples/test_filter_one_term/foo.txt b/integration-tests/src/test/resources-binary/samples/test_filter_one_term/foo.txt new file mode 100644 index 000000000..7a14c0f10 --- /dev/null +++ 
b/integration-tests/src/test/resources-binary/samples/test_filter_one_term/foo.txt @@ -0,0 +1 @@ +This is containing foo as one of the words. diff --git a/integration-tests/src/test/resources-binary/samples/test_filter_one_term/sample.txt b/integration-tests/src/test/resources-binary/samples/test_filter_one_term/sample.txt new file mode 100644 index 000000000..91a62555d --- /dev/null +++ b/integration-tests/src/test/resources-binary/samples/test_filter_one_term/sample.txt @@ -0,0 +1 @@ +This file contains some words. diff --git a/integration-tests/src/test/resources-binary/samples/test_filter_one_term/visa-number-with-foo.txt b/integration-tests/src/test/resources-binary/samples/test_filter_one_term/visa-number-with-foo.txt new file mode 100644 index 000000000..80d20cc53 --- /dev/null +++ b/integration-tests/src/test/resources-binary/samples/test_filter_one_term/visa-number-with-foo.txt @@ -0,0 +1,8 @@ +This file contains + +4012-8888-8888-1881 + +^^^ This is a fake VISA number. + +This will be indexed unless we check for a word which +is not inside this text, like "foo". diff --git a/integration-tests/src/test/resources-binary/samples/test_filter_one_term/visa-number.txt b/integration-tests/src/test/resources-binary/samples/test_filter_one_term/visa-number.txt new file mode 100644 index 000000000..1e2eb21ef --- /dev/null +++ b/integration-tests/src/test/resources-binary/samples/test_filter_one_term/visa-number.txt @@ -0,0 +1,8 @@ +This file contains + +4012-8888-8888-1881 + +^^^ This is a fake VISA number. + +This will be indexed unless we check for a word which +is not inside this text, like "f o o" without the spaces. 
diff --git a/integration-tests/src/test/resources-binary/samples/test_filter_visa_pattern/foo.txt b/integration-tests/src/test/resources-binary/samples/test_filter_visa_pattern/foo.txt new file mode 100644 index 000000000..7a14c0f10 --- /dev/null +++ b/integration-tests/src/test/resources-binary/samples/test_filter_visa_pattern/foo.txt @@ -0,0 +1 @@ +This is containing foo as one of the words. diff --git a/integration-tests/src/test/resources-binary/samples/test_filter_visa_pattern/sample.txt b/integration-tests/src/test/resources-binary/samples/test_filter_visa_pattern/sample.txt new file mode 100644 index 000000000..91a62555d --- /dev/null +++ b/integration-tests/src/test/resources-binary/samples/test_filter_visa_pattern/sample.txt @@ -0,0 +1 @@ +This file contains some words. diff --git a/integration-tests/src/test/resources-binary/samples/test_filter_visa_pattern/visa-number-with-foo.txt b/integration-tests/src/test/resources-binary/samples/test_filter_visa_pattern/visa-number-with-foo.txt new file mode 100644 index 000000000..80d20cc53 --- /dev/null +++ b/integration-tests/src/test/resources-binary/samples/test_filter_visa_pattern/visa-number-with-foo.txt @@ -0,0 +1,8 @@ +This file contains + +4012-8888-8888-1881 + +^^^ This is a fake VISA number. + +This will be indexed unless we check for a word which +is not inside this text, like "foo". diff --git a/integration-tests/src/test/resources-binary/samples/test_filter_visa_pattern/visa-number.txt b/integration-tests/src/test/resources-binary/samples/test_filter_visa_pattern/visa-number.txt new file mode 100644 index 000000000..1e2eb21ef --- /dev/null +++ b/integration-tests/src/test/resources-binary/samples/test_filter_visa_pattern/visa-number.txt @@ -0,0 +1,8 @@ +This file contains + +4012-8888-8888-1881 + +^^^ This is a fake VISA number. + +This will be indexed unless we check for a word which +is not inside this text, like "f o o" without the spaces. 
diff --git a/integration-tests/src/test/resources-binary/samples/test_filter_visa_pattern_plus_foo/foo.txt b/integration-tests/src/test/resources-binary/samples/test_filter_visa_pattern_plus_foo/foo.txt new file mode 100644 index 000000000..7a14c0f10 --- /dev/null +++ b/integration-tests/src/test/resources-binary/samples/test_filter_visa_pattern_plus_foo/foo.txt @@ -0,0 +1 @@ +This is containing foo as one of the words. diff --git a/integration-tests/src/test/resources-binary/samples/test_filter_visa_pattern_plus_foo/sample.txt b/integration-tests/src/test/resources-binary/samples/test_filter_visa_pattern_plus_foo/sample.txt new file mode 100644 index 000000000..91a62555d --- /dev/null +++ b/integration-tests/src/test/resources-binary/samples/test_filter_visa_pattern_plus_foo/sample.txt @@ -0,0 +1 @@ +This file contains some words. diff --git a/integration-tests/src/test/resources-binary/samples/test_filter_visa_pattern_plus_foo/visa-number-with-foo.txt b/integration-tests/src/test/resources-binary/samples/test_filter_visa_pattern_plus_foo/visa-number-with-foo.txt new file mode 100644 index 000000000..80d20cc53 --- /dev/null +++ b/integration-tests/src/test/resources-binary/samples/test_filter_visa_pattern_plus_foo/visa-number-with-foo.txt @@ -0,0 +1,8 @@ +This file contains + +4012-8888-8888-1881 + +^^^ This is a fake VISA number. + +This will be indexed unless we check for a word which +is not inside this text, like "foo". diff --git a/integration-tests/src/test/resources-binary/samples/test_filter_visa_pattern_plus_foo/visa-number.txt b/integration-tests/src/test/resources-binary/samples/test_filter_visa_pattern_plus_foo/visa-number.txt new file mode 100644 index 000000000..1e2eb21ef --- /dev/null +++ b/integration-tests/src/test/resources-binary/samples/test_filter_visa_pattern_plus_foo/visa-number.txt @@ -0,0 +1,8 @@ +This file contains + +4012-8888-8888-1881 + +^^^ This is a fake VISA number. 
+ +This will be indexed unless we check for a word which +is not inside this text, like "f o o" without the spaces. diff --git a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java index 37325c115..a459f3a26 100644 --- a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java +++ b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java @@ -26,6 +26,7 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.Objects; @SuppressWarnings("SameParameterValue") public class Fs { @@ -33,6 +34,7 @@ public class Fs { private TimeValue updateRate = TimeValue.timeValueMinutes(15); private List includes = null; private List excludes = null; + private List filters = null; private boolean jsonSupport = false; private boolean filenameAsId = false; private boolean addFilesize = true; @@ -65,6 +67,7 @@ public static class Builder { private TimeValue updateRate = TimeValue.timeValueMinutes(15); private List includes = null; private List excludes = null; + private List filters = null; private boolean jsonSupport = false; private boolean filenameAsId = false; private boolean addFilesize = true; @@ -130,6 +133,24 @@ public Builder addExclude(String exclude) { return this; } + public Builder setFilters(List filters) { + this.filters = filters; + return this; + } + + public Builder addFilter(String filter) { + if (this.filters == null) { + this.filters = new ArrayList<>(); + } + + // We refuse to add duplicates + if (!this.filters.contains(filter)) { + this.filters.add(filter); + } + + return this; + } + public Builder setJsonSupport(boolean jsonSupport) { this.jsonSupport = jsonSupport; return this; @@ -221,7 +242,7 @@ public Builder setIgnoreAbove(ByteSizeValue ignoreAbove) { } public Fs build() { - return new Fs(url, updateRate, includes, excludes, jsonSupport, filenameAsId, addFilesize, + return new 
Fs(url, updateRate, includes, excludes, filters, jsonSupport, filenameAsId, addFilesize, removeDeleted, addAsInnerObject, storeSource, indexedChars, indexContent, attributesSupport, rawMetadata, checksum, xmlSupport, indexFolders, langDetect, continueOnError, pdfOcr, ocr, ignoreAbove); } @@ -231,7 +252,7 @@ public Fs( ) { } - private Fs(String url, TimeValue updateRate, List includes, List excludes, boolean jsonSupport, + private Fs(String url, TimeValue updateRate, List includes, List excludes, List filters, boolean jsonSupport, boolean filenameAsId, boolean addFilesize, boolean removeDeleted, boolean addAsInnerObject, boolean storeSource, Percentage indexedChars, boolean indexContent, boolean attributesSupport, boolean rawMetadata, String checksum, boolean xmlSupport, boolean indexFolders, boolean langDetect, boolean continueOnError, boolean pdfOcr, Ocr ocr, ByteSizeValue ignoreAbove) { @@ -239,6 +260,7 @@ private Fs(String url, TimeValue updateRate, List includes, List this.updateRate = updateRate; this.includes = includes; this.excludes = excludes; + this.filters = filters; this.jsonSupport = jsonSupport; this.filenameAsId = filenameAsId; this.addFilesize = addFilesize; @@ -291,6 +313,14 @@ public void setExcludes(List excludes) { this.excludes = excludes; } + public List getFilters() { + return filters; + } + + public void setFilters(List filters) { + this.filters = filters; + } + public boolean isJsonSupport() { return jsonSupport; } @@ -439,56 +469,36 @@ public void setIgnoreAbove(ByteSizeValue ignoreAbove) { public boolean equals(Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; - Fs fs = (Fs) o; - - if (jsonSupport != fs.jsonSupport) return false; - if (xmlSupport != fs.xmlSupport) return false; - if (indexFolders != fs.indexFolders) return false; - if (filenameAsId != fs.filenameAsId) return false; - if (addFilesize != fs.addFilesize) return false; - if (removeDeleted != fs.removeDeleted) return false; - if 
(addAsInnerObject != fs.addAsInnerObject) return false; - if (storeSource != fs.storeSource) return false; - if (indexContent != fs.indexContent) return false; - if (attributesSupport != fs.attributesSupport) return false; - if (rawMetadata != fs.rawMetadata) return false; - if (langDetect != fs.langDetect) return false; - if (continueOnError != fs.continueOnError) return false; - if (pdfOcr != fs.pdfOcr) return false; - if (ignoreAbove != fs.ignoreAbove) return false; - if (url != null ? !url.equals(fs.url) : fs.url != null) return false; - if (updateRate != null ? !updateRate.equals(fs.updateRate) : fs.updateRate != null) return false; - if (includes != null ? !includes.equals(fs.includes) : fs.includes != null) return false; - if (excludes != null ? !excludes.equals(fs.excludes) : fs.excludes != null) return false; - if (indexedChars != null ? !indexedChars.equals(fs.indexedChars) : fs.indexedChars != null) return false; - return checksum != null ? checksum.equals(fs.checksum) : fs.checksum == null; - + return jsonSupport == fs.jsonSupport && + filenameAsId == fs.filenameAsId && + addFilesize == fs.addFilesize && + removeDeleted == fs.removeDeleted && + addAsInnerObject == fs.addAsInnerObject && + storeSource == fs.storeSource && + indexContent == fs.indexContent && + attributesSupport == fs.attributesSupport && + rawMetadata == fs.rawMetadata && + xmlSupport == fs.xmlSupport && + indexFolders == fs.indexFolders && + langDetect == fs.langDetect && + continueOnError == fs.continueOnError && + pdfOcr == fs.pdfOcr && + Objects.equals(url, fs.url) && + Objects.equals(updateRate, fs.updateRate) && + Objects.equals(includes, fs.includes) && + Objects.equals(excludes, fs.excludes) && + Objects.equals(filters, fs.filters) && + Objects.equals(indexedChars, fs.indexedChars) && + Objects.equals(checksum, fs.checksum) && + Objects.equals(ocr, fs.ocr) && + Objects.equals(ignoreAbove, fs.ignoreAbove); } @Override public int hashCode() { - int result = url != null ? 
url.hashCode() : 0; - result = 31 * result + (updateRate != null ? updateRate.hashCode() : 0); - result = 31 * result + (includes != null ? includes.hashCode() : 0); - result = 31 * result + (excludes != null ? excludes.hashCode() : 0); - result = 31 * result + (jsonSupport ? 1 : 0); - result = 31 * result + (filenameAsId ? 1 : 0); - result = 31 * result + (addFilesize ? 1 : 0); - result = 31 * result + (removeDeleted ? 1 : 0); - result = 31 * result + (addAsInnerObject ? 1 : 0); - result = 31 * result + (storeSource ? 1 : 0); - result = 31 * result + (indexContent ? 1 : 0); - result = 31 * result + (indexedChars != null ? indexedChars.hashCode() : 0); - result = 31 * result + (attributesSupport ? 1 : 0); - result = 31 * result + (rawMetadata ? 1 : 0); - result = 31 * result + (xmlSupport ? 1 : 0); - result = 31 * result + (checksum != null ? checksum.hashCode() : 0); - result = 31 * result + (indexFolders ? 1 : 0); - result = 31 * result + (langDetect ? 1 : 0); - result = 31 * result + (continueOnError ? 1 : 0); - result = 31 * result + (pdfOcr ? 
1 : 0); - return result; + return Objects.hash(url, updateRate, includes, excludes, filters, jsonSupport, filenameAsId, addFilesize, + removeDeleted, addAsInnerObject, storeSource, indexContent, indexedChars, attributesSupport, rawMetadata, xmlSupport, checksum, indexFolders, langDetect, continueOnError, pdfOcr, ocr, ignoreAbove); } @Override @@ -497,6 +507,7 @@ public String toString() { ", updateRate=" + updateRate + ", includes=" + includes + ", excludes=" + excludes + + ", filters=" + filters + ", jsonSupport=" + jsonSupport + ", filenameAsId=" + filenameAsId + ", addFilesize=" + addFilesize + diff --git a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Ocr.java b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Ocr.java index 0742ec2de..ed3403e96 100644 --- a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Ocr.java +++ b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Ocr.java @@ -19,6 +19,8 @@ package fr.pilato.elasticsearch.crawler.fs.settings; +import java.util.Objects; + public class Ocr { // Language dictionary to be used. 
private String language = "eng"; @@ -108,4 +110,29 @@ public String getOutputType() { public void setOutputType(String outputType) { this.outputType = outputType; } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + Ocr ocr = (Ocr) o; + return Objects.equals(language, ocr.language) && + Objects.equals(path, ocr.path) && + Objects.equals(dataPath, ocr.dataPath) && + Objects.equals(outputType, ocr.outputType); + } + + @Override + public int hashCode() { + return Objects.hash(language, path, dataPath, outputType); + } + + @Override + public String toString() { + return "Ocr{" + "language='" + language + '\'' + + ", path='" + path + '\'' + + ", dataPath='" + dataPath + '\'' + + ", outputType='" + outputType + '\'' + + '}'; + } } diff --git a/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsMatchContentTest.java b/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsMatchContentTest.java new file mode 100644 index 000000000..6db19222b --- /dev/null +++ b/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsMatchContentTest.java @@ -0,0 +1,83 @@ +/* + * Licensed to David Pilato (the "Author") under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Author licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package fr.pilato.elasticsearch.crawler.fs.settings; + +import fr.pilato.elasticsearch.crawler.fs.test.framework.AbstractFSCrawlerTestCase; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import static fr.pilato.elasticsearch.crawler.fs.framework.FsCrawlerUtil.isIndexable; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.is; + +public class FsMatchContentTest extends AbstractFSCrawlerTestCase { + + @Test + public void testIncludeAndExcludeTextPattern() { + // Test with null or empty text + regexTester(null, null, true); + regexTester(null, new ArrayList<>(), true); + regexTester(null, Collections.singletonList("foo"), true); + regexTester("", null, true); + regexTester("", new ArrayList<>(), true); + regexTester("", Collections.singletonList("foo"), true); + + // Test with text + regexTester("foo bar", null, true); + regexTester("foo bar", new ArrayList<>(), true); + regexTester("foo bar", Collections.singletonList("^foo$"), false); + regexTester("foo bar", Collections.singletonList(".*foo.*"), true); + regexTester("foo bar", Collections.singletonList("^bar$"), false); + regexTester("foo bar", Collections.singletonList(".*bar.*"), true); + regexTester("foo bar", Arrays.asList(".*foo.*", ".*bar.*"), true); + regexTester("foo bar", Arrays.asList(".*foo.*", "^bar$"), false); + + regexTester("baz", Collections.singletonList("^foo$"), false); + regexTester("baz", Collections.singletonList(".*foo.*"), false); + regexTester("baz", Collections.singletonList("^bar$"), false); + regexTester("baz", Collections.singletonList(".*bar.*"), false); + regexTester("baz", Arrays.asList(".*foo.*", ".*bar.*"), false); + regexTester("baz", Arrays.asList(".*foo.*", "^bar$"), false); + + // Test with multi line text + String text = "This is 
containing foo as one of the words.\n" + + "Another line which contains bar also.\n"; + regexTester(text, Collections.singletonList("^foo$"), false); + regexTester(text, Collections.singletonList(".*foo.*"), true); + regexTester(text, Collections.singletonList("^bar$"), false); + regexTester(text, Collections.singletonList(".*bar.*"), true); + regexTester(text, Arrays.asList(".*foo.*", ".*bar.*"), true); + regexTester(text, Arrays.asList(".*foo.*", "^bar$"), false); + + // Test a Visa Credit Card pattern + regexTester("4012888888881881", Collections.singletonList("^4\\d{3}([\\ \\-]?)\\d{4}\\1\\d{4}\\1\\d{4}$"), true); + regexTester("4012 8888 8888 1881", Collections.singletonList("^4\\d{3}([\\ \\-]?)\\d{4}\\1\\d{4}\\1\\d{4}$"), true); + regexTester("4012-8888-8888-1881", Collections.singletonList("^4\\d{3}([\\ \\-]?)\\d{4}\\1\\d{4}\\1\\d{4}$"), true); + } + + private void regexTester(String input, List regexes, boolean expected) { + assertThat(regexes + " should " + (expected ? "" : "not ") + "match " + input, + isIndexable(input, regexes), is(expected)); + } +} diff --git a/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java b/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java index fefc4fc82..7fac2db91 100644 --- a/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java +++ b/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java @@ -49,6 +49,7 @@ public class FsSettingsParserTest extends AbstractFSCrawlerTestCase { .addExclude("resume.doc") .addInclude("*.doc") .addInclude("*.xls") + .addFilter("foo") .setFilenameAsId(true) .setIndexedChars(new Percentage(10000)) .setRemoveDeleted(true)