Skip to content

Commit

Permalink
Merge branch 'master' into wip/workplace_search
Browse files Browse the repository at this point in the history
# Conflicts:
#	.gitignore
#	cli/src/main/resources/log4j2.xml
#	core/src/main/java/fr/pilato/elasticsearch/crawler/fs/FsParserAbstract.java
#	docs/source/fscrawler.ini
#	pom.xml
  • Loading branch information
dadoonet committed Nov 13, 2020
2 parents 252c925 + 1d84d9f commit 6157be9
Show file tree
Hide file tree
Showing 20 changed files with 441 additions and 94 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ target
.idea
*.iml
/.run
/logs/
Original file line number Diff line number Diff line change
Expand Up @@ -23,21 +23,28 @@
import com.beust.jcommander.Parameter;
import fr.pilato.elasticsearch.crawler.fs.FsCrawlerImpl;
import fr.pilato.elasticsearch.crawler.fs.beans.FsJobFileHandler;
import fr.pilato.elasticsearch.crawler.fs.framework.FSCrawlerLogger;
import fr.pilato.elasticsearch.crawler.fs.framework.FsCrawlerUtil;
import fr.pilato.elasticsearch.crawler.fs.framework.MetaFileHandler;
import fr.pilato.elasticsearch.crawler.fs.framework.Version;
import fr.pilato.elasticsearch.crawler.fs.rest.RestServer;
import fr.pilato.elasticsearch.crawler.fs.settings.Elasticsearch;
import fr.pilato.elasticsearch.crawler.fs.settings.Fs;
import fr.pilato.elasticsearch.crawler.fs.settings.FsCrawlerValidator;
import fr.pilato.elasticsearch.crawler.fs.settings.FsSettings;
import fr.pilato.elasticsearch.crawler.fs.settings.FsSettingsFileHandler;
import fr.pilato.elasticsearch.crawler.fs.settings.FsSettingsParser;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.core.Filter;
import org.apache.logging.log4j.core.LoggerContext;
import org.apache.logging.log4j.core.appender.ConsoleAppender;
import org.apache.logging.log4j.core.config.Configuration;
import org.apache.logging.log4j.core.config.LoggerConfig;
import org.apache.logging.log4j.core.filter.LevelMatchFilter;
import org.apache.logging.log4j.core.filter.LevelRangeFilter;

import java.io.IOException;
import java.nio.file.NoSuchFileException;
Expand All @@ -46,9 +53,7 @@
import java.util.List;
import java.util.Scanner;

import static fr.pilato.elasticsearch.crawler.fs.framework.FsCrawlerUtil.copyDefaultResources;
import static fr.pilato.elasticsearch.crawler.fs.framework.FsCrawlerUtil.extractMajorVersion;
import static fr.pilato.elasticsearch.crawler.fs.framework.FsCrawlerUtil.readDefaultJsonVersionedFile;
import static fr.pilato.elasticsearch.crawler.fs.framework.FsCrawlerUtil.*;

/**
* Main entry point to launch FsCrawler
Expand Down Expand Up @@ -110,29 +115,33 @@ public static void main(String[] args) throws Exception {
if (commands.debug || commands.trace || commands.silent) {
LoggerContext ctx = (LoggerContext) LogManager.getContext(false);
Configuration config = ctx.getConfiguration();
LoggerConfig loggerConfig = config.getLoggerConfig(FsCrawlerCli.class.getPackage().getName());
LoggerConfig loggerConfig = config.getLoggerConfig("fr.pilato.elasticsearch.crawler.fs");
ConsoleAppender console = config.getAppender("Console");

if (commands.silent) {
// Check if the user also asked for --debug or --trace which is contradictory
if (commands.debug || commands.trace) {
logger.warn("--debug or --trace can't be used when --silent is set. Only silent mode will be activated.");
}
// If the user did not enter any job name, nothing will be displayed
if (commands.jobName == null) {
banner();
logger.warn("--silent is set but no job has been defined. Add a job name or remove --silent option. Exiting.");
jCommander.usage();
return;
}
// We change the full rootLogger level
LoggerConfig rootLogger = config.getLoggerConfig(LogManager.ROOT_LOGGER_NAME);
loggerConfig.setLevel(Level.OFF);
rootLogger.setLevel(Level.OFF);
// We don't write anything on the console anymore
console.addFilter(LevelMatchFilter.newBuilder().setLevel(Level.ALL).setOnMatch(Filter.Result.DENY).build());
} else {
loggerConfig.setLevel(commands.debug ? Level.DEBUG : Level.TRACE);
console.addFilter(LevelRangeFilter.createFilter(
commands.debug ? Level.DEBUG : Level.TRACE,
Level.ALL,
Filter.Result.DENY,
Filter.Result.ACCEPT));
}

loggerConfig.setLevel(commands.debug ? Level.DEBUG : Level.TRACE);
ctx.updateLoggers();
}

banner();

if (commands.help) {
jCommander.usage();
return;
Expand Down Expand Up @@ -162,23 +171,23 @@ public static void main(String[] args) throws Exception {
if (commands.jobName == null) {
// The user did not enter a job name.
// We can list available jobs for him
logger.info("No job specified. Here is the list of existing jobs:");
FSCrawlerLogger.console("No job specified. Here is the list of existing jobs:");

List<String> files = FsCrawlerJobsUtil.listExistingJobs(configDir);

if (!files.isEmpty()) {
for (int i = 0; i < files.size(); i++) {
logger.info("[{}] - {}", i+1, files.get(i));
FSCrawlerLogger.console("[{}] - {}", i+1, files.get(i));
}
int chosenFile = 0;
while (chosenFile <= 0 || chosenFile > files.size()) {
logger.info("Choose your job [1-{}]...", files.size());
FSCrawlerLogger.console("Choose your job [1-{}]...", files.size());
chosenFile = scanner.nextInt();
}
jobName = files.get(chosenFile - 1);
} else {
logger.info("No job exists in [{}].", configDir);
logger.info("To create your first job, run 'fscrawler job_name' with 'job_name' you want");
FSCrawlerLogger.console("No job exists in [{}].", configDir);
FSCrawlerLogger.console("To create your first job, run 'fscrawler job_name' with 'job_name' you want");
return;
}

Expand Down Expand Up @@ -210,18 +219,24 @@ public static void main(String[] args) throws Exception {
}

if (username != null && fsSettings.getElasticsearch().getPassword() == null) {
logger.info("Password for " + username + ":");
FSCrawlerLogger.console("Password for {}:", username);
String password = scanner.next();
fsSettings.getElasticsearch().setUsername(username);
fsSettings.getElasticsearch().setPassword(password);
}

} catch (NoSuchFileException e) {
logger.warn("job [{}] does not exist", jobName);
// We can only have a dialog with the end user if we are not silent
if (commands.silent) {
logger.error("job [{}] does not exist. Exiting as we are in silent mode.", jobName);
return;
}

FSCrawlerLogger.console("job [{}] does not exist", jobName);

String yesno = null;
while (!"y".equalsIgnoreCase(yesno) && !"n".equalsIgnoreCase(yesno)) {
logger.info("Do you want to create it (Y/N)?");
FSCrawlerLogger.console("Do you want to create it (Y/N)?");
yesno = scanner.next();
}

Expand All @@ -233,7 +248,7 @@ public static void main(String[] args) throws Exception {
fsSettingsFileHandler.write(fsSettings);

Path config = configDir.resolve(jobName).resolve(FsSettingsFileHandler.SETTINGS_YAML);
logger.info("Settings have been created in [{}]. Please review and edit before relaunch", config);
FSCrawlerLogger.console("Settings have been created in [{}]. Please review and edit before relaunch", config);
}

return;
Expand Down Expand Up @@ -293,6 +308,63 @@ public static void main(String[] args) throws Exception {
}
}

private final static int bannerLength = 100;

/**
* This is coming from: https://patorjk.com/software/taag/#p=display&f=3D%20Diagonal&t=FSCrawler
*/
private final static String asciiArt = "" +
" ,---,. .--.--. ,----.. ,--, \n" +
" ,' .' | / / '. / / \\ ,--.'| \n" +
",---.' || : /`. / | : : __ ,-. .---.| | : __ ,-.\n" +
"| | .'; | |--` . | ;. /,' ,'/ /| /. ./|: : ' ,' ,'/ /|\n" +
": : : | : ;_ . ; /--` ' | |' | ,--.--. .-'-. ' || ' | ,---. ' | |' |\n" +
": | |-, \\ \\ `. ; | ; | | ,'/ \\ /___/ \\: |' | | / \\ | | ,'\n" +
"| : ;/| `----. \\| : | ' : / .--. .-. | .-'.. ' ' .| | : / / |' : / \n" +
"| | .' __ \\ \\ |. | '___ | | ' \\__\\/: . ./___/ \\: '' : |__ . ' / || | ' \n" +
"' : ' / /`--' /' ; : .'|; : | ,\" .--.; |. \\ ' .\\ | | '.'|' ; /|; : | \n" +
"| | | '--'. / ' | '/ :| , ; / / ,. | \\ \\ ' \\ |; : ;' | / || , ; \n" +
"| : \\ `--'---' | : / ---' ; : .' \\ \\ \\ |--\" | , / | : | ---' \n" +
"| | ,' \\ \\ .' | , .-./ \\ \\ | ---`-' \\ \\ / \n" +
"`----' `---` `--`---' '---\" `----' \n";

private static void banner() {
    // Print the startup banner on the console logger: a framed ascii-art
    // logo followed by a few project tag lines, all assembled as one string
    // so it is emitted as a single console event.
    StringBuilder frame = new StringBuilder();
    frame.append(separatorLine(",", "."));
    frame.append(centerAsciiArt());
    frame.append(separatorLine("+", "+"));
    frame.append(bannerLine("You know, for Files!"));
    frame.append(bannerLine("Made from France with Love"));
    frame.append(bannerLine("Source: https://github.com/dadoonet/fscrawler/"));
    frame.append(bannerLine("Documentation: https://fscrawler.readthedocs.io/"));
    frame.append(separatorLine("`", "'"));
    FSCrawlerLogger.console(frame.toString());
}

private static String centerAsciiArt() {
    // Center each line of the ascii-art logo within the banner width and
    // stamp the current FSCrawler version flush against the right edge of
    // the first line.
    String[] artLines = StringUtils.split(asciiArt, '\n');

    String version = Version.getVersion();
    // Center the first line, drop its trailing padding, then right-pad so the
    // version (plus one trailing space column) ends exactly at bannerLength.
    String centeredFirst = StringUtils.stripEnd(StringUtils.center(artLines[0], bannerLength), null);
    artLines[0] = StringUtils.rightPad(centeredFirst, bannerLength - version.length() - 1) + version;

    StringBuilder framed = new StringBuilder();
    for (int i = 0; i < artLines.length; i++) {
        framed.append(bannerLine(artLines[i]));
    }
    return framed.toString();
}

private static String bannerLine(String text) {
    // One framed banner row: the text centered to the banner width,
    // enclosed in pipe characters, newline-terminated.
    String centered = StringUtils.center(text, bannerLength);
    return "|" + centered + "|\n";
}

private static String separatorLine(String first, String last) {
    // Horizontal frame rule: a run of dashes spanning the banner width,
    // with caller-chosen corner characters on each end.
    String dashes = StringUtils.center("", bannerLength, "-");
    return first + dashes + last + "\n";
}

private static void checkForDeprecatedResources(Path configDir, String elasticsearchVersion) throws IOException {
try {
// If we are able to read an old configuration file, we should tell the user to check the documentation
Expand Down
65 changes: 54 additions & 11 deletions cli/src/main/resources/log4j2.xml
Original file line number Diff line number Diff line change
@@ -1,28 +1,71 @@
<?xml version="1.0" encoding="UTF-8"?>
<Configuration status="fatal">
<Configuration status="fatal" monitorInterval="30">
<Properties>
<!-- If you want to change the log level for fscrawler.log file -->
<Property name="LOG_LEVEL">info</Property>
<!-- If you want to change the log level for documents.log file -->
<Property name="DOC_LEVEL">info</Property>
<!-- If you want to change the output dir for logs -->
<Property name="LOG_DIR">logs</Property>
</Properties>

<Appenders>
<Console name="CONSOLE" target="SYSTEM_OUT">
<PatternLayout pattern="%d{ABSOLUTE} %highlight{%-5p} [%c{1.}] %m%n"/>
<Console name="Console" target="SYSTEM_OUT" follow="true">
<PatternLayout pattern="%m%n"/>
</Console>

<RollingFile name="RollingFile" fileName="${sys:LOG_DIR}/fscrawler.log"
filePattern="${sys:LOG_DIR}/fscrawler-%d{yyyy-MM-dd}-%i.log.gz">
<PatternLayout pattern="%d{ABSOLUTE} %highlight{%-5p} [%c{1.}] %m%n"/>
<Policies>
<OnStartupTriggeringPolicy />
<SizeBasedTriggeringPolicy size="20 MB" />
<TimeBasedTriggeringPolicy />
</Policies>
<DefaultRolloverStrategy max="7"/>
</RollingFile>

<RollingFile name="Documents" fileName="${sys:LOG_DIR}/documents.log"
filePattern="${sys:LOG_DIR}/documents-%d{yyyy-MM-dd}.log.gz">
<PatternLayout pattern="%d [%highlight{%-5p}] %m%n"/>
<Policies>
<TimeBasedTriggeringPolicy />
</Policies>
<DefaultRolloverStrategy max="7"/>
</RollingFile>
</Appenders>
<Loggers>
<Logger name="fr.pilato.elasticsearch.crawler.fs" level="info" additivity="false">
<AppenderRef ref="CONSOLE"/>
<!-- This logger is used for the console -->
<Logger name="fscrawler.console" level="info" additivity="false">
<AppenderRef ref="Console" />
</Logger>

<!-- This logger is used to trace all information about documents -->
<Logger name="fscrawler.document" level="${sys:DOC_LEVEL}" additivity="false">
<AppenderRef ref="Documents" />
</Logger>

<!-- This logger is used to log FSCrawler code execution -->
<Logger name="fr.pilato.elasticsearch.crawler.fs" level="${sys:LOG_LEVEL}" additivity="false">
<AppenderRef ref="RollingFile" />
</Logger>

<!-- This logger is used to log 3rd party libs execution -->
<Logger name="org.elasticsearch" level="warn" additivity="false">
<AppenderRef ref="CONSOLE"/>
<AppenderRef ref="RollingFile" />
</Logger>
<Logger name="org.glassfish" level="warn" additivity="false">
<AppenderRef ref="CONSOLE"/>
<AppenderRef ref="RollingFile" />
</Logger>
<Logger name="org.apache.tika.parser.ocr.TesseractOCRParser" level="error" additivity="false">
<AppenderRef ref="CONSOLE"/>
<AppenderRef ref="RollingFile" />
</Logger>
<Logger name="com.gargoylesoftware" level="error" additivity="false">
<AppenderRef ref="CONSOLE"/>
<AppenderRef ref="RollingFile"/>
</Logger>
<Root level="info">
<AppenderRef ref="CONSOLE"/>

<Root level="warn">
<AppenderRef ref="RollingFile" />
</Root>
</Loggers>
</Configuration>
1 change: 1 addition & 0 deletions contrib/docker-compose-example/.env
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
ELASTIC_VERSION=7.10.0
6 changes: 3 additions & 3 deletions contrib/docker-compose-example/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
version: '3'
services:
es01:
image: docker.elastic.co/elasticsearch/elasticsearch:7.8.0
image: docker.elastic.co/elasticsearch/elasticsearch:$ELASTIC_VERSION
container_name: es01
environment:
- node.name=es01
Expand All @@ -23,7 +23,7 @@ services:
- fscrawler_net

es02:
image: docker.elastic.co/elasticsearch/elasticsearch:7.8.0
image: docker.elastic.co/elasticsearch/elasticsearch:$ELASTIC_VERSION
container_name: es02
environment:
- node.name=es02
Expand All @@ -45,7 +45,7 @@ services:
- fscrawler_net

kib01:
image: docker.elastic.co/kibana/kibana:7.8.0
image: docker.elastic.co/kibana/kibana:$ELASTIC_VERSION
container_name: kib01
ports:
- 5601:5601
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import fr.pilato.elasticsearch.crawler.fs.crawler.FileAbstractModel;
import fr.pilato.elasticsearch.crawler.fs.crawler.FileAbstractor;
import fr.pilato.elasticsearch.crawler.fs.framework.ByteSizeValue;
import fr.pilato.elasticsearch.crawler.fs.framework.FSCrawlerLogger;
import fr.pilato.elasticsearch.crawler.fs.framework.OsValidator;
import fr.pilato.elasticsearch.crawler.fs.framework.SignTool;
import fr.pilato.elasticsearch.crawler.fs.service.FsCrawlerDocumentService;
Expand Down Expand Up @@ -379,13 +380,13 @@ private void indexFile(FileAbstractModel fileAbstractModel, ScanStatistic stats,
final long size = fileAbstractModel.getSize();

logger.debug("fetching content from [{}],[{}]", dirname, filename);
String fullFilename = new File(dirname, filename).toString();

try {
// Create the Doc object (only needed when we have add_as_inner_object: true (default) or when we don't index json or xml)
String id = generateIdFromFilename(filename, dirname);
if (fsSettings.getFs().isAddAsInnerObject() || (!fsSettings.getFs().isJsonSupport() && !fsSettings.getFs().isXmlSupport())) {

String fullFilename = new File(dirname, filename).toString();

Doc doc = new Doc();

Expand Down Expand Up @@ -437,6 +438,9 @@ private void indexFile(FileAbstractModel fileAbstractModel, ScanStatistic stats,
// We index the data structure
if (isIndexable(doc.getContent(), fsSettings.getFs().getFilters())) {
if (!closed) {
FSCrawlerLogger.documentDebug(id,
computeVirtualPathName(stats.getRootPath(), fullFilename),
"Indexing content");
documentService.index(
fsSettings.getElasticsearch().getIndex(),
id,
Expand All @@ -452,6 +456,9 @@ private void indexFile(FileAbstractModel fileAbstractModel, ScanStatistic stats,
}
} else {
if (fsSettings.getFs().isJsonSupport()) {
FSCrawlerLogger.documentDebug(generateIdFromFilename(filename, dirname),
computeVirtualPathName(stats.getRootPath(), fullFilename),
"Indexing json content");
// We index the json content directly
if (!closed) {
documentService.indexRawJson(
Expand All @@ -464,6 +471,9 @@ private void indexFile(FileAbstractModel fileAbstractModel, ScanStatistic stats,
fsSettings.getElasticsearch().getIndex(), id);
}
} else if (fsSettings.getFs().isXmlSupport()) {
FSCrawlerLogger.documentDebug(generateIdFromFilename(filename, dirname),
computeVirtualPathName(stats.getRootPath(), fullFilename),
"Indexing xml content");
// We index the xml content directly (after transformation to json)
if (!closed) {
documentService.indexRawJson(
Expand Down
7 changes: 7 additions & 0 deletions distribution/src/main/assembly/assembly.xml
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,12 @@
<include>README.md</include>
</includes>
</fileSet>
<fileSet>
<directory>${project.parent.basedir}/../cli/src/main/resources</directory>
<outputDirectory>config</outputDirectory>
<includes>
<include>log4j2.xml</include>
</includes>
</fileSet>
</fileSets>
</assembly>
Loading

0 comments on commit 6157be9

Please sign in to comment.