Skip to content

Commit

Permalink
Minor fixes on top of pr/64
Browse files Browse the repository at this point in the history
  • Loading branch information
yasserg committed May 21, 2015
1 parent df0214a commit 608e5d2
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 32 deletions.
16 changes: 8 additions & 8 deletions src/main/java/edu/uci/ics/crawler4j/crawler/CrawlConfig.java
Original file line number Diff line number Diff line change
Expand Up @@ -115,10 +115,10 @@ public class CrawlConfig {
* Should we follow redirects?
*/
private boolean followRedirects = true;

/**
* Should the TLD list be updated automatically on each run? Alternatively,
* it can be loaded from the embedded tld-names.zip file that was obtained from
* it can be loaded from the embedded tld-names.txt resource file that was obtained from
* https://publicsuffix.org/list/effective_tld_names.dat
*/
private boolean onlineTldListUpdate = false;
Expand Down Expand Up @@ -378,18 +378,18 @@ public boolean isFollowRedirects() {
public void setFollowRedirects(boolean followRedirects) {
this.followRedirects = followRedirects;
}

public boolean isOnlineTldListUpdate() {
return onlineTldListUpdate;
return onlineTldListUpdate;
}

/**
* Should the TLD list be updated automatically on each run? Alternatively,
* it can be loaded from the embedded tld-names.zip file that was obtained from
* https://publicsuffix.org/list/effective_tld_names.dat
* it can be loaded from the embedded tld-names.txt resource file that was
* obtained from https://publicsuffix.org/list/effective_tld_names.dat
*/
public void setOnlineTldListUpdate(boolean online) {
onlineTldListUpdate = online;
onlineTldListUpdate = online;
}

public String getProxyHost() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,7 @@ public CrawlController(CrawlConfig config, PageFetcher pageFetcher, RobotstxtSer
}

TLDList.setUseOnline(config.isOnlineTldListUpdate());
TLDList.getInstance();


boolean resumable = config.isResumableCrawling();

EnvironmentConfig envConfig = new EnvironmentConfig();
Expand Down
40 changes: 18 additions & 22 deletions src/main/java/edu/uci/ics/crawler4j/url/TLDList.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
Expand All @@ -25,14 +24,13 @@ public class TLDList {
private static final String TLD_NAMES_TXT_FILENAME = "/tld-names.txt";
private static final Logger logger = LoggerFactory.getLogger(TLDList.class);

private static boolean online_update = false;
private static boolean onlineUpdate = false;
private final Set<String> tldSet = new HashSet<>(10000);

private static final TLDList instance = new TLDList(); // Singleton

private TLDList() {
if (online_update)
{
if (onlineUpdate) {
URL url;
try {
url = new URL(TLD_NAMES_ONLINE_URL);
Expand All @@ -41,42 +39,38 @@ private TLDList() {
logger.error("Invalid URL: {}", TLD_NAMES_ONLINE_URL);
throw new RuntimeException(e);
}

try (InputStream stream = url.openStream()) {
logger.debug("Fetching the most updated TLD list online");
int n = readStream(stream);
logger.info("Obtained {} TLD from URL {}", n, TLD_NAMES_ONLINE_URL);
return;
} catch (Exception ex) {
logger.error("Couldn't fetch the online list of TLDs from: {}", TLD_NAMES_ONLINE_URL);
} catch (Exception e) {
logger.error("Couldn't fetch the online list of TLDs from: {}", TLD_NAMES_ONLINE_URL, e);
}
}

File f = new File(TLD_NAMES_TXT_FILENAME);
if (f.exists()) {
logger.debug("Fetching the list from a local file {}", TLD_NAMES_TXT_FILENAME);
try (InputStream tldFile = new FileInputStream(f)) {
int n = readStream(tldFile);
logger.info("Obtained {} TLD from local file {}", n, TLD_NAMES_TXT_FILENAME);
return;
}
catch (FileNotFoundException e)
{} // Should not happen as we just checked this
catch (IOException e) {
logger.error("Couldn't read the TLD list from local file");
} catch (IOException e) {
logger.error("Couldn't read the TLD list from local file", e);
}
}
try (InputStream tldFile = this.getClass().getClassLoader().getResourceAsStream(TLD_NAMES_TXT_FILENAME)) {
try (InputStream tldFile = getClass().getClassLoader().getResourceAsStream(TLD_NAMES_TXT_FILENAME)) {
int n = readStream(tldFile);
logger.info("Obtained {} TLD from packaged file {}", n, TLD_NAMES_TXT_FILENAME);
} catch (IOException e) {
logger.error("Couldn't read the TLD list from file");
throw new RuntimeException(e);
}
}

private int readStream(InputStream stream)
{

private int readStream(InputStream stream) {
try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream))) {
String line;
while ((line = reader.readLine()) != null) {
Expand All @@ -86,9 +80,7 @@ private int readStream(InputStream stream)
}
tldSet.add(line);
}
}
catch (IOException e)
{
} catch (IOException e) {
logger.warn("Error while reading TLD-list: {}", e.getMessage());
}
return tldSet.size();
Expand All @@ -97,9 +89,13 @@ private int readStream(InputStream stream)
public static TLDList getInstance() {
return instance;
}


/**
* If {@code online} is set to true, the list of TLDs will be downloaded and refreshed; otherwise the copy
* cached in src/main/resources/tld-names.txt will be used.
*/
public static void setUseOnline(boolean online) {
online_update = online;
onlineUpdate = online;
}

public boolean contains(String str) {
Expand Down

0 comments on commit 608e5d2

Please sign in to comment.