Commit 6ffe56b: [style] re-format code
yasserg committed Feb 4, 2015 (1 parent: 516799e)
Showing 46 changed files with 467 additions and 389 deletions.
src/main/java/edu/uci/ics/crawler4j/crawler/CrawlConfig.java (8 changes: 4 additions & 4 deletions)

@@ -17,11 +17,11 @@

package edu.uci.ics.crawler4j.crawler;

import edu.uci.ics.crawler4j.crawler.authentication.AuthInfo;

import java.util.ArrayList;
import java.util.List;

import edu.uci.ics.crawler4j.crawler.authentication.AuthInfo;

public class CrawlConfig {

/**
@@ -133,8 +133,8 @@ public class CrawlConfig {
private String proxyPassword = null;

/**
* List of possible authentications needed by crawler
*/
* List of possible authentications needed by crawler
*/
private List<AuthInfo> authInfos;

public CrawlConfig() {
src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java (30 changes: 19 additions & 11 deletions)

@@ -17,21 +17,23 @@

package edu.uci.ics.crawler4j.crawler;

import java.io.File;
import java.util.ArrayList;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;

import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.frontier.DocIDServer;
import edu.uci.ics.crawler4j.frontier.Frontier;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import edu.uci.ics.crawler4j.url.WebURL;
import edu.uci.ics.crawler4j.util.IO;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.util.ArrayList;
import java.util.List;

/**
* The controller that manages a crawling session. This class creates the
@@ -74,14 +76,16 @@ public class CrawlController extends Configurable {
protected final Object waitingLock = new Object();
protected final Environment env;

public CrawlController(CrawlConfig config, PageFetcher pageFetcher, RobotstxtServer robotstxtServer) throws Exception {
public CrawlController(CrawlConfig config, PageFetcher pageFetcher, RobotstxtServer robotstxtServer)
throws Exception {
super(config);

config.validate();
File folder = new File(config.getCrawlStorageFolder());
if (!folder.exists()) {
if (!folder.mkdirs()) {
throw new Exception("couldn't create the storage folder: " + folder.getAbsolutePath() + " does it already exist ?");
throw new Exception(
"couldn't create the storage folder: " + folder.getAbsolutePath() + " does it already exist ?");
} else {
logger.debug("Created folder: " + folder.getAbsolutePath());
}
@@ -214,7 +218,9 @@ public void run() {
if (queueLength > 0) {
continue;
}
logger.info("No thread is working and no more URLs are in queue waiting for another 10 seconds to make sure...");
logger.info(
"No thread is working and no more URLs are in queue waiting for another 10 seconds to make " +
"sure...");
sleep(10);
queueLength = frontier.getQueueLength();
if (queueLength > 0) {
@@ -282,7 +288,8 @@ public void waitUntilFinish() {
}

/**
* Once the crawling session finishes the controller collects the local data of the crawler threads and stores them in a List.
* Once the crawling session finishes the controller collects the local data of the crawler threads and stores them
* in a List.
* This function returns the reference to this list.
*
* @return List of Objects which are your local data
@@ -354,7 +361,8 @@ public void addSeed(String pageUrl, int docId) {
webUrl.setDocid(docId);
webUrl.setDepth((short) 0);
if (!robotstxtServer.allows(webUrl)) {
logger.warn("Robots.txt does not allow this seed: {}", pageUrl); // using the WARN level here, as the user specifically asked to add this seed
logger.warn("Robots.txt does not allow this seed: {}",
pageUrl); // using the WARN level here, as the user specifically asked to add this seed
} else {
frontier.schedule(webUrl);
}
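For orientation, the hunks above cover the normal setup path for a crawl: the three-argument constructor, storage-folder validation, and robots.txt-checked seeding. A minimal usage sketch, assuming the usual crawler4j wiring (RobotstxtConfig, the single-argument addSeed, and the start(Class, int) overload are taken from the rest of the library, not from this diff; MyCrawler stands in for any WebCrawler subclass, see the sketch after the WebCrawler.java section below):

    import edu.uci.ics.crawler4j.crawler.CrawlConfig;
    import edu.uci.ics.crawler4j.crawler.CrawlController;
    import edu.uci.ics.crawler4j.fetcher.PageFetcher;
    import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
    import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

    public class CrawlerMain {
        public static void main(String[] args) throws Exception {
            CrawlConfig config = new CrawlConfig();
            config.setCrawlStorageFolder("/tmp/crawl-root"); // created by the constructor above if missing
            PageFetcher pageFetcher = new PageFetcher(config);
            RobotstxtServer robotstxtServer = new RobotstxtServer(new RobotstxtConfig(), pageFetcher);

            CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
            controller.addSeed("http://www.ics.uci.edu/"); // seeds are checked against robots.txt, as shown above
            controller.start(MyCrawler.class, 7);          // 7 crawler threads; blocks until the crawl finishes
        }
    }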
src/main/java/edu/uci/ics/crawler4j/crawler/Page.java (12 changes: 6 additions & 6 deletions)

@@ -40,8 +40,8 @@ public class Page {
protected WebURL url;

/**
* Redirection flag
*/
* Redirection flag
*/
protected boolean redirect;

/**
@@ -50,8 +50,8 @@
protected String redirectedToUrl;

/**
* Status of the page
*/
* Status of the page
*/
protected int statusCode;

/**
@@ -78,8 +78,8 @@
protected String contentCharset;

/**
* Language of the Content.
*/
* Language of the Content.
*/
private String language;

/**
src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java (103 changes: 53 additions & 50 deletions)

@@ -17,6 +17,13 @@

package edu.uci.ics.crawler4j.crawler;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import org.apache.http.HttpStatus;
import org.apache.http.impl.EnglishReasonPhraseCatalog;

import edu.uci.ics.crawler4j.crawler.exceptions.ContentFetchException;
import edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException;
import edu.uci.ics.crawler4j.crawler.exceptions.ParseException;
@@ -30,18 +37,10 @@
import edu.uci.ics.crawler4j.parser.Parser;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.WebURL;

import org.apache.http.HttpStatus;

import org.apache.http.impl.EnglishReasonPhraseCatalog;
import uk.org.lidalia.slf4jext.Level;
import uk.org.lidalia.slf4jext.Logger;
import uk.org.lidalia.slf4jext.LoggerFactory;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

/**
* WebCrawler class in the Runnable class that is executed by each crawler thread.
*
@@ -168,26 +167,26 @@ protected void handlePageStatusCode(WebURL webUrl, int statusCode, String status
// Sub-classed can override this to add their custom functionality
}

/**
* This function is called before processing of the page's URL
* It can be overridden by subclasses for tweaking of the url before processing it.
* For example, http://abc.com/def?a=123 - http://abc.com/def
*
* @param curURL current URL which can be tweaked before processing
* @return tweaked WebURL
*/
protected WebURL handleUrlBeforeProcess(WebURL curURL) {
return curURL;
}
/**
* This function is called before processing of the page's URL
* It can be overridden by subclasses for tweaking of the url before processing it.
* For example, http://abc.com/def?a=123 - http://abc.com/def
*
* @param curURL current URL which can be tweaked before processing
* @return tweaked WebURL
*/
protected WebURL handleUrlBeforeProcess(WebURL curURL) {
return curURL;
}

/**
* This function is called if the content of a url is bigger than allowed size.
*
* @param urlStr - The URL which it's content is bigger than allowed size
*/
protected void onPageBiggerThanMaxSize(String urlStr, long pageSize) {
logger.warn("Skipping a URL: {} which was bigger ( {} ) than max allowed size", urlStr, pageSize);
}
/**
* This function is called if the content of a url is bigger than allowed size.
*
* @param urlStr - The URL which it's content is bigger than allowed size
*/
protected void onPageBiggerThanMaxSize(String urlStr, long pageSize) {
logger.warn("Skipping a URL: {} which was bigger ( {} ) than max allowed size", urlStr, pageSize);
}

/**
* This function is called if the crawler encountered an unexpected http status code ( a status code other than 3xx)
@@ -197,11 +196,11 @@ protected void onPageBiggerThanMaxSize(String urlStr, long pageSize) {
* @param contentType Type of Content
* @param description Error Description
*/
protected void onUnexpectedStatusCode(String urlStr, int statusCode, String contentType, String description) {
logger.warn("Skipping URL: {}, StatusCode: {}, {}, {}", urlStr, statusCode, contentType, description);
// Do nothing by default (except basic logging)
// Sub-classed can override this to add their custom functionality
}
protected void onUnexpectedStatusCode(String urlStr, int statusCode, String contentType, String description) {
logger.warn("Skipping URL: {}, StatusCode: {}, {}, {}", urlStr, statusCode, contentType, description);
// Do nothing by default (except basic logging)
// Sub-classed can override this to add their custom functionality
}

/**
* This function is called if the content of a url could not be fetched.
@@ -271,18 +270,18 @@ public void run() {
}

/**
* Classes that extends WebCrawler should overwrite this function to tell the
* crawler whether the given url should be crawled or not. The following
* implementation indicates that all urls should be included in the crawl.
*
* @param url
* the url which we are interested to know whether it should be
* included in the crawl or not.
* @param page
* Page context from which this URL was scraped
* @return if the url should be included in the crawl it returns true,
* otherwise false is returned.
*/
* Classes that extends WebCrawler should overwrite this function to tell the
* crawler whether the given url should be crawled or not. The following
* implementation indicates that all urls should be included in the crawl.
*
* @param url
* the url which we are interested to know whether it should be
* included in the crawl or not.
* @param page
* Page context from which this URL was scraped
* @return if the url should be included in the crawl it returns true,
* otherwise false is returned.
*/
public boolean shouldVisit(Page page, WebURL url) {
return true;
}
@@ -308,15 +307,17 @@ private void processPage(WebURL curURL) {

fetchResult = pageFetcher.fetchPage(curURL);
int statusCode = fetchResult.getStatusCode();
handlePageStatusCode(curURL, statusCode, EnglishReasonPhraseCatalog.INSTANCE.getReason(statusCode, Locale.ENGLISH)); // Finds the status reason for all known statuses
handlePageStatusCode(curURL, statusCode, EnglishReasonPhraseCatalog.INSTANCE
.getReason(statusCode, Locale.ENGLISH)); // Finds the status reason for all known statuses

Page page = new Page(curURL);
page.setFetchResponseHeaders(fetchResult.getResponseHeaders());
page.setStatusCode(statusCode);
if (statusCode != HttpStatus.SC_OK) { // Not 200
if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY
|| statusCode == HttpStatus.SC_MULTIPLE_CHOICES || statusCode == HttpStatus.SC_SEE_OTHER
|| statusCode == HttpStatus.SC_TEMPORARY_REDIRECT || statusCode == 308) { // is 3xx todo follow https://issues.apache.org/jira/browse/HTTPCORE-389
if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY ||
statusCode == HttpStatus.SC_MULTIPLE_CHOICES || statusCode == HttpStatus.SC_SEE_OTHER ||
statusCode == HttpStatus.SC_TEMPORARY_REDIRECT ||
statusCode == 308) { // is 3xx todo follow https://issues.apache.org/jira/browse/HTTPCORE-389

page.setRedirect(true);
if (myController.getConfig().isFollowRedirects()) {
Expand Down Expand Up @@ -350,8 +351,10 @@ private void processPage(WebURL curURL) {
}
}
} else { // All other http codes other than 3xx & 200
String description = EnglishReasonPhraseCatalog.INSTANCE.getReason(fetchResult.getStatusCode(), Locale.ENGLISH); // Finds the status reason for all known statuses
String contentType = fetchResult.getEntity() == null ? "" : fetchResult.getEntity().getContentType().getValue();
String description = EnglishReasonPhraseCatalog.INSTANCE
.getReason(fetchResult.getStatusCode(), Locale.ENGLISH); // Finds the status reason for all known statuses
String contentType =
fetchResult.getEntity() == null ? "" : fetchResult.getEntity().getContentType().getValue();
onUnexpectedStatusCode(curURL.getURL(), fetchResult.getStatusCode(), contentType, description);
}

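The methods touched above (shouldVisit, handleUrlBeforeProcess, and the on* callbacks) are the documented extension points of WebCrawler. A sketch of a subclass exercising two of them; the single-domain filter and the query-string stripping are illustrative choices, not part of this commit:

    import java.util.Locale;

    import edu.uci.ics.crawler4j.crawler.Page;
    import edu.uci.ics.crawler4j.crawler.WebCrawler;
    import edu.uci.ics.crawler4j.url.WebURL;

    public class MyCrawler extends WebCrawler {

        @Override
        public boolean shouldVisit(Page page, WebURL url) {
            // Stay within one site; example.com is a placeholder domain.
            return url.getURL().toLowerCase(Locale.ENGLISH).startsWith("http://www.example.com/");
        }

        @Override
        protected WebURL handleUrlBeforeProcess(WebURL curURL) {
            // Drop the query string, per the Javadoc example above
            // (http://abc.com/def?a=123 becomes http://abc.com/def).
            int queryStart = curURL.getURL().indexOf('?');
            if (queryStart >= 0) {
                curURL.setURL(curURL.getURL().substring(0, queryStart));
            }
            return curURL;
        }
    }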
src/main/java/edu/uci/ics/crawler4j/crawler/authentication/AuthInfo.java

@@ -1,20 +1,23 @@
package edu.uci.ics.crawler4j.crawler.authentication;

import javax.swing.text.html.FormSubmitEvent.MethodType;
import java.net.MalformedURLException;
import java.net.URL;

import javax.swing.text.html.FormSubmitEvent.MethodType;

/**
* Created by Avi Hayun on 11/23/2014.
*
* Abstract class containing authentication information needed to login into a user/password protected site<br>
* This class should be extended by specific authentication types like form authentication and basic authentication etc<br>
* This class should be extended by specific authentication types like form authentication and basic authentication
* etc<br>
* <br>
* This class contains all of the mutual authentication data for all authentication types
*/
public abstract class AuthInfo {
public enum AuthenticationType {
BASIC_AUTHENTICATION, FORM_AUTHENTICATION
BASIC_AUTHENTICATION,
FORM_AUTHENTICATION
}

protected AuthenticationType authenticationType;
@@ -41,7 +44,8 @@ public AuthInfo() {
*
* @throws MalformedURLException Make sure your URL is valid
*/
protected AuthInfo(AuthenticationType authenticationType, MethodType httpMethod, String loginUrl, String username, String password) throws MalformedURLException {
protected AuthInfo(AuthenticationType authenticationType, MethodType httpMethod, String loginUrl, String username,
String password) throws MalformedURLException {
this.authenticationType = authenticationType;
this.httpMethod = httpMethod;
URL url = new URL(loginUrl);
src/main/java/edu/uci/ics/crawler4j/crawler/authentication/BasicAuthInfo.java

@@ -1,12 +1,14 @@
package edu.uci.ics.crawler4j.crawler.authentication;

import javax.swing.text.html.FormSubmitEvent.MethodType;
import java.net.MalformedURLException;

import javax.swing.text.html.FormSubmitEvent.MethodType;

/**
* Created by Avi Hayun on 11/25/2014.
*
* BasicAuthInfo contains the authentication information needed for BASIC authentication (extending AuthInfo which has all common auth info in it)
* BasicAuthInfo contains the authentication information needed for BASIC authentication (extending AuthInfo which
* has all common auth info in it)
*
* BASIC authentication in PHP:
* <ul>
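The hunk above only shows the class Javadoc. Assuming the conventional three-argument constructor (username, password, login URL) and an addAuthInfo accessor on CrawlConfig for the authInfos list shown earlier, neither of which appears in this diff, wiring BASIC credentials in would look roughly like:

    // Sketch under assumptions: verify the constructor and addAuthInfo(...) in your checkout.
    // The constructor may throw MalformedURLException for an invalid login URL.
    AuthInfo basicAuth = new BasicAuthInfo("user", "secret", "http://example.com/protected/");
    config.addAuthInfo(basicAuth); // config is the CrawlConfig passed to CrawlController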
src/main/java/edu/uci/ics/crawler4j/crawler/authentication/FormAuthInfo.java

@@ -1,13 +1,16 @@
package edu.uci.ics.crawler4j.crawler.authentication;

import javax.swing.text.html.FormSubmitEvent.MethodType;
import java.net.MalformedURLException;

import javax.swing.text.html.FormSubmitEvent.MethodType;

/**
* Created by Avi Hayun on 11/25/2014.
*
* FormAuthInfo contains the authentication information needed for FORM authentication (extending AuthInfo which has all common auth info in it)
* Basically, this is the most common authentication, where you will get to a site and you will need to enter a username and password into an HTML form
* FormAuthInfo contains the authentication information needed for FORM authentication (extending AuthInfo which has
* all common auth info in it)
* Basically, this is the most common authentication, where you will get to a site and you will need to enter a
* username and password into an HTML form
*/
public class FormAuthInfo extends AuthInfo {

@@ -25,7 +28,8 @@ public class FormAuthInfo extends AuthInfo {
*
* @throws MalformedURLException Make sure your URL is valid
*/
public FormAuthInfo(String username, String password, String loginUrl, String usernameFormStr, String passwordFormStr) throws MalformedURLException {
public FormAuthInfo(String username, String password, String loginUrl, String usernameFormStr, String passwordFormStr)
throws MalformedURLException {
super(AuthenticationType.FORM_AUTHENTICATION, MethodType.POST, loginUrl, username, password);

this.usernameFormStr = usernameFormStr;
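The wrapped constructor above gives the full parameter list for form-based login. A usage sketch with placeholder values (the two form-field names must match the name attributes of the login form's HTML inputs; addAuthInfo(...) is assumed, as noted above):

    // Throws MalformedURLException if the login URL is invalid.
    FormAuthInfo formAuth = new FormAuthInfo(
        "myUser", "myPass",             // credentials posted to the form
        "http://example.com/login.php", // login form URL (placeholder)
        "username", "password");        // HTML input names for the user and password fields
    config.addAuthInfo(formAuth);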
src/main/java/edu/uci/ics/crawler4j/crawler/exceptions/RedirectException.java

@@ -5,7 +5,8 @@
/**
* Created by Avi Hayun on 12/8/2014.
*
* Occurs when the crawler encounters a Redirect problem, like redirecting to a visited-already page, or redirecting to nothing
* Occurs when the crawler encounters a Redirect problem, like redirecting to a visited-already page, or redirecting
* to nothing
*/
public class RedirectException extends Exception {
public Level level;
(Diff truncated: the remaining changed files are not shown.)
