Commit 6ffe56b: [style] re-format code
yasserg committed Feb 4, 2015 (1 parent: 516799e)
Showing 46 changed files with 467 additions and 389 deletions.
src/main/java/edu/uci/ics/crawler4j/crawler/CrawlConfig.java (8 changes: 4 additions & 4 deletions)

@@ -17,11 +17,11 @@

package edu.uci.ics.crawler4j.crawler;

import edu.uci.ics.crawler4j.crawler.authentication.AuthInfo;

import java.util.ArrayList;
import java.util.List;

import edu.uci.ics.crawler4j.crawler.authentication.AuthInfo;

public class CrawlConfig {

/**
@@ -133,8 +133,8 @@ public class CrawlConfig {
private String proxyPassword = null;

/**
* List of possible authentications needed by crawler
*/
* List of possible authentications needed by crawler
*/
private List<AuthInfo> authInfos;

public CrawlConfig() {
src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java (30 changes: 19 additions & 11 deletions)

@@ -17,21 +17,23 @@

package edu.uci.ics.crawler4j.crawler;

import java.io.File;
import java.util.ArrayList;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;

import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.frontier.DocIDServer;
import edu.uci.ics.crawler4j.frontier.Frontier;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import edu.uci.ics.crawler4j.url.WebURL;
import edu.uci.ics.crawler4j.util.IO;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.util.ArrayList;
import java.util.List;

/**
* The controller that manages a crawling session. This class creates the
@@ -74,14 +76,16 @@ public class CrawlController extends Configurable {
protected final Object waitingLock = new Object();
protected final Environment env;

public CrawlController(CrawlConfig config, PageFetcher pageFetcher, RobotstxtServer robotstxtServer) throws Exception {
public CrawlController(CrawlConfig config, PageFetcher pageFetcher, RobotstxtServer robotstxtServer)
throws Exception {
super(config);

config.validate();
File folder = new File(config.getCrawlStorageFolder());
if (!folder.exists()) {
if (!folder.mkdirs()) {
throw new Exception("couldn't create the storage folder: " + folder.getAbsolutePath() + " does it already exist ?");
throw new Exception(
"couldn't create the storage folder: " + folder.getAbsolutePath() + " does it already exist ?");
} else {
logger.debug("Created folder: " + folder.getAbsolutePath());
}
@@ -214,7 +218,9 @@ public void run() {
if (queueLength > 0) {
continue;
}
logger.info("No thread is working and no more URLs are in queue waiting for another 10 seconds to make sure...");
logger.info(
"No thread is working and no more URLs are in queue waiting for another 10 seconds to make " +
"sure...");
sleep(10);
queueLength = frontier.getQueueLength();
if (queueLength > 0) {
@@ -282,7 +288,8 @@ public void waitUntilFinish() {
}

/**
* Once the crawling session finishes the controller collects the local data of the crawler threads and stores them in a List.
* Once the crawling session finishes the controller collects the local data of the crawler threads and stores them
* in a List.
* This function returns the reference to this list.
*
* @return List of Objects which are your local data
@@ -354,7 +361,8 @@ public void addSeed(String pageUrl, int docId) {
webUrl.setDocid(docId);
webUrl.setDepth((short) 0);
if (!robotstxtServer.allows(webUrl)) {
logger.warn("Robots.txt does not allow this seed: {}", pageUrl); // using the WARN level here, as the user specifically asked to add this seed
logger.warn("Robots.txt does not allow this seed: {}",
pageUrl); // using the WARN level here, as the user specifically asked to add this seed
} else {
frontier.schedule(webUrl);
}
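For orientation, the hunks above cover the normal setup path for a crawl: the three-argument constructor, storage-folder validation, and robots.txt-checked seeding. A minimal usage sketch, assuming the usual crawler4j wiring (RobotstxtConfig, the single-argument addSeed, and the start(Class, int) overload are taken from the rest of the library, not from this diff; MyCrawler stands in for any WebCrawler subclass, see the sketch after the WebCrawler.java section below):

    import edu.uci.ics.crawler4j.crawler.CrawlConfig;
    import edu.uci.ics.crawler4j.crawler.CrawlController;
    import edu.uci.ics.crawler4j.fetcher.PageFetcher;
    import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
    import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

    public class CrawlerMain {
        public static void main(String[] args) throws Exception {
            CrawlConfig config = new CrawlConfig();
            config.setCrawlStorageFolder("/tmp/crawl-root"); // created by the constructor above if missing
            PageFetcher pageFetcher = new PageFetcher(config);
            RobotstxtServer robotstxtServer = new RobotstxtServer(new RobotstxtConfig(), pageFetcher);

            CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
            controller.addSeed("http://www.ics.uci.edu/"); // seeds are checked against robots.txt, as shown above
            controller.start(MyCrawler.class, 7);          // 7 crawler threads; blocks until the crawl finishes
        }
    }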
src/main/java/edu/uci/ics/crawler4j/crawler/Page.java (12 changes: 6 additions & 6 deletions)

@@ -40,8 +40,8 @@ public class Page {
protected WebURL url;

/**
* Redirection flag
*/
* Redirection flag
*/
protected boolean redirect;

/**
@@ -50,8 +50,8 @@
protected String redirectedToUrl;

/**
* Status of the page
*/
* Status of the page
*/
protected int statusCode;

/**
@@ -78,8 +78,8 @@
protected String contentCharset;

/**
* Language of the Content.
*/
* Language of the Content.
*/
private String language;

/**
src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java (103 changes: 53 additions & 50 deletions)

@@ -17,6 +17,13 @@

package edu.uci.ics.crawler4j.crawler;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import org.apache.http.HttpStatus;
import org.apache.http.impl.EnglishReasonPhraseCatalog;

import edu.uci.ics.crawler4j.crawler.exceptions.ContentFetchException;
import edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException;
import edu.uci.ics.crawler4j.crawler.exceptions.ParseException;
@@ -30,18 +37,10 @@
import edu.uci.ics.crawler4j.parser.Parser;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.WebURL;

import org.apache.http.HttpStatus;

import org.apache.http.impl.EnglishReasonPhraseCatalog;
import uk.org.lidalia.slf4jext.Level;
import uk.org.lidalia.slf4jext.Logger;
import uk.org.lidalia.slf4jext.LoggerFactory;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

/**
* WebCrawler class in the Runnable class that is executed by each crawler thread.
*
@@ -168,26 +167,26 @@ protected void handlePageStatusCode(WebURL webUrl, int statusCode, String status
// Sub-classed can override this to add their custom functionality
}

/**
* This function is called before processing of the page's URL
* It can be overridden by subclasses for tweaking of the url before processing it.
* For example, http://abc.com/def?a=123 - http://abc.com/def
*
* @param curURL current URL which can be tweaked before processing
* @return tweaked WebURL
*/
protected WebURL handleUrlBeforeProcess(WebURL curURL) {
return curURL;
}
/**
* This function is called before processing of the page's URL
* It can be overridden by subclasses for tweaking of the url before processing it.
* For example, http://abc.com/def?a=123 - http://abc.com/def
*
* @param curURL current URL which can be tweaked before processing
* @return tweaked WebURL
*/
protected WebURL handleUrlBeforeProcess(WebURL curURL) {
return curURL;
}

/**
* This function is called if the content of a url is bigger than allowed size.
*
* @param urlStr - The URL which it's content is bigger than allowed size
*/
protected void onPageBiggerThanMaxSize(String urlStr, long pageSize) {
logger.warn("Skipping a URL: {} which was bigger ( {} ) than max allowed size", urlStr, pageSize);
}
/**
* This function is called if the content of a url is bigger than allowed size.
*
* @param urlStr - The URL which it's content is bigger than allowed size
*/
protected void onPageBiggerThanMaxSize(String urlStr, long pageSize) {
logger.warn("Skipping a URL: {} which was bigger ( {} ) than max allowed size", urlStr, pageSize);
}

/**
* This function is called if the crawler encountered an unexpected http status code ( a status code other than 3xx)
@@ -197,11 +196,11 @@ protected void onPageBiggerThanMaxSize(String urlStr, long pageSize) {
* @param contentType Type of Content
* @param description Error Description
*/
protected void onUnexpectedStatusCode(String urlStr, int statusCode, String contentType, String description) {
logger.warn("Skipping URL: {}, StatusCode: {}, {}, {}", urlStr, statusCode, contentType, description);
// Do nothing by default (except basic logging)
// Sub-classed can override this to add their custom functionality
}
protected void onUnexpectedStatusCode(String urlStr, int statusCode, String contentType, String description) {
logger.warn("Skipping URL: {}, StatusCode: {}, {}, {}", urlStr, statusCode, contentType, description);
// Do nothing by default (except basic logging)
// Sub-classed can override this to add their custom functionality
}

/**
* This function is called if the content of a url could not be fetched.
@@ -271,18 +270,18 @@ public void run() {
}

/**
* Classes that extends WebCrawler should overwrite this function to tell the
* crawler whether the given url should be crawled or not. The following
* implementation indicates that all urls should be included in the crawl.
*
* @param url
* the url which we are interested to know whether it should be
* included in the crawl or not.
* @param page
* Page context from which this URL was scraped
* @return if the url should be included in the crawl it returns true,
* otherwise false is returned.
*/
* Classes that extends WebCrawler should overwrite this function to tell the
* crawler whether the given url should be crawled or not. The following
* implementation indicates that all urls should be included in the crawl.
*
* @param url
* the url which we are interested to know whether it should be
* included in the crawl or not.
* @param page
* Page context from which this URL was scraped
* @return if the url should be included in the crawl it returns true,
* otherwise false is returned.
*/
public boolean shouldVisit(Page page, WebURL url) {
return true;
}
@@ -308,15 +307,17 @@ private void processPage(WebURL curURL) {

fetchResult = pageFetcher.fetchPage(curURL);
int statusCode = fetchResult.getStatusCode();
handlePageStatusCode(curURL, statusCode, EnglishReasonPhraseCatalog.INSTANCE.getReason(statusCode, Locale.ENGLISH)); // Finds the status reason for all known statuses
handlePageStatusCode(curURL, statusCode, EnglishReasonPhraseCatalog.INSTANCE
.getReason(statusCode, Locale.ENGLISH)); // Finds the status reason for all known statuses

Page page = new Page(curURL);
page.setFetchResponseHeaders(fetchResult.getResponseHeaders());
page.setStatusCode(statusCode);
if (statusCode != HttpStatus.SC_OK) { // Not 200
if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY
|| statusCode == HttpStatus.SC_MULTIPLE_CHOICES || statusCode == HttpStatus.SC_SEE_OTHER
|| statusCode == HttpStatus.SC_TEMPORARY_REDIRECT || statusCode == 308) { // is 3xx todo follow https://issues.apache.org/jira/browse/HTTPCORE-389
if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY ||
statusCode == HttpStatus.SC_MULTIPLE_CHOICES || statusCode == HttpStatus.SC_SEE_OTHER ||
statusCode == HttpStatus.SC_TEMPORARY_REDIRECT ||
statusCode == 308) { // is 3xx todo follow https://issues.apache.org/jira/browse/HTTPCORE-389

page.setRedirect(true);
if (myController.getConfig().isFollowRedirects()) {
Expand Down Expand Up @@ -350,8 +351,10 @@ private void processPage(WebURL curURL) {
}
}
} else { // All other http codes other than 3xx & 200
String description = EnglishReasonPhraseCatalog.INSTANCE.getReason(fetchResult.getStatusCode(), Locale.ENGLISH); // Finds the status reason for all known statuses
String contentType = fetchResult.getEntity() == null ? "" : fetchResult.getEntity().getContentType().getValue();
String description = EnglishReasonPhraseCatalog.INSTANCE
.getReason(fetchResult.getStatusCode(), Locale.ENGLISH); // Finds the status reason for all known statuses
String contentType =
fetchResult.getEntity() == null ? "" : fetchResult.getEntity().getContentType().getValue();
onUnexpectedStatusCode(curURL.getURL(), fetchResult.getStatusCode(), contentType, description);
}

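The methods touched above (shouldVisit, handleUrlBeforeProcess, and the on* callbacks) are the documented extension points of WebCrawler. A sketch of a subclass exercising two of them; the single-domain filter and the query-string stripping are illustrative choices, not part of this commit:

    import java.util.Locale;

    import edu.uci.ics.crawler4j.crawler.Page;
    import edu.uci.ics.crawler4j.crawler.WebCrawler;
    import edu.uci.ics.crawler4j.url.WebURL;

    public class MyCrawler extends WebCrawler {

        @Override
        public boolean shouldVisit(Page page, WebURL url) {
            // Stay within one site; example.com is a placeholder domain.
            return url.getURL().toLowerCase(Locale.ENGLISH).startsWith("http://www.example.com/");
        }

        @Override
        protected WebURL handleUrlBeforeProcess(WebURL curURL) {
            // Drop the query string, per the Javadoc example above
            // (http://abc.com/def?a=123 becomes http://abc.com/def).
            int queryStart = curURL.getURL().indexOf('?');
            if (queryStart >= 0) {
                curURL.setURL(curURL.getURL().substring(0, queryStart));
            }
            return curURL;
        }
    }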
src/main/java/edu/uci/ics/crawler4j/crawler/authentication/AuthInfo.java

@@ -1,20 +1,23 @@
package edu.uci.ics.crawler4j.crawler.authentication;

import javax.swing.text.html.FormSubmitEvent.MethodType;
import java.net.MalformedURLException;
import java.net.URL;

import javax.swing.text.html.FormSubmitEvent.MethodType;

/**
* Created by Avi Hayun on 11/23/2014.
*
* Abstract class containing authentication information needed to login into a user/password protected site<br>
* This class should be extended by specific authentication types like form authentication and basic authentication etc<br>
* This class should be extended by specific authentication types like form authentication and basic authentication
* etc<br>
* <br>
* This class contains all of the mutual authentication data for all authentication types
*/
public abstract class AuthInfo {
public enum AuthenticationType {
BASIC_AUTHENTICATION, FORM_AUTHENTICATION
BASIC_AUTHENTICATION,
FORM_AUTHENTICATION
}

protected AuthenticationType authenticationType;
@@ -41,7 +44,8 @@ public AuthInfo() {
*
* @throws MalformedURLException Make sure your URL is valid
*/
protected AuthInfo(AuthenticationType authenticationType, MethodType httpMethod, String loginUrl, String username, String password) throws MalformedURLException {
protected AuthInfo(AuthenticationType authenticationType, MethodType httpMethod, String loginUrl, String username,
String password) throws MalformedURLException {
this.authenticationType = authenticationType;
this.httpMethod = httpMethod;
URL url = new URL(loginUrl);
src/main/java/edu/uci/ics/crawler4j/crawler/authentication/BasicAuthInfo.java

@@ -1,12 +1,14 @@
package edu.uci.ics.crawler4j.crawler.authentication;

import javax.swing.text.html.FormSubmitEvent.MethodType;
import java.net.MalformedURLException;

import javax.swing.text.html.FormSubmitEvent.MethodType;

/**
* Created by Avi Hayun on 11/25/2014.
*
* BasicAuthInfo contains the authentication information needed for BASIC authentication (extending AuthInfo which has all common auth info in it)
* BasicAuthInfo contains the authentication information needed for BASIC authentication (extending AuthInfo which
* has all common auth info in it)
*
* BASIC authentication in PHP:
* <ul>
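The hunk above only shows the class Javadoc. Assuming the conventional three-argument constructor (username, password, login URL) and an addAuthInfo accessor on CrawlConfig for the authInfos list shown earlier, neither of which appears in this diff, wiring BASIC credentials in would look roughly like:

    // Sketch under assumptions: verify the constructor and addAuthInfo(...) in your checkout.
    // The constructor may throw MalformedURLException for an invalid login URL.
    AuthInfo basicAuth = new BasicAuthInfo("user", "secret", "http://example.com/protected/");
    config.addAuthInfo(basicAuth); // config is the CrawlConfig passed to CrawlController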
src/main/java/edu/uci/ics/crawler4j/crawler/authentication/FormAuthInfo.java

@@ -1,13 +1,16 @@
package edu.uci.ics.crawler4j.crawler.authentication;

import javax.swing.text.html.FormSubmitEvent.MethodType;
import java.net.MalformedURLException;

import javax.swing.text.html.FormSubmitEvent.MethodType;

/**
* Created by Avi Hayun on 11/25/2014.
*
* FormAuthInfo contains the authentication information needed for FORM authentication (extending AuthInfo which has all common auth info in it)
* Basically, this is the most common authentication, where you will get to a site and you will need to enter a username and password into an HTML form
* FormAuthInfo contains the authentication information needed for FORM authentication (extending AuthInfo which has
* all common auth info in it)
* Basically, this is the most common authentication, where you will get to a site and you will need to enter a
* username and password into an HTML form
*/
public class FormAuthInfo extends AuthInfo {

@@ -25,7 +28,8 @@ public class FormAuthInfo extends AuthInfo {
*
* @throws MalformedURLException Make sure your URL is valid
*/
public FormAuthInfo(String username, String password, String loginUrl, String usernameFormStr, String passwordFormStr) throws MalformedURLException {
public FormAuthInfo(String username, String password, String loginUrl, String usernameFormStr, String passwordFormStr)
throws MalformedURLException {
super(AuthenticationType.FORM_AUTHENTICATION, MethodType.POST, loginUrl, username, password);

this.usernameFormStr = usernameFormStr;
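The wrapped constructor above gives the full parameter list for form-based login. A usage sketch with placeholder values (the two form-field names must match the name attributes of the login form's HTML inputs; addAuthInfo(...) is assumed, as noted above):

    // Throws MalformedURLException if the login URL is invalid.
    FormAuthInfo formAuth = new FormAuthInfo(
        "myUser", "myPass",             // credentials posted to the form
        "http://example.com/login.php", // login form URL (placeholder)
        "username", "password");        // HTML input names for the user and password fields
    config.addAuthInfo(formAuth);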
src/main/java/edu/uci/ics/crawler4j/crawler/exceptions/RedirectException.java

@@ -5,7 +5,8 @@
/**
* Created by Avi Hayun on 12/8/2014.
*
* Occurs when the crawler encounters a Redirect problem, like redirecting to a visited-already page, or redirecting to nothing
* Occurs when the crawler encounters a Redirect problem, like redirecting to a visited-already page, or redirecting
* to nothing
*/
public class RedirectException extends Exception {
public Level level;
(Diff truncated: the remaining changed files are not shown.)
