From 6ffe56b42c2b045c856455ca21302bbce5e38b0c Mon Sep 17 00:00:00 2001
From: Yasser Ganjisaffar
Date: Tue, 3 Feb 2015 22:27:09 -0800
Subject: [PATCH] [style] re-format code

---
 .../ics/crawler4j/crawler/CrawlConfig.java | 8 +-
 .../crawler4j/crawler/CrawlController.java | 30 +++--
 .../edu/uci/ics/crawler4j/crawler/Page.java | 12 +-
 .../uci/ics/crawler4j/crawler/WebCrawler.java | 103 +++++++++---------
 .../crawler/authentication/AuthInfo.java | 12 +-
 .../crawler/authentication/BasicAuthInfo.java | 6 +-
 .../crawler/authentication/FormAuthInfo.java | 12 +-
 .../crawler/exceptions/RedirectException.java | 3 +-
 .../crawler4j/fetcher/PageFetchResult.java | 8 +-
 .../ics/crawler4j/fetcher/PageFetcher.java | 84 +++++++-------
 .../uci/ics/crawler4j/frontier/Counters.java | 24 ++--
 .../ics/crawler4j/frontier/DocIDServer.java | 12 +-
 .../uci/ics/crawler4j/frontier/Frontier.java | 10 +-
 .../crawler4j/frontier/InProcessPagesDB.java | 9 +-
 .../ics/crawler4j/frontier/WorkQueues.java | 19 +++-
 .../ics/crawler4j/parser/BinaryParseData.java | 12 +-
 .../crawler4j/parser/HtmlContentHandler.java | 11 +-
 .../ics/crawler4j/parser/HtmlParseData.java | 4 +-
 .../parser/NotAllowedContentException.java | 6 +-
 .../uci/ics/crawler4j/parser/ParseData.java | 4 +-
 .../edu/uci/ics/crawler4j/parser/Parser.java | 12 +-
 .../ics/crawler4j/parser/TextParseData.java | 4 +-
 .../crawler4j/robotstxt/RobotstxtServer.java | 21 ++--
 .../edu/uci/ics/crawler4j/url/TLDList.java | 14 ++-
 .../ics/crawler4j/url/URLCanonicalizer.java | 22 ++--
 .../uci/ics/crawler4j/url/UrlResolver.java | 13 +--
 .../edu/uci/ics/crawler4j/url/WebURL.java | 2 +-
 .../java/edu/uci/ics/crawler4j/util/IO.java | 7 +-
 .../java/edu/uci/ics/crawler4j/util/Net.java | 25 ++---
 .../java/edu/uci/ics/crawler4j/util/Util.java | 25 +++--
 .../examples/basic/BasicCrawlController.java | 6 +-
 .../examples/basic/BasicCrawler.java | 20 ++--
 .../imagecrawler/ImageCrawlController.java | 9 +-
 .../examples/imagecrawler/ImageCrawler.java | 9 +-
 .../examples/localdata/Downloader.java | 9 +-
 .../LocalDataCollectorController.java | 75 ++++++-------
 .../localdata/LocalDataCollectorCrawler.java | 22 ++--
 .../examples/multiple/BasicCrawler.java | 10 +-
 .../multiple/MultipleCrawlerController.java | 9 +-
 .../examples/shutdown/BasicCrawler.java | 16 +--
 .../shutdown/ControllerWithShutdown.java | 5 +-
 .../StatusHandlerCrawlController.java | 5 +-
 .../statushandler/StatusHandlerCrawler.java | 72 ++++++------
 .../uci/ics/crawler4j/tests/TLDListTest.java | 9 +-
 .../crawler4j/tests/URLCanonicalizerTest.java | 43 ++++----
 .../uci/ics/crawler4j/tests/WebURLTest.java | 3 +-
 46 files changed, 467 insertions(+), 389 deletions(-)

diff --git a/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlConfig.java b/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlConfig.java
index 941c40a9c..3d79c97e4 100644
--- a/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlConfig.java
+++ b/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlConfig.java
@@ -17,11 +17,11 @@
 
 package edu.uci.ics.crawler4j.crawler;
 
-import edu.uci.ics.crawler4j.crawler.authentication.AuthInfo;
-
 import java.util.ArrayList;
 import java.util.List;
 
+import edu.uci.ics.crawler4j.crawler.authentication.AuthInfo;
+
 public class CrawlConfig {
 
   /**
@@ -133,8 +133,8 @@ public class CrawlConfig {
   private String proxyPassword = null;
 
   /**
-     * List of possible authentications needed by crawler
-     */
+   * List of possible authentications needed by crawler
+   */
   private List<AuthInfo> authInfos;
 
   public CrawlConfig() {

diff --git a/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java b/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java
index 0cabbab59..34a90becd 100644
--- a/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java
+++ b/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java
@@ -17,8 +17,16 @@
 
 package edu.uci.ics.crawler4j.crawler;
 
+import java.io.File;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import com.sleepycat.je.Environment;
 import com.sleepycat.je.EnvironmentConfig;
+
 import edu.uci.ics.crawler4j.fetcher.PageFetcher;
 import edu.uci.ics.crawler4j.frontier.DocIDServer;
 import edu.uci.ics.crawler4j.frontier.Frontier;
@@ -26,12 +34,6 @@ import edu.uci.ics.crawler4j.url.URLCanonicalizer;
 import edu.uci.ics.crawler4j.url.WebURL;
 import edu.uci.ics.crawler4j.util.IO;
 
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.File;
-import java.util.ArrayList;
-import java.util.List;
-
 /**
  * The controller that manages a crawling session. This class creates the
@@ -74,14 +76,16 @@ public class CrawlController extends Configurable {
   protected final Object waitingLock = new Object();
   protected final Environment env;
 
-  public CrawlController(CrawlConfig config, PageFetcher pageFetcher, RobotstxtServer robotstxtServer) throws Exception {
+  public CrawlController(CrawlConfig config, PageFetcher pageFetcher, RobotstxtServer robotstxtServer)
+      throws Exception {
     super(config);
     config.validate();
     File folder = new File(config.getCrawlStorageFolder());
     if (!folder.exists()) {
       if (!folder.mkdirs()) {
-        throw new Exception("couldn't create the storage folder: " + folder.getAbsolutePath() + " does it already exist ?");
+        throw new Exception(
+            "couldn't create the storage folder: " + folder.getAbsolutePath() + " does it already exist ?");
       } else {
         logger.debug("Created folder: " + folder.getAbsolutePath());
       }
@@ -214,7 +218,9 @@ public void run() {
             if (queueLength > 0) {
               continue;
             }
-            logger.info("No thread is working and no more URLs are in queue waiting for another 10 seconds to make sure...");
+            logger.info(
+                "No thread is working and no more URLs are in queue waiting for another 10 seconds to make " +
+                "sure...");
             sleep(10);
             queueLength = frontier.getQueueLength();
             if (queueLength > 0) {
@@ -282,7 +288,8 @@ public void waitUntilFinish() {
   }
 
   /**
-   * Once the crawling session finishes the controller collects the local data of the crawler threads and stores them in a List.
+   * Once the crawling session finishes the controller collects the local data of the crawler threads and stores them
+   * in a List.
    * This function returns the reference to this list.
   *
   * @return List of Objects which are your local data
@@ -354,7 +361,8 @@ public void addSeed(String pageUrl, int docId) {
     webUrl.setDocid(docId);
     webUrl.setDepth((short) 0);
     if (!robotstxtServer.allows(webUrl)) {
-      logger.warn("Robots.txt does not allow this seed: {}", pageUrl); // using the WARN level here, as the user specifically asked to add this seed
+      logger.warn("Robots.txt does not allow this seed: {}",
+          pageUrl); // using the WARN level here, as the user specifically asked to add this seed
     } else {
       frontier.schedule(webUrl);
     }

diff --git a/src/main/java/edu/uci/ics/crawler4j/crawler/Page.java b/src/main/java/edu/uci/ics/crawler4j/crawler/Page.java
index 39148f5a4..a926ad33d 100644
--- a/src/main/java/edu/uci/ics/crawler4j/crawler/Page.java
+++ b/src/main/java/edu/uci/ics/crawler4j/crawler/Page.java
@@ -40,8 +40,8 @@ public class Page {
   protected WebURL url;
 
   /**
-     * Redirection flag
-     */
+   * Redirection flag
+   */
   protected boolean redirect;
 
   /**
@@ -50,8 +50,8 @@ public class Page {
   protected String redirectedToUrl;
 
   /**
-     * Status of the page
-     */
+   * Status of the page
+   */
   protected int statusCode;
 
   /**
@@ -78,8 +78,8 @@ public class Page {
   protected String contentCharset;
 
   /**
-     * Language of the Content.
-     */
+   * Language of the Content.
+   */
   private String language;
 
   /**

diff --git a/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java b/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
index ff227ff26..015197dac 100644
--- a/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
+++ b/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
@@ -17,6 +17,13 @@
 
 package edu.uci.ics.crawler4j.crawler;
 
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+
+import org.apache.http.HttpStatus;
+import org.apache.http.impl.EnglishReasonPhraseCatalog;
+
 import edu.uci.ics.crawler4j.crawler.exceptions.ContentFetchException;
 import edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException;
 import edu.uci.ics.crawler4j.crawler.exceptions.ParseException;
@@ -30,18 +37,10 @@ import edu.uci.ics.crawler4j.parser.Parser;
 import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
 import edu.uci.ics.crawler4j.url.WebURL;
-
-import org.apache.http.HttpStatus;
-
-import org.apache.http.impl.EnglishReasonPhraseCatalog;
 import uk.org.lidalia.slf4jext.Level;
 import uk.org.lidalia.slf4jext.Logger;
 import uk.org.lidalia.slf4jext.LoggerFactory;
 
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Locale;
-
 /**
  * WebCrawler class in the Runnable class that is executed by each crawler thread.
  *
@@ -168,26 +167,26 @@ protected void handlePageStatusCode(WebURL webUrl, int statusCode, String status
     // Sub-classed can override this to add their custom functionality
   }
 
-    /**
-     * This function is called before processing of the page's URL
-     * It can be overridden by subclasses for tweaking of the url before processing it.
+   * For example, http://abc.com/def?a=123 - http://abc.com/def
+   *
+   * @param curURL current URL which can be tweaked before processing
+   * @return tweaked WebURL
+   */
+  protected WebURL handleUrlBeforeProcess(WebURL curURL) {
+    return curURL;
+  }
 
-    /**
-     * This function is called if the content of a url is bigger than allowed size.
-     *
-     * @param urlStr - The URL which it's content is bigger than allowed size
-     */
-    protected void onPageBiggerThanMaxSize(String urlStr, long pageSize) {
-      logger.warn("Skipping a URL: {} which was bigger ( {} ) than max allowed size", urlStr, pageSize);
-    }
+  /**
+   * This function is called if the content of a url is bigger than allowed size.
+   *
+   * @param urlStr - The URL which it's content is bigger than allowed size
+   */
+  protected void onPageBiggerThanMaxSize(String urlStr, long pageSize) {
+    logger.warn("Skipping a URL: {} which was bigger ( {} ) than max allowed size", urlStr, pageSize);
+  }
 
   /**
   * This function is called if the crawler encountered an unexpected http status code ( a status code other than 3xx)
@@ -197,11 +196,11 @@ protected void onPageBiggerThanMaxSize(String urlStr, long pageSize) {
   * @param contentType Type of Content
   * @param description Error Description
   */
-    protected void onUnexpectedStatusCode(String urlStr, int statusCode, String contentType, String description) {
-      logger.warn("Skipping URL: {}, StatusCode: {}, {}, {}", urlStr, statusCode, contentType, description);
-      // Do nothing by default (except basic logging)
-      // Sub-classed can override this to add their custom functionality
-    }
+  protected void onUnexpectedStatusCode(String urlStr, int statusCode, String contentType, String description) {
+    logger.warn("Skipping URL: {}, StatusCode: {}, {}, {}", urlStr, statusCode, contentType, description);
+    // Do nothing by default (except basic logging)
+    // Sub-classed can override this to add their custom functionality
+  }
 
   /**
   * This function is called if the content of a url could not be fetched.
@@ -271,18 +270,18 @@ public void run() {
   }
 
   /**
-     * Classes that extends WebCrawler should overwrite this function to tell the
-     * crawler whether the given url should be crawled or not. The following
-     * implementation indicates that all urls should be included in the crawl.
-     *
-     * @param url
-     *            the url which we are interested to know whether it should be
-     *            included in the crawl or not.
-     * @param page
-     *            Page context from which this URL was scraped
-     * @return if the url should be included in the crawl it returns true,
-     *         otherwise false is returned.
+   */
   public boolean shouldVisit(Page page, WebURL url) {
     return true;
   }
@@ -308,15 +307,17 @@ private void processPage(WebURL curURL) {
       fetchResult = pageFetcher.fetchPage(curURL);
       int statusCode = fetchResult.getStatusCode();
-      handlePageStatusCode(curURL, statusCode, EnglishReasonPhraseCatalog.INSTANCE.getReason(statusCode, Locale.ENGLISH)); // Finds the status reason for all known statuses
+      handlePageStatusCode(curURL, statusCode, EnglishReasonPhraseCatalog.INSTANCE
+          .getReason(statusCode, Locale.ENGLISH)); // Finds the status reason for all known statuses
 
       Page page = new Page(curURL);
       page.setFetchResponseHeaders(fetchResult.getResponseHeaders());
       page.setStatusCode(statusCode);
       if (statusCode != HttpStatus.SC_OK) { // Not 200
-        if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY
-            || statusCode == HttpStatus.SC_MULTIPLE_CHOICES || statusCode == HttpStatus.SC_SEE_OTHER
-            || statusCode == HttpStatus.SC_TEMPORARY_REDIRECT || statusCode == 308) { // is 3xx todo follow https://issues.apache.org/jira/browse/HTTPCORE-389
+        if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY ||
+            statusCode == HttpStatus.SC_MULTIPLE_CHOICES || statusCode == HttpStatus.SC_SEE_OTHER ||
+            statusCode == HttpStatus.SC_TEMPORARY_REDIRECT ||
+            statusCode == 308) { // is 3xx todo follow https://issues.apache.org/jira/browse/HTTPCORE-389
           page.setRedirect(true);
 
           if (myController.getConfig().isFollowRedirects()) {
@@ -350,8 +351,10 @@ private void processPage(WebURL curURL) {
           }
         }
       } else { // All other http codes other than 3xx & 200
-        String description = EnglishReasonPhraseCatalog.INSTANCE.getReason(fetchResult.getStatusCode(), Locale.ENGLISH); // Finds the status reason for all known statuses
-        String contentType = fetchResult.getEntity() == null ? "" : fetchResult.getEntity().getContentType().getValue();
+        String description = EnglishReasonPhraseCatalog.INSTANCE
+            .getReason(fetchResult.getStatusCode(), Locale.ENGLISH); // Finds the status reason for all known statuses
+        String contentType =
+            fetchResult.getEntity() == null ? "" : fetchResult.getEntity().getContentType().getValue();
         onUnexpectedStatusCode(curURL.getURL(), fetchResult.getStatusCode(), contentType, description);
       }
 
diff --git a/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/AuthInfo.java b/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/AuthInfo.java
index b2ede2223..b03fbbef3 100644
--- a/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/AuthInfo.java
+++ b/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/AuthInfo.java
@@ -1,20 +1,23 @@
 package edu.uci.ics.crawler4j.crawler.authentication;
-import javax.swing.text.html.FormSubmitEvent.MethodType;
 import java.net.MalformedURLException;
 import java.net.URL;
 
+import javax.swing.text.html.FormSubmitEvent.MethodType;
+
 /**
  * Created by Avi Hayun on 11/23/2014.
  *
  * Abstract class containing authentication information needed to login into a user/password protected site<br>
- * This class should be extended by specific authentication types like form authentication and basic authentication etc<br>
+ * This class should be extended by specific authentication types like form authentication and basic authentication
+ * etc<br>
  *<br>
  * This class contains all of the mutual authentication data for all authentication types
  */
 public abstract class AuthInfo {
 
   public enum AuthenticationType {
-    BASIC_AUTHENTICATION, FORM_AUTHENTICATION
+    BASIC_AUTHENTICATION,
+    FORM_AUTHENTICATION
   }
 
   protected AuthenticationType authenticationType;
@@ -41,7 +44,8 @@ public AuthInfo() {
   *
   * @throws MalformedURLException Make sure your URL is valid
   */
-  protected AuthInfo(AuthenticationType authenticationType, MethodType httpMethod, String loginUrl, String username, String password) throws MalformedURLException {
+  protected AuthInfo(AuthenticationType authenticationType, MethodType httpMethod, String loginUrl, String username,
+      String password) throws MalformedURLException {
     this.authenticationType = authenticationType;
     this.httpMethod = httpMethod;
     URL url = new URL(loginUrl);

diff --git a/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/BasicAuthInfo.java b/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/BasicAuthInfo.java
index 6437f07ef..50b585047 100644
--- a/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/BasicAuthInfo.java
+++ b/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/BasicAuthInfo.java
@@ -1,12 +1,14 @@
 package edu.uci.ics.crawler4j.crawler.authentication;
-import javax.swing.text.html.FormSubmitEvent.MethodType;
 import java.net.MalformedURLException;
 
+import javax.swing.text.html.FormSubmitEvent.MethodType;
+
 /**
  * Created by Avi Hayun on 11/25/2014.
  *
- * BasicAuthInfo contains the authentication information needed for BASIC authentication (extending AuthInfo which has all common auth info in it)
+ * BasicAuthInfo contains the authentication information needed for BASIC authentication (extending AuthInfo which
+ * has all common auth info in it)
  *
  * BASIC authentication in PHP:
  *