Added ParentUrl and Path to WebURL

ThiagoKrug · Feb 4, 2012 · 2f6a89c · 2f6a89c
1 parent 6eb2ef8
commit 2f6a89c
Show file tree

Hide file tree

Showing 6 changed files with 67 additions and 8 deletions.
diff --git a/pom.xml b/pom.xml
@@ -4,7 +4,7 @@
 	<artifactId>crawler4j</artifactId>
 	<packaging>jar</packaging>
 	<name>crawler4j</name>
-	<version>3.2.3-SNAPSHOT</version>
+	<version>3.2.4-SNAPSHOT</version>
 	<description>Open Source Web Crawler for Java</description>
 	<url>http://code.google.com/p/crawler4j/</url>
 	<licenses>

diff --git a/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java b/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
@@ -247,6 +247,7 @@ private void processPage(WebURL curURL) {
 							WebURL webURL = new WebURL();
 							webURL.setURL(movedToUrl);
 							webURL.setParentDocid(curURL.getParentDocid());
+							webURL.setParentUrl(curURL.getParentUrl());
 							webURL.setDepth(curURL.getDepth());
 							webURL.setDocid(-1);
 							if (shouldVisit(webURL) && robotstxtServer.allows(webURL)) {
@@ -281,6 +282,7 @@ private void processPage(WebURL curURL) {
 					int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
 					for (WebURL webURL : htmlParseData.getOutgoingUrls()) {
 						webURL.setParentDocid(docid);
+						webURL.setParentUrl(curURL.getURL());
 						int newdocid = docIdServer.getDocId(webURL.getURL());
 						if (newdocid > 0) {
 							// This is not the first time that this Url is

diff --git a/src/main/java/edu/uci/ics/crawler4j/frontier/WebURLTupleBinding.java b/src/main/java/edu/uci/ics/crawler4j/frontier/WebURLTupleBinding.java
@@ -34,6 +34,7 @@ public WebURL entryToObject(TupleInput input) {
 		webURL.setURL(input.readString());
 		webURL.setDocid(input.readInt());
 		webURL.setParentDocid(input.readInt());
+		webURL.setParentUrl(input.readString());
 		webURL.setDepth(input.readShort());
 		return webURL;
 	}
@@ -43,6 +44,7 @@ public void objectToEntry(WebURL url, TupleOutput output) {
 		output.writeString(url.getURL());
 		output.writeInt(url.getDocid());
 		output.writeInt(url.getParentDocid());
+		output.writeString(url.getParentUrl());
 		output.writeShort(url.getDepth());
 	}
 }
diff --git a/src/main/java/edu/uci/ics/crawler4j/url/WebURL.java b/src/main/java/edu/uci/ics/crawler4j/url/WebURL.java
@@ -36,10 +36,15 @@ public class WebURL implements Serializable {
 
 	private int docid;
 	private int parentDocid;
+	private String parentUrl;
 	private short depth;
 	private String domain;
 	private String subDomain;
+	private String path;
 
+	/**
+	 * Returns the unique document id assigned to this Url.
+	 */
 	public int getDocid() {
 		return docid;
 	}
@@ -48,10 +53,7 @@ public void setDocid(int docid) {
 		this.docid = docid;
 	}
 
-	public String getURL() {
-		return url;
-	}
-
+	@Override
 	public boolean equals(Object o) {
 		if (this == o) {
 			return true;
@@ -65,10 +67,18 @@ public boolean equals(Object o) {
 
 	}
 
+	@Override
 	public String toString() {
 		return url;
 	}
 
+	/**
+	 * Returns the Url string.
+	 */
+	public String getURL() {
+		return url;
+	}
+
 	public void setURL(String url) {
 		this.url = url;
 
@@ -91,8 +101,18 @@ public void setURL(String url) {
 				subDomain += parts[i];
 			}
 		}
+		path = url.substring(domainEndIdx);
+		int pathEndIdx = path.indexOf('?');
+		if (pathEndIdx >= 0) {
+			path = path.substring(0, pathEndIdx);
+		}
 	}
 
+	/**
+	 * Returns the unique document id of the parent page.
+	 * The parent page is the page in which the Url of this
+	 * page is first observed.
+	 */
 	public int getParentDocid() {
 		return parentDocid;
 	}
@@ -101,6 +121,24 @@ public void setParentDocid(int parentDocid) {
 		this.parentDocid = parentDocid;
 	}
 
+	/**
+	 * Returns the url of the parent page.
+	 * The parent page is the page in which the Url of this
+	 * page is first observed.
+	 */
+	public String getParentUrl() {
+		return parentUrl;
+	}
+
+	public void setParentUrl(String parentUrl) {
+		this.parentUrl = parentUrl;
+	}
+
+	/**
+	 * Returns the crawl depth at which this Url is first observed.
+	 * Seed Urls are at depth 0. Urls that are extracted from seed Urls
+	 * are at depth 1, etc.
+	 */
 	public short getDepth() {
 		return depth;
 	}
@@ -109,6 +147,10 @@ public void setDepth(short depth) {
 		this.depth = depth;
 	}
 
+	/**
+	 * Returns the domain of this Url.
+	 * For 'http://www.example.com/sample.htm', domain will be 'example.com'
+	 */
 	public String getDomain() {
 		return domain;
 	}
@@ -117,4 +159,15 @@ public String getSubDomain() {
 		return subDomain;
 	}
 
+	/**
+	 * Returns the path of this Url.
+	 * For 'http://www.example.com/sample.htm', path will be 'sample.htm'
+	 */
+	public String getPath() {
+		return path;
+	}
+
+	public void setPath(String path) {
+		this.path = path;
+	}
 }
diff --git a/src/test/java/edu/uci/ics/crawler4j/examples/basic/BasicCrawler.java b/src/test/java/edu/uci/ics/crawler4j/examples/basic/BasicCrawler.java
@@ -52,14 +52,16 @@ public void visit(Page page) {
 		int docid = page.getWebURL().getDocid();
 		String url = page.getWebURL().getURL();
 		String domain = page.getWebURL().getDomain();
+		String path = page.getWebURL().getPath();
 		String subDomain = page.getWebURL().getSubDomain();
-		int parentDocid = page.getWebURL().getParentDocid();
+		String parentUrl = page.getWebURL().getParentUrl();
 
 		System.out.println("Docid: " + docid);
 		System.out.println("URL: " + url);
 		System.out.println("Domain: '" + domain + "'");
 		System.out.println("Sub-domain: '" + subDomain + "'");
-		System.out.println("Docid of parent page: " + parentDocid);
+		System.out.println("Path: '" + path + "'");
+		System.out.println("Parent page: " + parentUrl);
 
 		if (page.getParseData() instanceof HtmlParseData) {
 			HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();

diff --git a/src/test/java/edu/uci/ics/crawler4j/examples/statushandler/StatusHandlerCrawler.java b/src/test/java/edu/uci/ics/crawler4j/examples/statushandler/StatusHandlerCrawler.java
@@ -59,7 +59,7 @@ protected void handlePageStatusCode(WebURL webUrl, int statusCode, String status
 		if (statusCode != HttpStatus.SC_OK) {
 
 			if (statusCode == HttpStatus.SC_NOT_FOUND) {
-				System.out.println("Broken link: " + webUrl.getURL() + ", this link was found in page with docid: " + webUrl.getParentDocid());
+				System.out.println("Broken link: " + webUrl.getURL() + ", this link was found in page: " + webUrl.getParentUrl());
 			} else {
 				System.out.println("Non success status for link: " + webUrl.getURL() + ", status code: " + statusCode + ", description: " + statusDescription);
 			}