Skip to content

Commit

Permalink
Added ParentUrl and Path to WebURL
Browse files Browse the repository at this point in the history
  • Loading branch information
yasserg committed Feb 4, 2012
1 parent 6eb2ef8 commit 2f6a89c
Show file tree
Hide file tree
Showing 6 changed files with 67 additions and 8 deletions.
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
<artifactId>crawler4j</artifactId>
<packaging>jar</packaging>
<name>crawler4j</name>
<version>3.2.3-SNAPSHOT</version>
<version>3.2.4-SNAPSHOT</version>
<description>Open Source Web Crawler for Java</description>
<url>http://code.google.com/p/crawler4j/</url>
<licenses>
Expand Down
2 changes: 2 additions & 0 deletions src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ private void processPage(WebURL curURL) {
WebURL webURL = new WebURL();
webURL.setURL(movedToUrl);
webURL.setParentDocid(curURL.getParentDocid());
webURL.setParentUrl(curURL.getParentUrl());
webURL.setDepth(curURL.getDepth());
webURL.setDocid(-1);
if (shouldVisit(webURL) && robotstxtServer.allows(webURL)) {
Expand Down Expand Up @@ -281,6 +282,7 @@ private void processPage(WebURL curURL) {
int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
for (WebURL webURL : htmlParseData.getOutgoingUrls()) {
webURL.setParentDocid(docid);
webURL.setParentUrl(curURL.getURL());
int newdocid = docIdServer.getDocId(webURL.getURL());
if (newdocid > 0) {
// This is not the first time that this Url is
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ public WebURL entryToObject(TupleInput input) {
webURL.setURL(input.readString());
webURL.setDocid(input.readInt());
webURL.setParentDocid(input.readInt());
webURL.setParentUrl(input.readString());
webURL.setDepth(input.readShort());
return webURL;
}
Expand All @@ -43,6 +44,7 @@ public void objectToEntry(WebURL url, TupleOutput output) {
output.writeString(url.getURL());
output.writeInt(url.getDocid());
output.writeInt(url.getParentDocid());
output.writeString(url.getParentUrl());
output.writeShort(url.getDepth());
}
}
61 changes: 57 additions & 4 deletions src/main/java/edu/uci/ics/crawler4j/url/WebURL.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,15 @@ public class WebURL implements Serializable {

private int docid;
private int parentDocid;
private String parentUrl;
private short depth;
private String domain;
private String subDomain;
private String path;

/**
 * Gets the unique document id that has been assigned to this Url.
 *
 * @return the document id currently stored on this object
 */
public int getDocid() {
    return this.docid;
}
Expand All @@ -48,10 +53,7 @@ public void setDocid(int docid) {
this.docid = docid;
}

/**
 * Gets the Url string held by this object.
 *
 * @return the Url string
 */
public String getURL() {
    return this.url;
}

@Override
public boolean equals(Object o) {
if (this == o) {
return true;
Expand All @@ -65,10 +67,18 @@ public boolean equals(Object o) {

}

/**
 * Represents this object as its Url string, i.e. the same value
 * that {@link #getURL()} returns.
 */
@Override
public String toString() {
    return this.url;
}

/**
 * Gets the Url string held by this object.
 *
 * @return the Url string
 */
public String getURL() {
    return this.url;
}

public void setURL(String url) {
this.url = url;

Expand All @@ -91,8 +101,18 @@ public void setURL(String url) {
subDomain += parts[i];
}
}
path = url.substring(domainEndIdx);
int pathEndIdx = path.indexOf('?');
if (pathEndIdx >= 0) {
path = path.substring(0, pathEndIdx);
}
}

/**
 * Gets the unique document id of the parent page, where the parent
 * page is the page in which this Url was first observed.
 *
 * @return the document id of the parent page
 */
public int getParentDocid() {
    return this.parentDocid;
}
Expand All @@ -101,6 +121,24 @@ public void setParentDocid(int parentDocid) {
this.parentDocid = parentDocid;
}

/**
 * Gets the Url of the parent page, where the parent page is the
 * page in which this Url was first observed.
 *
 * @return the parent page's Url string, or {@code null} if no
 *         parent Url has been set on this object
 */
public String getParentUrl() {
    return this.parentUrl;
}

/**
 * Sets the Url of the parent page, i.e. the page in which this Url
 * was observed. The value is stored as given; no validation or
 * parsing is performed.
 *
 * @param parentUrl the Url string of the parent page
 */
public void setParentUrl(String parentUrl) {
this.parentUrl = parentUrl;
}

/**
 * Gets the crawl depth at which this Url was first observed.
 * Seed Urls have depth 0, Urls extracted from seed pages have
 * depth 1, and so on.
 *
 * @return the crawl depth of this Url
 */
public short getDepth() {
    return this.depth;
}
Expand All @@ -109,6 +147,10 @@ public void setDepth(short depth) {
this.depth = depth;
}

/**
 * Gets the domain of this Url.
 * For 'http://www.example.com/sample.htm', the domain is 'example.com'.
 *
 * @return the domain part of this Url
 */
public String getDomain() {
    return this.domain;
}
Expand All @@ -117,4 +159,15 @@ public String getSubDomain() {
return subDomain;
}

/**
 * Gets the path of this Url: the part of the Url that follows the
 * domain, with any query string (everything from the first '?')
 * left out.
 * For 'http://www.example.com/sample.htm', the path is 'sample.htm'.
 * NOTE(review): the example value may actually carry a leading '/',
 * depending on how setURL locates the end of the domain - confirm.
 *
 * @return the path part of this Url
 */
public String getPath() {
    return this.path;
}

/**
 * Overrides the stored path of this Url. Only the path field is
 * changed; the Url string itself is left untouched. Note that
 * setURL also assigns the path, so calling setURL afterwards
 * replaces the value given here.
 *
 * @param path the path to store
 */
public void setPath(String path) {
this.path = path;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,16 @@ public void visit(Page page) {
int docid = page.getWebURL().getDocid();
String url = page.getWebURL().getURL();
String domain = page.getWebURL().getDomain();
String path = page.getWebURL().getPath();
String subDomain = page.getWebURL().getSubDomain();
int parentDocid = page.getWebURL().getParentDocid();
String parentUrl = page.getWebURL().getParentUrl();

System.out.println("Docid: " + docid);
System.out.println("URL: " + url);
System.out.println("Domain: '" + domain + "'");
System.out.println("Sub-domain: '" + subDomain + "'");
System.out.println("Docid of parent page: " + parentDocid);
System.out.println("Path: '" + path + "'");
System.out.println("Parent page: " + parentUrl);

if (page.getParseData() instanceof HtmlParseData) {
HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ protected void handlePageStatusCode(WebURL webUrl, int statusCode, String status
if (statusCode != HttpStatus.SC_OK) {

if (statusCode == HttpStatus.SC_NOT_FOUND) {
System.out.println("Broken link: " + webUrl.getURL() + ", this link was found in page with docid: " + webUrl.getParentDocid());
System.out.println("Broken link: " + webUrl.getURL() + ", this link was found in page: " + webUrl.getParentUrl());
} else {
System.out.println("Non success status for link: " + webUrl.getURL() + ", status code: " + statusCode + ", description: " + statusDescription);
}
Expand Down

0 comments on commit 2f6a89c

Please sign in to comment.