Skip to content

Commit

Permalink
Update URLCanonicalizer
Browse files Browse the repository at this point in the history
  • Loading branch information
yasserg committed Jan 8, 2012
1 parent dbc9d3c commit ab57c9b
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 58 deletions.
87 changes: 31 additions & 56 deletions src/main/java/edu/uci/ics/crawler4j/url/URLCanonicalizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,13 @@
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;

/**
* See http://en.wikipedia.org/wiki/URL_normalization for a reference
* Note: some parts of the code are adapted from: http://stackoverflow.com/a/4057470/405418
* See http://en.wikipedia.org/wiki/URL_normalization for a reference. Note: some
* parts of the code are adapted from http://stackoverflow.com/a/4057470/405418
*
* @author Yasser Ganjisaffar <lastname at gmail dot com>
*/
Expand All @@ -48,13 +47,8 @@ public static String getCanonicalURL(String url) {

public static URL getCanonicalURL(String href, String context) {

/*
* Lower case the URL
*/
href = href.toLowerCase();

try {

URL canonicalURL;
if (context == null) {
canonicalURL = new URL(href);
Expand Down Expand Up @@ -107,28 +101,6 @@ public static URL getCanonicalURL(String href, String context) {
queryString = "";
}

/*
* Fix '?' and '&' problems
*/
/*int index = path.lastIndexOf('?');
if (index > 0) {
if (index == (path.length() - 1)) {
// '?' is the last char. Drop it.
path = path.substring(0, path.length() - 1);
} else if (path.charAt(index + 1) == '&') {
// Next char is '&'. Strip it.
if (path.length() == (index + 2)) {
// Then url ends with '?&'. Strip them.
path = path.substring(0, path.length() - 2);
} else {
// The '&' is redundant. Strip it.
path = path.substring(0, index + 1) + path.substring(index + 2);
}
} else if (path.charAt(path.length() - 1) == '&') {
path = path.substring(0, path.length() - 1);
}
}*/

/*
* Add starting slash if needed
*/
Expand All @@ -144,7 +116,14 @@ public static URL getCanonicalURL(String href, String context) {
port = -1;
}

return new URL(canonicalURL.getProtocol(), canonicalURL.getHost(), port, path + queryString);
/*
* Lowercasing protocol and host
*/
String protocol = canonicalURL.getProtocol().toLowerCase();
String host = canonicalURL.getHost().toLowerCase();
String pathAndQueryString = normalizePath(path) + queryString;

return new URL(protocol, host, port, pathAndQueryString);

} catch (MalformedURLException ex) {
return null;
Expand All @@ -168,32 +147,23 @@ private static SortedMap<String, String> createParameterMap(final String querySt
final Map<String, String> params = new HashMap<String, String>(pairs.length);

for (final String pair : pairs) {
if (pair.length() < 1) {
if (pair.length() == 0) {
continue;
}

String[] tokens = pair.split("=", 2);
for (int j = 0; j < tokens.length; j++) {
try {
tokens[j] = URLDecoder.decode(tokens[j], "UTF-8");
} catch (UnsupportedEncodingException ex) {
ex.printStackTrace();
}
}
switch (tokens.length) {
case 1: {
case 1:
if (pair.charAt(0) == '=') {
params.put("", tokens[0]);
} else {
params.put(tokens[0], "");
}
break;
}
case 2: {
case 2:
params.put(tokens[0], tokens[1]);
break;
}
}
}
return new TreeMap<String, String>(params);
}
Expand All @@ -210,19 +180,17 @@ private static String canonicalize(final SortedMap<String, String> sortedParamMa
return "";
}

final StringBuffer sb = new StringBuffer(350);
final Iterator<Map.Entry<String, String>> iter = sortedParamMap.entrySet().iterator();

while (iter.hasNext()) {
final Map.Entry<String, String> pair = iter.next();
sb.append(percentEncodeRfc3986(pair.getKey()));
sb.append('=');
sb.append(percentEncodeRfc3986(pair.getValue()));
if (iter.hasNext()) {
final StringBuffer sb = new StringBuffer(100);
for (Map.Entry<String, String> pair : sortedParamMap.entrySet()) {
if (sb.length() > 0) {
sb.append('&');
}
sb.append(percentEncodeRfc3986(pair.getKey()));
if (!pair.getValue().isEmpty()) {
sb.append('=');
sb.append(percentEncodeRfc3986(pair.getValue()));
}
}

return sb.toString();
}

Expand All @@ -235,11 +203,18 @@ private static String canonicalize(final SortedMap<String, String> sortedParamMa
* Decoded string.
* @return Encoded string per RFC 3986.
*/
private static String percentEncodeRfc3986(final String string) {
private static String percentEncodeRfc3986(String string) {
try {
return URLEncoder.encode(string, "UTF-8").replace("+", "%20").replace("*", "%2A").replace("%7E", "~");
string = string.replace("+", "%2B");
string = URLDecoder.decode(string, "UTF-8");
string = URLEncoder.encode(string, "UTF-8");
return string.replace("+", "%20").replace("*", "%2A").replace("%7E", "~");
} catch (UnsupportedEncodingException e) {
return string;
}
}

/**
 * Normalizes the path component of a URL.
 *
 * Decodes the percent-encoded tilde to a literal '~' and percent-encodes
 * literal spaces as "%20". Hex digits in a percent-encoding triplet are
 * case-insensitive, so both "%7E" and "%7e" are decoded; otherwise two
 * equivalent URLs would yield different canonical forms.
 *
 * @param path
 *            Path to normalize; must not be null.
 * @return Normalized path.
 */
private static String normalizePath(final String path) {
    return path.replace("%7E", "~").replace("%7e", "~").replace(" ", "%20");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,21 @@
public class URLCanonicalizerTest extends TestCase {

public void testCanonizalier() {

assertEquals("http://www.example.com/display?category=foo%2Fbar%2Bbaz",
URLCanonicalizer.getCanonicalURL("http://www.example.com/display?category=foo/bar+baz"));

assertEquals("http://hostname.com/", URLCanonicalizer.getCanonicalURL("http://hostname.com"));
assertEquals("http://www.example.com/?q=a%2Bb", URLCanonicalizer.getCanonicalURL("http://www.example.com/?q=a+b"));

assertEquals("http://www.example.com/display?category=foo%2Fbar%2Bbaz",
URLCanonicalizer.getCanonicalURL("http://www.example.com/display?category=foo%2Fbar%2Bbaz"));

assertEquals("http://somedomain.com/uploads/1/0/2/5/10259653/6199347.jpg?1325154037",
URLCanonicalizer
.getCanonicalURL("http://somedomain.com/uploads/1/0/2/5/10259653/6199347.jpg?1325154037"));

assertEquals("http://hostname.com/", URLCanonicalizer.getCanonicalURL("http://hostname.com"));

assertEquals("http://hostname.com/", URLCanonicalizer.getCanonicalURL("http://HOSTNAME.com"));

assertEquals("http://www.example.com/index.html",
Expand All @@ -25,7 +37,10 @@ public void testCanonizalier() {
assertEquals("http://www.example.com/index.html?name=test&rame=base",
URLCanonicalizer.getCanonicalURL("http://www.example.com/index.html?name=test&rame=base#123"));

assertEquals("http://www.example.com/a/b/index.html",
assertEquals("http://www.example.com/~username/",
URLCanonicalizer.getCanonicalURL("http://www.example.com/%7Eusername/"));

assertEquals("http://www.example.com/A/B/index.html",
URLCanonicalizer.getCanonicalURL("http://www.example.com//A//B/index.html"));

assertEquals("http://www.example.com/index.html?x=y",
Expand All @@ -39,7 +54,11 @@ public void testCanonizalier() {

assertEquals("http://foo.bar.com/?baz=1", URLCanonicalizer.getCanonicalURL("http://foo.bar.com?baz=1"));

assertEquals("http://www.example.com/index.html?a=b&c=d&e=f",
URLCanonicalizer.getCanonicalURL("http://www.example.com/index.html?&c=d&e=f&a=b"));

assertEquals("http://www.example.com/index.html?q=a%20b",
URLCanonicalizer.getCanonicalURL("http://www.example.com/index.html?q=a b"));

}
}

0 comments on commit ab57c9b

Please sign in to comment.