Skip to content

Commit

Permalink
Merge pull request yasserg#106 from rdelaoc/UrlRequestParameterOrdering
Browse files Browse the repository at this point in the history
Previously, the canonicalization of request URLs alphabetized the parameters; this change preserves their original order.
  • Loading branch information
yasserg committed Dec 18, 2015
2 parents 48ce714 + b52e54c commit fdb3baa
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 15 deletions.
25 changes: 12 additions & 13 deletions src/main/java/edu/uci/ics/crawler4j/url/URLCanonicalizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,10 @@
import java.net.URL;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Objects;
import java.util.SortedMap;
import java.util.TreeMap;


/**
* See http://en.wikipedia.org/wiki/URL_normalization for a reference. Note: some
Expand Down Expand Up @@ -73,7 +72,7 @@ public static String getCanonicalURL(String href, String context) {

path = path.trim();

final SortedMap<String, String> params = createParameterMap(canonicalURL.getQuery());
final LinkedHashMap<String, String> params = createParameterMap(canonicalURL.getQuery());
final String queryString;
if ((params != null) && !params.isEmpty()) {
String canonicalParams = canonicalize(params);
Expand Down Expand Up @@ -105,17 +104,17 @@ public static String getCanonicalURL(String href, String context) {

/**
* Takes a query string, separates the constituent name-value pairs, and
* stores them in a SortedMap ordered by lexicographical order.
* stores them in a LinkedHashMap that preserves their original order.
*
* @return Null if there is no query string.
*/
private static SortedMap<String, String> createParameterMap(final String queryString) {
private static LinkedHashMap<String, String> createParameterMap(final String queryString) {
if ((queryString == null) || queryString.isEmpty()) {
return null;
}

final String[] pairs = queryString.split("&");
final Map<String, String> params = new HashMap<>(pairs.length);
final Map<String, String> params = new LinkedHashMap<>(pairs.length);

for (final String pair : pairs) {
if (pair.isEmpty()) {
Expand All @@ -136,23 +135,23 @@ private static SortedMap<String, String> createParameterMap(final String querySt
break;
}
}
return new TreeMap<>(params);
return new LinkedHashMap<>(params);
}

/**
* Canonicalize the query string.
*
* @param sortedParamMap
* Parameter name-value pairs in lexicographical order.
* @param paramsMap
* Parameter map whose name-value pairs are in order of insertion.
* @return Canonical form of query string.
*/
private static String canonicalize(final SortedMap<String, String> sortedParamMap) {
if ((sortedParamMap == null) || sortedParamMap.isEmpty()) {
private static String canonicalize(final LinkedHashMap<String, String> paramsMap) {
if ((paramsMap == null) || paramsMap.isEmpty()) {
return "";
}

final StringBuilder sb = new StringBuilder(100);
for (Map.Entry<String, String> pair : sortedParamMap.entrySet()) {
for (Map.Entry<String, String> pair : paramsMap.entrySet()) {
final String key = pair.getKey().toLowerCase();
if ("jsessionid".equals(key) || "phpsessid".equals(key) || "aspsessionid".equals(key)) {
continue;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,13 @@ public void testCanonizalier() {

assertEquals("http://foo.bar.com/?baz=1", URLCanonicalizer.getCanonicalURL("http://foo.bar.com?baz=1"));

assertEquals("http://www.example.com/index.html?a=b&c=d&e=f",
assertEquals("http://www.example.com/index.html?c=d&e=f&a=b",
URLCanonicalizer.getCanonicalURL("http://www.example.com/index.html?&c=d&e=f&a=b"));

assertEquals("http://www.example.com/index.html?q=a%20b",
URLCanonicalizer.getCanonicalURL("http://www.example.com/index.html?q=a b"));

assertEquals("http://www.example.com/search?height=100%&width=100%",
assertEquals("http://www.example.com/search?width=100%&height=100%",
URLCanonicalizer.getCanonicalURL("http://www.example.com/search?width=100%&height=100%"));

assertEquals("http://foo.bar/mydir/myfile?page=2",
Expand Down

0 comments on commit fdb3baa

Please sign in to comment.