Skip to content

Commit

Permalink
Merge pull request yasserg#272 from yasserg/pr/270
Browse files: browse the repository at this point in the history
Fix href characters encoding issue
  • Loading branch information
yasserg authored Dec 20, 2017
2 parents 8f16a21 + b7f3582 commit 312408a
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 22 deletions.
2 changes: 1 addition & 1 deletion crawler4j/checkstyle.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

<module name="TreeWalker">
<module name="LineLength">
<property name="max" value="100"/>
<property name="max" value="120"/>
<property name="ignorePattern" value="^(package .*;)|(import .*;)|(\{@link .*)$"/>
</module>
<module name="FileContentsHolder"/>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Set;

Expand Down Expand Up @@ -132,7 +134,10 @@ public void parse(Page page, String contextURL)
String hrefLoweredCase = href.trim().toLowerCase();
if (!hrefLoweredCase.contains("javascript:") &&
!hrefLoweredCase.contains("mailto:") && !hrefLoweredCase.contains("@")) {
String url = URLCanonicalizer.getCanonicalURL(href, contextURL);
// Prefer page's content charset to encode href url
Charset hrefCharset = ((page.getContentCharset() == null) || page.getContentCharset().isEmpty()) ?
StandardCharsets.UTF_8 : Charset.forName(page.getContentCharset());
String url = URLCanonicalizer.getCanonicalURL(href, contextURL, hrefCharset);
if (url != null) {
WebURL webURL = new WebURL();
webURL.setURL(url);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
import java.net.URL;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Objects;
Expand All @@ -40,6 +42,10 @@ public static String getCanonicalURL(String url) {
}

/**
 * Convenience overload of the charset-aware {@code getCanonicalURL}: forwards
 * {@code href} and {@code context} unchanged and supplies
 * {@link StandardCharsets#UTF_8} as the charset.
 *
 * @param href
 *            URL string to canonicalize, passed through as-is.
 * @param context
 *            Context URL string, passed through as-is.
 * @return Whatever the three-argument overload returns for these arguments.
 */
public static String getCanonicalURL(String href, String context) {
return getCanonicalURL(href, context, StandardCharsets.UTF_8);
}

public static String getCanonicalURL(String href, String context, Charset charset) {

try {
URL canonicalURL =
Expand Down Expand Up @@ -75,7 +81,7 @@ public static String getCanonicalURL(String href, String context) {
Map<String, String> params = createParameterMap(canonicalURL.getQuery());
final String queryString;
if ((params != null) && !params.isEmpty()) {
String canonicalParams = canonicalize(params);
String canonicalParams = canonicalize(params, charset);
queryString = (canonicalParams.isEmpty() ? "" : ("?" + canonicalParams));
} else {
queryString = "";
Expand Down Expand Up @@ -143,9 +149,11 @@ private static Map<String, String> createParameterMap(String queryString) {
*
* @param paramsMap
* Parameter map whose name-value pairs are in order of insertion.
* @param charset
* Charset of html page
* @return Canonical form of query string.
*/
private static String canonicalize(Map<String, String> paramsMap) {
private static String canonicalize(Map<String, String> paramsMap, Charset charset) {
if ((paramsMap == null) || paramsMap.isEmpty()) {
return "";
}
Expand All @@ -159,36 +167,27 @@ private static String canonicalize(Map<String, String> paramsMap) {
if (sb.length() > 0) {
sb.append('&');
}
sb.append(percentEncodeRfc3986(pair.getKey()));
sb.append(percentEncodeRfc3986(pair.getKey(), charset));
if (!pair.getValue().isEmpty()) {
sb.append('=');
sb.append(percentEncodeRfc3986(pair.getValue()));
sb.append(percentEncodeRfc3986(pair.getValue(), charset));
}
}
return sb.toString();
}

/**
* Percent-encode values according to RFC 3986. The built-in Java
* URLEncoder does not encode according to the RFC, so we make the extra
* replacements.
*
* @param string
* Decoded string.
* @return Encoded string per RFC 3986.
*/
private static String percentEncodeRfc3986(String string) {
/**
 * Applies two literal substitutions to a path string: every {@code "%7E"}
 * becomes {@code "~"}, then every space character becomes {@code "%20"}.
 * The match is case-sensitive, so a lower-case {@code "%7e"} is left as-is.
 *
 * @param path
 *            Path string to normalize; must not be null.
 * @return The path with both substitutions applied.
 */
private static String normalizePath(final String path) {
return path.replace("%7E", "~").replace(" ", "%20");
}

private static String percentEncodeRfc3986(String string, Charset charset) {
try {
string = string.replace("+", "%2B");
string = URLDecoder.decode(string, "UTF-8");
string = URLEncoder.encode(string, "UTF-8");
string = URLEncoder.encode(string, charset.name());
return string.replace("+", "%20").replace("*", "%2A").replace("%7E", "~");
} catch (Exception e) {
return string;
}
}

/**
 * Applies two literal substitutions to a path string: every {@code "%7E"}
 * becomes {@code "~"}, then every space character becomes {@code "%20"}.
 * The match is case-sensitive, so a lower-case {@code "%7e"} is left as-is.
 *
 * @param path
 *            Path string to normalize; must not be null.
 * @return The path with both substitutions applied.
 */
private static String normalizePath(final String path) {
return path.replace("%7E", "~").replace(" ", "%20");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import static org.junit.Assert.assertEquals;

import java.nio.charset.Charset;

import org.junit.Test;

import edu.uci.ics.crawler4j.url.URLCanonicalizer;
Expand Down Expand Up @@ -79,6 +81,9 @@ public void testCanonizalier() {

assertEquals("http://foo.bar/mydir/myfile?page=2",
URLCanonicalizer.getCanonicalURL("?page=2", "http://foo.bar/mydir/myfile"));

// test href with charset
assertEquals("http://www.example.com/3.asp?DengJh=%BA%E91700718",
URLCanonicalizer.getCanonicalURL("3.asp?DengJh=洪1700718", "http://www.example.com",
Charset.forName("gb2312")));
}
}

0 comments on commit 312408a

Please sign in to comment.