Skip to content

Commit

Permalink
Merge pull request internetarchive#36 from nlevitt/bad-charset
Browse files Browse the repository at this point in the history
Avoid the exception below by handling bad charsets in FetchHTTP. Restore...
  • Loading branch information
eldondev committed Jan 29, 2014
2 parents 6933232 + 31aea01 commit 10ec391
Show file tree
Hide file tree
Showing 3 changed files with 122 additions and 12 deletions.
21 changes: 15 additions & 6 deletions modules/src/main/java/org/archive/modules/fetcher/FetchHTTP.java
Original file line number Diff line number Diff line change
Expand Up @@ -577,12 +577,21 @@ protected void setOtherCodings(CrawlURI uri, final Recorder rec,
*/
protected void setCharacterEncoding(CrawlURI curi, final Recorder rec,
final HttpResponse response) {
Charset charset = ContentType.getOrDefault(response.getEntity()).getCharset();
if (charset != null) {
rec.setCharset(charset);
} else {
// curi.getAnnotations().add("unsatisfiableCharsetInHeader:"+StringUtils.stripToEmpty(encoding));
rec.setCharset(getDefaultCharset());
rec.setCharset(getDefaultCharset());
try {
Charset charset = ContentType.getOrDefault(response.getEntity()).getCharset();
if (charset != null) {
rec.setCharset(charset);
}
} catch (IllegalArgumentException e) {
// exception could be UnsupportedCharsetException or IllegalCharsetNameException
String unsatisfiableCharset;
try {
unsatisfiableCharset = response.getFirstHeader("content-type").getElements()[0].getParameterByName("charset").getValue();
} catch (Exception f) {
unsatisfiableCharset = "<failed-to-parse>";
}
curi.getAnnotations().add("unsatisfiableCharsetInHeader:"+StringUtils.stripToEmpty(unsatisfiableCharset));
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,41 @@ public class FetchHTTPTest extends ProcessorTestBase {
-54, -50, -55, -51, -53, 47, 40, 44, 42, 46, 41, 45, 43, -81, -88,
-84, 50, 48, 52, 50, 54, 49, 53, 51, -73, -80, -28, 2, 0, -43, 104,
-33, -11, 37, 0, 0, 0 };

/*
 * Raw response body written verbatim by the test handler for the "/cp1251"
 * target, which declares "text/plain;charset=cp1251". Most bytes are above
 * 0x7f; the payload ends with a single newline (0x0a).
 * NOTE(review): presumably the cp1251 encoding of the Cyrillic sentence that
 * testCharsets expects back after decoding - confirm against that test.
 */
protected static final byte[] CP1251_PAYLOAD = {
(byte) 0xca, (byte) 0xee, (byte) 0xf7, (byte) 0xe0, (byte) 0xed, (byte) 0xe8,
(byte) 0x20, (byte) 0xce, (byte) 0xf0, (byte) 0xea, (byte) 0xe5, (byte) 0xf1,
(byte) 0xf2, (byte) 0xe0, (byte) 0xf0, (byte) 0x20, (byte) 0xe5, (byte) 0x20,
(byte) 0xe5, (byte) 0xe4, (byte) 0xe5, (byte) 0xed, (byte) 0x20, (byte) 0xee,
(byte) 0xe4, (byte) 0x20, (byte) 0xed, (byte) 0xe0, (byte) 0xbc, (byte) 0xef,
(byte) 0xee, (byte) 0xe7, (byte) 0xed, (byte) 0xe0, (byte) 0xf2, (byte) 0xe8,
(byte) 0xf2, (byte) 0xe5, (byte) 0x20, (byte) 0xe8, (byte) 0x20, (byte) 0xed,
(byte) 0xe0, (byte) 0xbc, (byte) 0xef, (byte) 0xee, (byte) 0xef, (byte) 0xf3,
(byte) 0xeb, (byte) 0xe0, (byte) 0xf0, (byte) 0xed, (byte) 0xe8, (byte) 0xf2,
(byte) 0xe5, (byte) 0x20, (byte) 0xe1, (byte) 0xeb, (byte) 0xe5, (byte) 0xf5,
(byte) 0x2d, (byte) 0xee, (byte) 0xf0, (byte) 0xea, (byte) 0xe5, (byte) 0xf1,
(byte) 0xf2, (byte) 0xf0, (byte) 0xe8, (byte) 0x20, (byte) 0xe2, (byte) 0xee,
(byte) 0x20, (byte) 0xf1, (byte) 0xe2, (byte) 0xe5, (byte) 0xf2, (byte) 0xee,
(byte) 0xf2, (byte) 0x2c, (byte) 0x20, (byte) 0xea, (byte) 0xee, (byte) 0xbc,
(byte) 0x20, (byte) 0xe3, (byte) 0xee, (byte) 0x20, (byte) 0xf1, (byte) 0xee,
(byte) 0xf7, (byte) 0xe8, (byte) 0xed, (byte) 0xf3, (byte) 0xe2, (byte) 0xe0,
(byte) 0xe0, (byte) 0xf2, (byte) 0x20, (byte) 0xe4, (byte) 0xe5, (byte) 0xf1,
(byte) 0xe5, (byte) 0xf2, (byte) 0xec, (byte) 0xe8, (byte) 0xed, (byte) 0xe0,
(byte) 0x20, (byte) 0xd0, (byte) 0xee, (byte) 0xec, (byte) 0xe8, (byte) 0x2d,
(byte) 0xcc, (byte) 0xe0, (byte) 0xea, (byte) 0xe5, (byte) 0xe4, (byte) 0xee,
(byte) 0xed, (byte) 0xf6, (byte) 0xe8, (byte) 0x20, (byte) 0xef, (byte) 0xee,
(byte) 0x20, (byte) 0xef, (byte) 0xee, (byte) 0xf2, (byte) 0xe5, (byte) 0xea,
(byte) 0xeb, (byte) 0xee, (byte) 0x20, (byte) 0xee, (byte) 0xe4, (byte) 0x20,
(byte) 0xca, (byte) 0xee, (byte) 0xf7, (byte) 0xe0, (byte) 0xed, (byte) 0xe8,
(byte) 0x2c, (byte) 0x20, (byte) 0xef, (byte) 0xf0, (byte) 0xe5, (byte) 0xe4,
(byte) 0xe2, (byte) 0xee, (byte) 0xe4, (byte) 0xe5, (byte) 0xed, (byte) 0xe8,
(byte) 0x20, (byte) 0xee, (byte) 0xe4, (byte) 0x20, (byte) 0xf2, (byte) 0xf0,
(byte) 0xf3, (byte) 0xe1, (byte) 0xe0, (byte) 0xf7, (byte) 0xee, (byte) 0xf2,
(byte) 0x20, (byte) 0xcd, (byte) 0xe0, (byte) 0xe0, (byte) 0xf2, (byte) 0x20,
(byte) 0x28, (byte) 0xcd, (byte) 0xe5, (byte) 0xe0, (byte) 0xf2, (byte) 0x29,
(byte) 0x20, (byte) 0xc2, (byte) 0xe5, (byte) 0xeb, (byte) 0xe8, (byte) 0xee,
(byte) 0xe2, (byte) 0x2e, (byte) 0x0a,
};

// 79 digits plus a newline. Encoded explicitly as US-ASCII so the array is
// exactly 80 bytes regardless of the JVM's platform default charset.
protected static final byte[] EIGHTY_BYTE_LINE = "1234567890123456789012345678901234567890123456789012345678901234567890123456789\n".getBytes(java.nio.charset.Charset.forName("US-ASCII"));

Expand Down Expand Up @@ -142,6 +177,27 @@ public void handle(String target, HttpServletRequest request,
response.setHeader("ETag", ETAG_TEST_VALUE);
response.getOutputStream().write(DEFAULT_PAYLOAD_STRING.getBytes("US-ASCII"));
((Request)request).setHandled(true);
} else if (target.equals("/cp1251")) {
response.setContentType("text/plain;charset=cp1251");
response.setDateHeader("Last-Modified", 0);
response.setHeader("ETag", ETAG_TEST_VALUE);
response.setStatus(HttpServletResponse.SC_OK);
response.getOutputStream().write(CP1251_PAYLOAD);
((Request)request).setHandled(true);
} else if (target.equals("/unsupported-charset")) {
response.setContentType("text/plain;charset=UNSUPPORTED-CHARSET");
response.setDateHeader("Last-Modified", 0);
response.setHeader("ETag", ETAG_TEST_VALUE);
response.setStatus(HttpServletResponse.SC_OK);
response.getOutputStream().write(DEFAULT_PAYLOAD_STRING.getBytes("US-ASCII"));
((Request)request).setHandled(true);
} else if (target.equals("/invalid-charset")) {
response.setContentType("text/plain;charset=%%INVALID-CHARSET%%");
response.setDateHeader("Last-Modified", 0);
response.setHeader("ETag", ETAG_TEST_VALUE);
response.setStatus(HttpServletResponse.SC_OK);
response.getOutputStream().write(DEFAULT_PAYLOAD_STRING.getBytes("US-ASCII"));
((Request)request).setHandled(true);
} else {
response.setContentType("text/plain;charset=US-ASCII");
response.setDateHeader("Last-Modified", 0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,10 @@

import org.apache.commons.httpclient.URIException;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.http.NoHttpResponseException;
import org.archive.httpclient.ConfigurableX509TrustManager.TrustLevel;
import org.archive.io.ReplayCharSequence;
import org.archive.modules.CrawlMetadata;
import org.archive.modules.CrawlURI;
import org.archive.modules.CrawlURI.FetchType;
Expand Down Expand Up @@ -138,8 +140,10 @@ protected void runDefaultChecks(CrawlURI curi, String... exclusionsArray)

// check various
assertEquals("sha1:TQ5R6YVOZLTQENRIIENVGXHOPX3YCRNJ", curi.getContentDigestSchemeString());
assertEquals("text/plain;charset=US-ASCII", curi.getContentType());
assertEquals(Charset.forName("US-ASCII"), curi.getRecorder().getCharset());
if (!exclusions.contains("contentType")) {
assertEquals("text/plain;charset=US-ASCII", curi.getContentType());
assertEquals(Charset.forName("US-ASCII"), curi.getRecorder().getCharset());
}
assertTrue(curi.getCredentials().isEmpty());
assertTrue(curi.getFetchDuration() >= 0);
if (!exclusions.contains("fetchStatus")) {
Expand Down Expand Up @@ -765,16 +769,57 @@ public void testTwoQuestionMarks() throws Exception {
}

public void testUrlWithSpaces() throws Exception {
    // Both fetches are expected to record the same request line, with every
    // space in the path and in the query written as %20.
    final String expectedRequestLine =
            "GET /url%20with%20spaces?query%20with%20spaces HTTP/1.0\r\n";

    // raw spaces in the path, already-encoded spaces in the query
    CrawlURI curi = makeCrawlURI("http://localhost:7777/url with spaces?query%20with%20spaces");
    fetcher().process(curi);
    assertTrue(httpRequestString(curi).startsWith(expectedRequestLine));
    runDefaultChecks(curi, "requestLine");

    // already-encoded spaces in the path, raw spaces in the query
    curi = makeCrawlURI("http://localhost:7777/url%20with%20spaces?query with spaces");
    fetcher().process(curi);
    assertTrue(httpRequestString(curi).startsWith(expectedRequestLine));
    runDefaultChecks(curi, "requestLine");
}

public void testCharsets() throws Exception {
    // Text the cp1251 payload is expected to decode to.
    final String decodedCp1251Text =
            "\u041A\u043E\u0447\u0430\u043D\u0438 \u041E\u0440\u043A"
            + "\u0435\u0441\u0442\u0430\u0440 \u0435 \u0435\u0434\u0435"
            + "\u043D \u043E\u0434 \u043D\u0430\u0458\u043F\u043E\u0437"
            + "\u043D\u0430\u0442\u0438\u0442\u0435 \u0438 \u043D\u0430"
            + "\u0458\u043F\u043E\u043F\u0443\u043B\u0430\u0440\u043D"
            + "\u0438\u0442\u0435 \u0431\u043B\u0435\u0445-\u043E\u0440"
            + "\u043A\u0435\u0441\u0442\u0440\u0438 \u0432\u043E \u0441"
            + "\u0432\u0435\u0442\u043E\u0442, \u043A\u043E\u0458 \u0433"
            + "\u043E \u0441\u043E\u0447\u0438\u043D\u0443\u0432\u0430"
            + "\u0430\u0442 \u0434\u0435\u0441\u0435\u0442\u043C\u0438"
            + "\u043D\u0430 \u0420\u043E\u043C\u0438-\u041C\u0430\u043A"
            + "\u0435\u0434\u043E\u043D\u0446\u0438 \u043F\u043E \u043F"
            + "\u043E\u0442\u0435\u043A\u043B\u043E \u043E\u0434 \u041A"
            + "\u043E\u0447\u0430\u043D\u0438, \u043F\u0440\u0435\u0434"
            + "\u0432\u043E\u0434\u0435\u043D\u0438 \u043E\u0434 \u0442"
            + "\u0440\u0443\u0431\u0430\u0447\u043E\u0442 \u041D\u0430"
            + "\u0430\u0442 (\u041D\u0435\u0430\u0442) \u0412\u0435\u043B"
            + "\u0438\u043E\u0432.\n";

    // Supported charset: the recorder adopts it, and the body replays both
    // as the original bytes and as correctly decoded text.
    CrawlURI cp1251Curi = makeCrawlURI("http://localhost:7777/cp1251");
    fetcher().process(cp1251Curi);
    assertEquals("text/plain;charset=cp1251", cp1251Curi.getHttpResponseHeader("content-type"));
    assertEquals(Charset.forName("cp1251"), cp1251Curi.getRecorder().getCharset());
    byte[] replayedBytes = IOUtils.toByteArray(cp1251Curi.getRecorder().getContentReplayInputStream());
    assertTrue(Arrays.equals(FetchHTTPTest.CP1251_PAYLOAD, replayedBytes));
    String replayedText = cp1251Curi.getRecorder().getContentReplayCharSequence().toString();
    assertEquals(decodedCp1251Text, replayedText);

    // Charsets that cannot be satisfied, as { url, content-type header,
    // expected annotation }: first an unknown name, then an illegal name.
    final String[][] unsatisfiableScenarios = {
        {
            "http://localhost:7777/unsupported-charset",
            "text/plain;charset=UNSUPPORTED-CHARSET",
            "unsatisfiableCharsetInHeader:UNSUPPORTED-CHARSET",
        },
        {
            "http://localhost:7777/invalid-charset",
            "text/plain;charset=%%INVALID-CHARSET%%",
            "unsatisfiableCharsetInHeader:%%INVALID-CHARSET%%",
        },
    };
    for (String[] scenario : unsatisfiableScenarios) {
        CrawlURI badCharsetCuri = makeCrawlURI(scenario[0]);
        fetcher().process(badCharsetCuri);
        assertEquals(scenario[1], badCharsetCuri.getHttpResponseHeader("content-type"));
        assertTrue(badCharsetCuri.getAnnotations().contains(scenario[2]));
        // the recorder falls back to the default charset
        assertEquals(Charset.forName("latin1"), badCharsetCuri.getRecorder().getCharset());
        runDefaultChecks(badCharsetCuri, "requestLine", "contentType");
    }
}

@Override
protected FetchHTTP makeModule() throws IOException {
Expand Down

0 comments on commit 10ec391

Please sign in to comment.