Skip to content

Commit

Permalink
Detect charset from xml prolog
Browse files Browse the repository at this point in the history
Fixes jhy#701
  • Loading branch information
jhy committed May 7, 2016
1 parent 2bca40c commit 4eb4f2b
Show file tree
Hide file tree
Showing 8 changed files with 127 additions and 81 deletions.
3 changes: 3 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
jsoup changelog

*** Release 1.9.2 [PENDING]
* In XML documents, detect the charset from the XML prolog - <?xml encoding="UTF-8"?>
<https://github.com/jhy/jsoup/issues/701>

* Fixed an issue where namespaced tags (like <fb:comment>) would cause Element.cssSelector() to fail.
<https://github.com/jhy/jsoup/pull/677>

Expand Down
104 changes: 61 additions & 43 deletions src/main/java/org/jsoup/helper/DataUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,15 @@

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.XmlDeclaration;
import org.jsoup.parser.Parser;

import java.io.*;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
Expand Down Expand Up @@ -90,51 +96,38 @@ static Document parseByteData(ByteBuffer byteData, String charsetName, String ba
Document doc = null;

// look for BOM - overrides any other header or input
byteData.mark();
byte[] bom = new byte[4];
if (byteData.remaining() >= bom.length) {
byteData.get(bom);
byteData.rewind();
}
if (bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF || // BE
bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE && bom[2] == 0x00 && bom[3] == 0x00) { // LE
charsetName = "UTF-32"; // and I hope it's on your system
} else if (bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF || // BE
bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
charsetName = "UTF-16"; // in all Javas
} else if (bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF) {
charsetName = "UTF-8"; // in all Javas
byteData.position(3); // 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed
}
charsetName = detectCharsetFromBom(byteData, charsetName);

if (charsetName == null) { // determine from meta. safe parse as UTF-8
if (charsetName == null) { // determine from meta. safe first parse as UTF-8
// look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
docData = Charset.forName(defaultCharset).decode(byteData).toString();
doc = parser.parseInput(docData, baseUri);
Element meta = doc.select("meta[http-equiv=content-type], meta[charset]").first();
if (meta != null) { // if not found, will keep utf-8 as best attempt
String foundCharset = null;
String foundCharset = null; // if not found, will keep utf-8 as best attempt
if (meta != null) {
if (meta.hasAttr("http-equiv")) {
foundCharset = getCharsetFromContentType(meta.attr("content"));
}
if (foundCharset == null && meta.hasAttr("charset")) {
try {
if (Charset.isSupported(meta.attr("charset"))) {
foundCharset = meta.attr("charset");
}
} catch (IllegalCharsetNameException e) {
foundCharset = null;
}
foundCharset = meta.attr("charset");
}

if (foundCharset != null && foundCharset.length() != 0 && !foundCharset.equals(defaultCharset)) { // need to re-decode
foundCharset = foundCharset.trim().replaceAll("[\"']", "");
charsetName = foundCharset;
byteData.rewind();
docData = Charset.forName(foundCharset).decode(byteData).toString();
doc = null;
}
// look for <?xml encoding='ISO-8859-1'?>
if (foundCharset == null && doc.childNode(0) instanceof XmlDeclaration) {
XmlDeclaration prolog = (XmlDeclaration) doc.childNode(0);
if (prolog.name().equals("xml")) {
foundCharset = prolog.attr("encoding");
}
}
foundCharset = validateCharset(foundCharset);

if (foundCharset != null && !foundCharset.equals(defaultCharset)) { // need to re-decode
foundCharset = foundCharset.trim().replaceAll("[\"']", "");
charsetName = foundCharset;
byteData.rewind();
docData = Charset.forName(foundCharset).decode(byteData).toString();
doc = null;
}
} else { // specified by content type header (or by user on file load)
Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
docData = Charset.forName(charsetName).decode(byteData).toString();
Expand Down Expand Up @@ -209,15 +202,20 @@ static String getCharsetFromContentType(String contentType) {
if (m.find()) {
String charset = m.group(1).trim();
charset = charset.replace("charset=", "");
if (charset.length() == 0) return null;
try {
if (Charset.isSupported(charset)) return charset;
charset = charset.toUpperCase(Locale.ENGLISH);
if (Charset.isSupported(charset)) return charset;
} catch (IllegalCharsetNameException e) {
// if our advanced charset matching fails.... we just take the default
return null;
}
return validateCharset(charset);
}
return null;
}

/**
 * Normalises a candidate charset name and checks that this JVM supports it.
 * <p>
 * Surrounding whitespace and any single or double quote characters are stripped first. The
 * cleaned name is tried as-is and then upper-cased (English locale).
 *
 * @param cs candidate charset name; may be null or empty
 * @return the cleaned name (possibly upper-cased) if it is supported; otherwise null, so the
 *         caller can fall back to its default charset
 */
private static String validateCharset(String cs) {
    if (cs == null || cs.length() == 0) {
        return null;
    }
    final String cleaned = cs.trim().replaceAll("[\"']", "");
    try {
        if (Charset.isSupported(cleaned)) {
            return cleaned;
        }
        final String upperCased = cleaned.toUpperCase(Locale.ENGLISH);
        return Charset.isSupported(upperCased) ? upperCased : null;
    } catch (IllegalCharsetNameException ignored) {
        // The name is not even syntactically legal: report "no usable charset" and let the
        // caller keep its default.
        return null;
    }
}
Expand All @@ -233,4 +231,24 @@ static String mimeBoundary() {
}
return mime.toString();
}

/**
 * Looks for a Unicode byte order mark (BOM) at the buffer's current position and, if one is
 * present, returns the matching charset name in place of the supplied one.
 * <p>
 * The bytes are peeked with absolute reads, so the buffer's position is left untouched except
 * when a UTF-8 BOM is found, in which case the position is advanced past its three bytes.
 * Each BOM is only matched when enough bytes are actually available, so inputs shorter than
 * four bytes (for example a file holding nothing but a UTF-8 or UTF-16 BOM) are recognised.
 *
 * @param byteData    the raw input; expected to be positioned at the start of the data
 * @param charsetName the charset name to return when no BOM is found; may be null
 * @return "UTF-32", "UTF-16" or "UTF-8" if the corresponding BOM is present, otherwise
 *         {@code charsetName} unchanged
 */
private static String detectCharsetFromBom(ByteBuffer byteData, String charsetName) {
    final int start = byteData.position();
    final int available = byteData.remaining();

    // Copy up to four leading bytes without moving the buffer's position; unread slots stay 0
    // and are never consulted because every check below is guarded by 'available'.
    final byte[] bom = new byte[4];
    for (int i = 0; i < bom.length && i < available; i++) {
        bom[i] = byteData.get(start + i);
    }

    // UTF-32 must be tested before UTF-16: the UTF-32 LE BOM starts with the UTF-16 LE BOM.
    if (available >= 4
            && (bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF // BE
            || bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE && bom[2] == 0x00 && bom[3] == 0x00)) { // LE
        charsetName = "UTF-32"; // and I hope it's on your system
    } else if (available >= 2
            && (bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF // BE
            || bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE)) { // LE
        charsetName = "UTF-16"; // in all Javas
    } else if (available >= 3
            && bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF) {
        charsetName = "UTF-8"; // in all Javas
        // 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed here
        byteData.position(start + 3);
    }
    return charsetName;
}
}
2 changes: 1 addition & 1 deletion src/main/java/org/jsoup/nodes/Document.java
Original file line number Diff line number Diff line change
Expand Up @@ -332,7 +332,7 @@ private void ensureMetaCharsetElement() {
if (node instanceof XmlDeclaration) {
XmlDeclaration decl = (XmlDeclaration) node;

if (decl.attr(XmlDeclaration.DECL_KEY).equals("xml")) {
if (decl.name().equals("xml")) {
decl.attr("encoding", charset().displayName());

final String version = decl.attr("version");
Expand Down
53 changes: 24 additions & 29 deletions src/main/java/org/jsoup/nodes/XmlDeclaration.java
Original file line number Diff line number Diff line change
@@ -1,65 +1,60 @@
package org.jsoup.nodes;

import org.jsoup.helper.Validate;

import java.io.IOException;

/**
An XML Declaration.
@author Jonathan Hedley, [email protected] */
public class XmlDeclaration extends Node {
static final String DECL_KEY = "declaration";
private final String name;
private final boolean isProcessingInstruction; // <! if true, <? if false, declaration (and last data char should be ?)

/**
Create a new XML declaration
@param data data
@param name of declaration
@param baseUri base uri
@param isProcessingInstruction is processing instruction
*/
public XmlDeclaration(String data, String baseUri, boolean isProcessingInstruction) {
public XmlDeclaration(String name, String baseUri, boolean isProcessingInstruction) {
super(baseUri);
attributes.put(DECL_KEY, data);
Validate.notNull(name);
this.name = name;
this.isProcessingInstruction = isProcessingInstruction;
}

public String nodeName() {
return "#declaration";
}


/**
* Get the name of this declaration, as supplied to the constructor. Callers compare it
* against "xml" to recognise the XML prolog.
* @return name of this declaration; never null, since the constructor validates it.
*/
public String name() {
return name;
}

/**
Get the unencoded XML declaration.
@return XML declaration
*/
public String getWholeDeclaration() {
final String decl = attributes.get(DECL_KEY);

if(decl.equals("xml") && attributes.size() > 1 ) {
StringBuilder sb = new StringBuilder(decl);
final String version = attributes.get("version");

if( version != null ) {
sb.append(" version=\"").append(version).append("\"");
}

final String encoding = attributes.get("encoding");

if( encoding != null ) {
sb.append(" encoding=\"").append(encoding).append("\"");
}

return sb.toString();
}
else {
return attributes.get(DECL_KEY);
}
return attributes.html().trim(); // attr html starts with a " "
}

void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException {
accum
.append("<")
.append(isProcessingInstruction ? "!" : "?")
.append(getWholeDeclaration())
.append(">");
.append("<")
.append(isProcessingInstruction ? "!" : "?")
.append(name);
attributes.html(accum, out);
accum
.append(isProcessingInstruction ? "!" : "?")
.append(">");
}

void outerHtmlTail(Appendable accum, int depth, Document.OutputSettings out) {}
Expand Down
8 changes: 6 additions & 2 deletions src/main/java/org/jsoup/parser/XmlTreeBuilder.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.jsoup.parser;

import org.jsoup.Jsoup;
import org.jsoup.helper.Validate;
import org.jsoup.nodes.*;

Expand Down Expand Up @@ -70,10 +71,13 @@ void insert(Token.Comment commentToken) {
Comment comment = new Comment(commentToken.getData(), baseUri);
Node insert = comment;
if (commentToken.bogus) { // xml declarations are emitted as bogus comments (which is right for html, but not xml)
// so we do a bit of a hack and parse the data as an element to pull the attributes out
String data = comment.getData();
if (data.length() > 1 && (data.startsWith("!") || data.startsWith("?"))) {
String declaration = data.substring(1);
insert = new XmlDeclaration(declaration, comment.baseUri(), data.startsWith("!"));
Document doc = Jsoup.parse("<" + data.substring(1, data.length() -1) + ">", baseUri, Parser.xmlParser());
Element el = doc.child(0);
insert = new XmlDeclaration(el.tagName(), comment.baseUri(), data.startsWith("!"));
insert.attributes().addAll(el.attributes());
}
}
insertNode(insert);
Expand Down
8 changes: 4 additions & 4 deletions src/test/java/org/jsoup/nodes/DocumentTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ public void testMetaCharsetUpdateXmlUtf8() {
doc.updateMetaCharsetElement(true);
doc.charset(Charset.forName(charsetUtf8));

final String xmlCharsetUTF8 = "<?xml version=\"1.0\" encoding=\"" + charsetUtf8 + "\">\n" +
final String xmlCharsetUTF8 = "<?xml version=\"1.0\" encoding=\"" + charsetUtf8 + "\"?>\n" +
"<root>\n" +
" node\n" +
"</root>";
Expand All @@ -330,7 +330,7 @@ public void testMetaCharsetUpdateXmlIso8859() {
doc.updateMetaCharsetElement(true);
doc.charset(Charset.forName(charsetIso8859));

final String xmlCharsetISO = "<?xml version=\"1.0\" encoding=\"" + charsetIso8859 + "\">\n" +
final String xmlCharsetISO = "<?xml version=\"1.0\" encoding=\"" + charsetIso8859 + "\"?>\n" +
"<root>\n" +
" node\n" +
"</root>";
Expand All @@ -348,7 +348,7 @@ public void testMetaCharsetUpdateXmlNoCharset() {
doc.updateMetaCharsetElement(true);
doc.charset(Charset.forName(charsetUtf8));

final String xmlCharsetUTF8 = "<?xml version=\"1.0\" encoding=\"" + charsetUtf8 + "\">\n" +
final String xmlCharsetUTF8 = "<?xml version=\"1.0\" encoding=\"" + charsetUtf8 + "\"?>\n" +
"<root>\n" +
" node\n" +
"</root>";
Expand All @@ -372,7 +372,7 @@ public void testMetaCharsetUpdateXmlDisabled() {
public void testMetaCharsetUpdateXmlDisabledNoChanges() {
final Document doc = createXmlDocument("dontTouch", "dontTouch", true);

final String xmlCharset = "<?xml version=\"dontTouch\" encoding=\"dontTouch\">\n" +
final String xmlCharset = "<?xml version=\"dontTouch\" encoding=\"dontTouch\"?>\n" +
"<root>\n" +
" node\n" +
"</root>";
Expand Down
28 changes: 26 additions & 2 deletions src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.nodes.XmlDeclaration;
import org.junit.Ignore;
import org.junit.Test;

Expand All @@ -17,7 +18,8 @@
import java.util.List;

import static org.jsoup.nodes.Document.OutputSettings.Syntax;
import static org.junit.Assert.*;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;

/**
* Tests XmlTreeBuilder.
Expand Down Expand Up @@ -103,7 +105,7 @@ public void testDoesNotForceSelfClosingKnownTags() {
@Test public void handlesXmlDeclarationAsDeclaration() {
String html = "<?xml encoding='UTF-8' ?><body>One</body><!-- comment -->";
Document doc = Jsoup.parse(html, "", Parser.xmlParser());
assertEquals("<?xml encoding='UTF-8' ?> <body> One </body> <!-- comment -->",
assertEquals("<?xml encoding=\"UTF-8\"?> <body> One </body> <!-- comment -->",
StringUtil.normaliseWhitespace(doc.outerHtml()));
assertEquals("#declaration", doc.childNode(0).nodeName());
assertEquals("#comment", doc.childNode(2).nodeName());
Expand All @@ -130,4 +132,26 @@ public void testDoesHandleEOFInTag() {
Document xmlDoc = Jsoup.parse(html, "", Parser.xmlParser());
assertEquals("<img src=\"asdf\" onerror=\"alert(1)\" x=\"\" />", xmlDoc.html());
}

/**
 * Parses an XML resource with no charset supplied and checks that the charset is taken from the
 * encoding attribute of the XML prolog, and that the document round-trips with that encoding.
 */
@Test
public void testDetectCharsetEncodingDeclaration() throws IOException, URISyntaxException {
    File xmlFile = new File(XmlTreeBuilder.class.getResource("/htmltests/xml-charset.xml").toURI());
    InputStream inStream = new FileInputStream(xmlFile);
    try {
        // null charset: forces detection from the document itself
        Document doc = Jsoup.parse(inStream, null, "http://example.com/", Parser.xmlParser());
        assertEquals("ISO-8859-1", doc.charset().name());
        assertEquals("<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?> <data>äöåéü</data>",
            TextUtil.stripNewlines(doc.html()));
    } finally {
        // close the stream ourselves so the file handle is released even if an assertion fails
        inStream.close();
    }
}

/**
 * Checks that the attributes of an XML declaration are parsed into individual attributes, and
 * that the declaration is re-serialised with normalised (double) quoting.
 */
@Test
public void testParseDeclarationAttributes() {
    final String input = "<?xml version='1' encoding='UTF-8' something='else'?><val>One</val>";
    final Document parsed = Jsoup.parse(input, "", Parser.xmlParser());
    final XmlDeclaration prolog = (XmlDeclaration) parsed.childNode(0);

    // each attribute is individually addressable
    assertEquals("1", prolog.attr("version"));
    assertEquals("UTF-8", prolog.attr("encoding"));
    assertEquals("else", prolog.attr("something"));

    // serialised forms: attributes only, and the complete declaration
    assertEquals("version=\"1\" encoding=\"UTF-8\" something=\"else\"", prolog.getWholeDeclaration());
    assertEquals("<?xml version=\"1\" encoding=\"UTF-8\" something=\"else\"?>", prolog.outerHtml());
}
}
2 changes: 2 additions & 0 deletions src/test/resources/htmltests/xml-charset.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="ISO-8859-1"?>
<data>äöåéü</data>

0 comments on commit 4eb4f2b

Please sign in to comment.