Detect charset from xml prolog

Fixes jhy#701
gzwz · May 7, 2016 · 4eb4f2b · 4eb4f2b
1 parent 2bca40c
commit 4eb4f2b
Show file tree

Hide file tree

Showing 8 changed files with 127 additions and 81 deletions.
diff --git a/CHANGES b/CHANGES
@@ -1,6 +1,9 @@
 jsoup changelog
 
 *** Release 1.9.2 [PENDING]
+ * In XML documents, detect the charset from the XML prolog - <?xml encoding="UTF-8"?>
+   <https://github.com/jhy/jsoup/issues/701>
+
  * Fixed an issue where namespaced tags (like <fb:comment>) would cause Element.cssSelector() to fail.
    <https://github.com/jhy/jsoup/pull/677>
 

diff --git a/src/main/java/org/jsoup/helper/DataUtil.java b/src/main/java/org/jsoup/helper/DataUtil.java
@@ -2,9 +2,15 @@
 
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
+import org.jsoup.nodes.XmlDeclaration;
 import org.jsoup.parser.Parser;
 
-import java.io.*;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.RandomAccessFile;
 import java.nio.ByteBuffer;
 import java.nio.charset.Charset;
 import java.nio.charset.IllegalCharsetNameException;
@@ -90,51 +96,38 @@ static Document parseByteData(ByteBuffer byteData, String charsetName, String ba
         Document doc = null;
 
         // look for BOM - overrides any other header or input
-        byteData.mark();
-        byte[] bom = new byte[4];
-        if (byteData.remaining() >= bom.length) {
-            byteData.get(bom);
-            byteData.rewind();
-        }
-        if (bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF || // BE
-                bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE && bom[2] == 0x00 && bom[3] == 0x00) { // LE
-            charsetName = "UTF-32"; // and I hope it's on your system
-        } else if (bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF || // BE
-                bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
-            charsetName = "UTF-16"; // in all Javas
-        } else if (bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF) {
-            charsetName = "UTF-8"; // in all Javas
-            byteData.position(3); // 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed
-        }
+        charsetName = detectCharsetFromBom(byteData, charsetName);
 
-        if (charsetName == null) { // determine from meta. safe parse as UTF-8
+        if (charsetName == null) { // determine from meta. safe first parse as UTF-8
             // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
             docData = Charset.forName(defaultCharset).decode(byteData).toString();
             doc = parser.parseInput(docData, baseUri);
             Element meta = doc.select("meta[http-equiv=content-type], meta[charset]").first();
-            if (meta != null) { // if not found, will keep utf-8 as best attempt
-                String foundCharset = null;
+            String foundCharset = null; // if not found, will keep utf-8 as best attempt
+            if (meta != null) {
                 if (meta.hasAttr("http-equiv")) {
                     foundCharset = getCharsetFromContentType(meta.attr("content"));
                 }
                 if (foundCharset == null && meta.hasAttr("charset")) {
-                    try {
-                        if (Charset.isSupported(meta.attr("charset"))) {
-                            foundCharset = meta.attr("charset");
-                        }
-                    } catch (IllegalCharsetNameException e) {
-                        foundCharset = null;
-                    }
+                    foundCharset = meta.attr("charset");
                 }
-
-                if (foundCharset != null && foundCharset.length() != 0 && !foundCharset.equals(defaultCharset)) { // need to re-decode
-                    foundCharset = foundCharset.trim().replaceAll("[\"']", "");
-                    charsetName = foundCharset;
-                    byteData.rewind();
-                    docData = Charset.forName(foundCharset).decode(byteData).toString();
-                    doc = null;
+            }
+            // look for <?xml encoding='ISO-8859-1'?>
+            if (foundCharset == null && doc.childNode(0) instanceof XmlDeclaration) {
+                XmlDeclaration prolog = (XmlDeclaration) doc.childNode(0);
+                if (prolog.name().equals("xml")) {
+                    foundCharset = prolog.attr("encoding");
                 }
             }
+            foundCharset = validateCharset(foundCharset);
+
+            if (foundCharset != null && !foundCharset.equals(defaultCharset)) { // need to re-decode
+                foundCharset = foundCharset.trim().replaceAll("[\"']", "");
+                charsetName = foundCharset;
+                byteData.rewind();
+                docData = Charset.forName(foundCharset).decode(byteData).toString();
+                doc = null;
+            }
         } else { // specified by content type header (or by user on file load)
             Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
             docData = Charset.forName(charsetName).decode(byteData).toString();
@@ -209,15 +202,20 @@ static String getCharsetFromContentType(String contentType) {
         if (m.find()) {
             String charset = m.group(1).trim();
             charset = charset.replace("charset=", "");
-            if (charset.length() == 0) return null;
-            try {
-                if (Charset.isSupported(charset)) return charset;
-                charset = charset.toUpperCase(Locale.ENGLISH);
-                if (Charset.isSupported(charset)) return charset;
-            } catch (IllegalCharsetNameException e) {
-                // if our advanced charset matching fails.... we just take the default
-                return null;
-            }
+            return validateCharset(charset);
+        }
+        return null;
+    }
+
+    private static String validateCharset(String cs) {
+        if (cs == null || cs.length() == 0) return null;
+        cs = cs.trim().replaceAll("[\"']", "");
+        try {
+            if (Charset.isSupported(cs)) return cs;
+            cs = cs.toUpperCase(Locale.ENGLISH);
+            if (Charset.isSupported(cs)) return cs;
+        } catch (IllegalCharsetNameException e) {
+            // if this charset name is illegal, fall through and return null so the caller just takes the default
         }
         return null;
     }
@@ -233,4 +231,24 @@ static String mimeBoundary() {
         }
         return mime.toString();
     }
+
+    private static String detectCharsetFromBom(ByteBuffer byteData, String charsetName) {
+        byteData.mark();
+        byte[] bom = new byte[4];
+        if (byteData.remaining() >= bom.length) {
+            byteData.get(bom);
+            byteData.rewind();
+        }
+        if (bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF || // BE
+            bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE && bom[2] == 0x00 && bom[3] == 0x00) { // LE
+            charsetName = "UTF-32"; // and I hope it's on your system
+        } else if (bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF || // BE
+            bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
+            charsetName = "UTF-16"; // in all Javas
+        } else if (bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF) {
+            charsetName = "UTF-8"; // in all Javas
+            byteData.position(3); // 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed here
+        }
+        return charsetName;
+    }
 }
diff --git a/src/main/java/org/jsoup/nodes/Document.java b/src/main/java/org/jsoup/nodes/Document.java
@@ -332,7 +332,7 @@ private void ensureMetaCharsetElement() {
                 if (node instanceof XmlDeclaration) {
                     XmlDeclaration decl = (XmlDeclaration) node;
 
-                    if (decl.attr(XmlDeclaration.DECL_KEY).equals("xml")) {
+                    if (decl.name().equals("xml")) {
                         decl.attr("encoding", charset().displayName());
 
                         final String version = decl.attr("version");

diff --git a/src/main/java/org/jsoup/nodes/XmlDeclaration.java b/src/main/java/org/jsoup/nodes/XmlDeclaration.java
@@ -1,65 +1,60 @@
 package org.jsoup.nodes;
 
+import org.jsoup.helper.Validate;
+
 import java.io.IOException;
 
 /**
  An XML Declaration.
 
  @author Jonathan Hedley, [email protected] */
 public class XmlDeclaration extends Node {
-    static final String DECL_KEY = "declaration";
+    private final String name;
     private final boolean isProcessingInstruction; // <! if true, <? if false, declaration (and last data char should be ?)
 
     /**
      Create a new XML declaration
-     @param data data
+     @param name name of declaration
      @param baseUri base uri
      @param isProcessingInstruction is processing instruction
      */
-    public XmlDeclaration(String data, String baseUri, boolean isProcessingInstruction) {
+    public XmlDeclaration(String name, String baseUri, boolean isProcessingInstruction) {
         super(baseUri);
-        attributes.put(DECL_KEY, data);
+        Validate.notNull(name);
+        this.name = name;
         this.isProcessingInstruction = isProcessingInstruction;
     }
 
     public String nodeName() {
         return "#declaration";
     }
 
+
+    /**
+     * Get the name of this declaration.
+     * @return name of this declaration.
+     */
+    public String name() {
+        return name;
+    }
+
     /**
      Get the unencoded XML declaration.
      @return XML declaration
      */
     public String getWholeDeclaration() {
-        final String decl = attributes.get(DECL_KEY);
-
-        if(decl.equals("xml") && attributes.size() > 1 ) {
-            StringBuilder sb = new StringBuilder(decl);
-            final String version = attributes.get("version");
-
-            if( version != null ) {
-                sb.append(" version=\"").append(version).append("\"");
-            }
-
-            final String encoding = attributes.get("encoding");
-
-            if( encoding != null ) {
-                sb.append(" encoding=\"").append(encoding).append("\"");
-            }
-
-            return sb.toString();
-        }
-        else {
-            return attributes.get(DECL_KEY);
-        }
+        return attributes.html().trim(); // attr html starts with a " "
     }
 
 	void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException {
         accum
-                .append("<")
-                .append(isProcessingInstruction ? "!" : "?")
-                .append(getWholeDeclaration())
-                .append(">");
+            .append("<")
+            .append(isProcessingInstruction ? "!" : "?")
+            .append(name);
+        attributes.html(accum, out);
+        accum
+            .append(isProcessingInstruction ? "!" : "?")
+            .append(">");
     }
 
 	void outerHtmlTail(Appendable accum, int depth, Document.OutputSettings out) {}

diff --git a/src/main/java/org/jsoup/parser/XmlTreeBuilder.java b/src/main/java/org/jsoup/parser/XmlTreeBuilder.java
@@ -1,5 +1,6 @@
 package org.jsoup.parser;
 
+import org.jsoup.Jsoup;
 import org.jsoup.helper.Validate;
 import org.jsoup.nodes.*;
 
@@ -70,10 +71,13 @@ void insert(Token.Comment commentToken) {
         Comment comment = new Comment(commentToken.getData(), baseUri);
         Node insert = comment;
         if (commentToken.bogus) { // xml declarations are emitted as bogus comments (which is right for html, but not xml)
+            // so we do a bit of a hack and parse the data as an element to pull the attributes out
             String data = comment.getData();
             if (data.length() > 1 && (data.startsWith("!") || data.startsWith("?"))) {
-                String declaration = data.substring(1);
-                insert = new XmlDeclaration(declaration, comment.baseUri(), data.startsWith("!"));
+                Document doc = Jsoup.parse("<" + data.substring(1, data.length() -1) + ">", baseUri, Parser.xmlParser());
+                Element el = doc.child(0);
+                insert = new XmlDeclaration(el.tagName(), comment.baseUri(), data.startsWith("!"));
+                insert.attributes().addAll(el.attributes());
             }
         }
         insertNode(insert);

diff --git a/src/test/java/org/jsoup/nodes/DocumentTest.java b/src/test/java/org/jsoup/nodes/DocumentTest.java
@@ -312,7 +312,7 @@ public void testMetaCharsetUpdateXmlUtf8() {
         doc.updateMetaCharsetElement(true);
         doc.charset(Charset.forName(charsetUtf8));
 
-        final String xmlCharsetUTF8 = "<?xml version=\"1.0\" encoding=\"" + charsetUtf8 + "\">\n" +
+        final String xmlCharsetUTF8 = "<?xml version=\"1.0\" encoding=\"" + charsetUtf8 + "\"?>\n" +
                                         "<root>\n" +
                                         " node\n" +
                                         "</root>";
@@ -330,7 +330,7 @@ public void testMetaCharsetUpdateXmlIso8859() {
         doc.updateMetaCharsetElement(true);
         doc.charset(Charset.forName(charsetIso8859));
 
-        final String xmlCharsetISO = "<?xml version=\"1.0\" encoding=\"" + charsetIso8859 + "\">\n" +
+        final String xmlCharsetISO = "<?xml version=\"1.0\" encoding=\"" + charsetIso8859 + "\"?>\n" +
                                         "<root>\n" +
                                         " node\n" +
                                         "</root>";
@@ -348,7 +348,7 @@ public void testMetaCharsetUpdateXmlNoCharset() {
         doc.updateMetaCharsetElement(true);
         doc.charset(Charset.forName(charsetUtf8));
 
-        final String xmlCharsetUTF8 = "<?xml version=\"1.0\" encoding=\"" + charsetUtf8 + "\">\n" +
+        final String xmlCharsetUTF8 = "<?xml version=\"1.0\" encoding=\"" + charsetUtf8 + "\"?>\n" +
                                         "<root>\n" +
                                         " node\n" +
                                         "</root>";
@@ -372,7 +372,7 @@ public void testMetaCharsetUpdateXmlDisabled() {
     public void testMetaCharsetUpdateXmlDisabledNoChanges() {
         final Document doc = createXmlDocument("dontTouch", "dontTouch", true);
 
-        final String xmlCharset = "<?xml version=\"dontTouch\" encoding=\"dontTouch\">\n" +
+        final String xmlCharset = "<?xml version=\"dontTouch\" encoding=\"dontTouch\"?>\n" +
                                     "<root>\n" +
                                     " node\n" +
                                     "</root>";

diff --git a/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java b/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java
@@ -6,6 +6,7 @@
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Node;
 import org.jsoup.nodes.TextNode;
+import org.jsoup.nodes.XmlDeclaration;
 import org.junit.Ignore;
 import org.junit.Test;
 
@@ -17,7 +18,8 @@
 import java.util.List;
 
 import static org.jsoup.nodes.Document.OutputSettings.Syntax;
-import static org.junit.Assert.*;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
 
 /**
  * Tests XmlTreeBuilder.
@@ -103,7 +105,7 @@ public void testDoesNotForceSelfClosingKnownTags() {
     @Test public void handlesXmlDeclarationAsDeclaration() {
         String html = "<?xml encoding='UTF-8' ?><body>One</body><!-- comment -->";
         Document doc = Jsoup.parse(html, "", Parser.xmlParser());
-        assertEquals("<?xml encoding='UTF-8' ?> <body> One </body> <!-- comment -->",
+        assertEquals("<?xml encoding=\"UTF-8\"?> <body> One </body> <!-- comment -->",
                 StringUtil.normaliseWhitespace(doc.outerHtml()));
         assertEquals("#declaration", doc.childNode(0).nodeName());
         assertEquals("#comment", doc.childNode(2).nodeName());
@@ -130,4 +132,26 @@ public void testDoesHandleEOFInTag() {
         Document xmlDoc = Jsoup.parse(html, "", Parser.xmlParser());
         assertEquals("<img src=\"asdf\" onerror=\"alert(1)\" x=\"\" />", xmlDoc.html());
     }
+
+    @Test
+    public void testDetectCharsetEncodingDeclaration() throws IOException, URISyntaxException {
+        File xmlFile = new File(XmlTreeBuilder.class.getResource("/htmltests/xml-charset.xml").toURI());
+        InputStream inStream = new FileInputStream(xmlFile);
+        Document doc = Jsoup.parse(inStream, null, "http://example.com/", Parser.xmlParser());
+        assertEquals("ISO-8859-1", doc.charset().name());
+        assertEquals("<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?> <data>äöåéü</data>",
+            TextUtil.stripNewlines(doc.html()));
+    }
+
+    @Test
+    public void testParseDeclarationAttributes() {
+        String xml = "<?xml version='1' encoding='UTF-8' something='else'?><val>One</val>";
+        Document doc = Jsoup.parse(xml, "", Parser.xmlParser());
+        XmlDeclaration decl = (XmlDeclaration) doc.childNode(0);
+        assertEquals("1", decl.attr("version"));
+        assertEquals("UTF-8", decl.attr("encoding"));
+        assertEquals("else", decl.attr("something"));
+        assertEquals("version=\"1\" encoding=\"UTF-8\" something=\"else\"", decl.getWholeDeclaration());
+        assertEquals("<?xml version=\"1\" encoding=\"UTF-8\" something=\"else\"?>", decl.outerHtml());
+    }
 }
diff --git a/src/test/resources/htmltests/xml-charset.xml b/src/test/resources/htmltests/xml-charset.xml
@@ -0,0 +1,2 @@
+<?xml version="1.0" encoding="ISO-8859-1"?>
+<data>äöåéü</data>
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		<?xml version="1.0" encoding="ISO-8859-1"?>
		<data>äöåéü</data>