17
17
18
18
package edu .uci .ics .crawler4j .parser ;
19
19
20
+ import java .io .ByteArrayInputStream ;
21
+ import java .io .ByteArrayOutputStream ;
22
+ import java .io .InputStream ;
23
+ import java .io .OutputStream ;
24
+ import java .io .PrintStream ;
25
+ import java .io .UnsupportedEncodingException ;
26
+
27
+ import javax .xml .transform .OutputKeys ;
28
+ import javax .xml .transform .Transformer ;
29
+ import javax .xml .transform .TransformerConfigurationException ;
30
+ import javax .xml .transform .sax .SAXTransformerFactory ;
31
+ import javax .xml .transform .sax .TransformerHandler ;
32
+ import javax .xml .transform .stream .StreamResult ;
33
+
34
+ import org .apache .tika .metadata .Metadata ;
35
+ import org .apache .tika .parser .AutoDetectParser ;
36
+ import org .apache .tika .parser .ParseContext ;
37
+ import org .apache .tika .parser .Parser ;
38
+ import org .slf4j .Logger ;
39
+ import org .slf4j .LoggerFactory ;
40
+
20
41
public class BinaryParseData implements ParseData {
21
42
22
- private static BinaryParseData instance = new BinaryParseData ();
23
-
24
- public static BinaryParseData getInstance () {
25
- return instance ;
26
- }
27
-
28
- @ Override
29
- public String toString () {
30
- return "[Binary parse data can not be dumped as string]" ;
31
- }
32
- }
43
+ private static final Logger logger = LoggerFactory .getLogger (BinaryParseData .class );
44
+ private static final String DEFAULT_ENCODING = "UTF-8" ;
45
+ private static final String DEFAULT_OUTPUT_FORMAT = "html" ;
46
+
47
+ private static final Metadata METADATA = new Metadata ();
48
+ private static final Parser AUTO_DETECT_PARSER = new AutoDetectParser ();
49
+ private static final SAXTransformerFactory SAX_TRANSFORMER_FACTORY = (SAXTransformerFactory ) SAXTransformerFactory .newInstance ();
50
+
51
+ private final ParseContext context = new ParseContext ();
52
+ private String html = null ;
53
+
54
+ public BinaryParseData () {
55
+ context .set (Parser .class , AUTO_DETECT_PARSER );
56
+ }
57
+
58
+ public void setBinaryContent (byte [] data ) {
59
+ InputStream inputStream = new ByteArrayInputStream (data );
60
+ ByteArrayOutputStream outputStream = new ByteArrayOutputStream ();
61
+
62
+ try {
63
+ TransformerHandler handler = getTransformerHandler (outputStream , DEFAULT_OUTPUT_FORMAT , DEFAULT_ENCODING );
64
+ AUTO_DETECT_PARSER .parse (inputStream , handler , METADATA , context );
65
+
66
+ setHtml (new String (outputStream .toByteArray (), DEFAULT_ENCODING ));
67
+ } catch (TransformerConfigurationException e ) {
68
+ logger .error ("Error configuring handler" , e );
69
+ } catch (UnsupportedEncodingException e ) {
70
+ logger .error ("Encoding for content not supported" , e );
71
+ } catch (Exception e ) {
72
+ logger .error ("Error parsing file" , e );
73
+ }
74
+ }
75
+
76
+ /**
77
+ * Returns a transformer handler that serializes incoming SAX events to
78
+ * XHTML or HTML (depending the given method) using the given output encoding.
79
+ *
80
+ * @param encoding output encoding, or <code>null</code> for the platform default
81
+ */
82
+ private static TransformerHandler getTransformerHandler (OutputStream out , String method , String encoding )
83
+ throws TransformerConfigurationException {
84
+
85
+ TransformerHandler transformerHandler = SAX_TRANSFORMER_FACTORY .newTransformerHandler ();
86
+ Transformer transformer = transformerHandler .getTransformer ();
87
+ transformer .setOutputProperty (OutputKeys .METHOD , method );
88
+ transformer .setOutputProperty (OutputKeys .INDENT , "yes" );
89
+
90
+ if (encoding != null ) {
91
+ transformer .setOutputProperty (OutputKeys .ENCODING , encoding );
92
+ }
93
+
94
+ transformerHandler .setResult (new StreamResult (new PrintStream (out )));
95
+ return transformerHandler ;
96
+ }
97
+
98
+ /** @return Parsed binary content or null */
99
+ public String getHtml () {
100
+ return html ;
101
+ }
102
+
103
+ public void setHtml (String html ) {
104
+ this .html = html ;
105
+ }
106
+
107
+ @ Override
108
+ public String toString () {
109
+ if (html == null || html .isEmpty ()) {
110
+ return "No data parsed yet" ;
111
+ } else {
112
+ return getHtml ();
113
+ }
114
+ }
115
+ }
0 commit comments