From d6e49e6d8c17a9ba909171c9bd1982a1496dc687 Mon Sep 17 00:00:00 2001 From: John Bessire Date: Thu, 25 Apr 2013 16:38:05 -0700 Subject: [PATCH 1/7] Make sure there are no interloping unescaped server tags or server comment tags in file StartTagTypeUnregistered.java --- .../StartTagTypeServerCommonEscaped.java | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/java/net/htmlparser/jericho/StartTagTypeServerCommonEscaped.java b/src/java/net/htmlparser/jericho/StartTagTypeServerCommonEscaped.java index a9b85b7..727f78b 100644 --- a/src/java/net/htmlparser/jericho/StartTagTypeServerCommonEscaped.java +++ b/src/java/net/htmlparser/jericho/StartTagTypeServerCommonEscaped.java @@ -26,4 +26,27 @@ final class StartTagTypeServerCommonEscaped extends StartTagTypeGenericImplement private StartTagTypeServerCommonEscaped() { super("escaped common server tag","<\\%","%>",null,true); } + + protected int getEnd(final Source source, int pos) { + // Make sure there are no interloping unescaped server tags or server comment tags + Tag nextServerCommonTag=source.getNextTag(pos,StartTagTypeServerCommon.INSTANCE); + Tag nextServerCommonCommentTag=source.getNextTag(pos,StartTagTypeServerCommonComment.INSTANCE); + while (true) { + int potentialEnd=super.getEnd(source,pos); + if (potentialEnd==-1) return -1; + do { + int skipToPos=pos; + if (nextServerCommonTag!=null && nextServerCommonTag.getEnd()<=potentialEnd) { + skipToPos=nextServerCommonTag.getEnd()+1; + } + if (nextServerCommonCommentTag!=null && nextServerCommonCommentTag.getEnd()<=potentialEnd) { + skipToPos=Math.max(skipToPos,nextServerCommonCommentTag.getEnd()+1); + } + if (skipToPos==pos) return potentialEnd; + pos=skipToPos; + if (nextServerCommonTag!=null && nextServerCommonTag.getEnd()<=pos) nextServerCommonTag=source.getNextTag(pos,StartTagTypeServerCommon.INSTANCE); + if (nextServerCommonCommentTag!=null && nextServerCommonCommentTag.getEnd()<=pos) 
nextServerCommonCommentTag=source.getNextTag(pos,StartTagTypeServerCommonComment.INSTANCE); + } while (pos Date: Thu, 25 Apr 2013 16:53:42 -0700 Subject: [PATCH 2/7] Minor change to comment line 28 for getRowColumnVector on file RowColumnVector.java --- src/java/net/htmlparser/jericho/RowColumnVector.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/java/net/htmlparser/jericho/RowColumnVector.java b/src/java/net/htmlparser/jericho/RowColumnVector.java index 3b47e82..27c4e64 100644 --- a/src/java/net/htmlparser/jericho/RowColumnVector.java +++ b/src/java/net/htmlparser/jericho/RowColumnVector.java @@ -25,7 +25,7 @@ /** * Represents the row and column number of a character position in the source document. *

- * Obtained using the {@link Source#getRowColumnVector(int pos)} method. + * Obtained using the {@link Source#getRowColumnVector(int pos)} or {@link Segment#getRowColumnVector()} method. */ public final class RowColumnVector { private final int row; From bce8807604b7cc9e565990c319d7134f647b15d1 Mon Sep 17 00:00:00 2001 From: John Bessire Date: Tue, 30 Apr 2013 14:38:50 -0700 Subject: [PATCH 3/7] Added file ScriptTest.java --- .../net/htmlparser/jericho/ScriptTest.java | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100755 test/src/net/htmlparser/jericho/ScriptTest.java diff --git a/test/src/net/htmlparser/jericho/ScriptTest.java b/test/src/net/htmlparser/jericho/ScriptTest.java new file mode 100755 index 0000000..911df1f --- /dev/null +++ b/test/src/net/htmlparser/jericho/ScriptTest.java @@ -0,0 +1,71 @@ +package net.htmlparser.jericho; + +import org.junit.Test; +import static org.junit.Assert.*; + +import java.io.*; +import java.net.*; +import java.util.*; +import java.nio.CharBuffer; + +public class ScriptTest { + private static final String sourceUrlString="file:test/data/ScriptTest.html"; + + @Test public void testFullSequentialParse() throws Exception { + Source source=new Source(new URL(sourceUrlString)); + source.fullSequentialParse(); + List scriptElements=source.getAllElements(HTMLElementName.SCRIPT); + assertEquals(4,scriptElements.size()); + List scriptContentStartTags; + StartTag scriptContentStartTag; + + scriptContentStartTags=scriptElements.get(0).getContent().getAllStartTags(); + assertEquals(0,scriptContentStartTags.size()); + + scriptContentStartTags=scriptElements.get(1).getContent().getAllStartTags(); + assertEquals(0,scriptContentStartTags.size()); + + scriptContentStartTags=scriptElements.get(2).getContent().getAllStartTags(); + assertEquals(0,scriptContentStartTags.size()); + + scriptContentStartTags=scriptElements.get(3).getContent().getAllStartTags(); + assertEquals(2,scriptContentStartTags.size()); + 
scriptContentStartTag=scriptContentStartTags.get(0); + assertEquals(HTMLElementName.P,scriptContentStartTag.getName()); + scriptContentStartTag=scriptContentStartTags.get(1); + assertEquals(HTMLElementName.P,scriptContentStartTag.getName()); + } + + @Test public void testParseOnDemand() throws Exception { + Source source=new Source(new URL(sourceUrlString)); + List scriptElements=source.getAllElements(HTMLElementName.SCRIPT); + assertEquals(4,scriptElements.size()); + List scriptContentStartTags; + StartTag scriptContentStartTag; + + scriptContentStartTags=scriptElements.get(0).getContent().getAllStartTags(); + assertEquals(1,scriptContentStartTags.size()); + scriptContentStartTag=scriptContentStartTags.get(0); + assertEquals(HTMLElementName.P,scriptContentStartTag.getName()); + + scriptContentStartTags=scriptElements.get(1).getContent().getAllStartTags(); + assertEquals(1,scriptContentStartTags.size()); + scriptContentStartTag=scriptContentStartTags.get(0); + assertEquals(StartTagType.CDATA_SECTION,scriptContentStartTag.getTagType()); + + scriptContentStartTags=scriptElements.get(2).getContent().getAllStartTags(); + assertEquals(2,scriptContentStartTags.size()); + scriptContentStartTag=scriptContentStartTags.get(0); + assertEquals(StartTagType.COMMENT,scriptContentStartTag.getTagType()); + scriptContentStartTag=scriptContentStartTags.get(1); + assertEquals(HTMLElementName.P,scriptContentStartTag.getName()); + + scriptContentStartTags=scriptElements.get(3).getContent().getAllStartTags(); + assertEquals(2,scriptContentStartTags.size()); + scriptContentStartTag=scriptContentStartTags.get(0); + assertEquals(StartTagType.COMMENT,scriptContentStartTag.getTagType()); + scriptContentStartTag=scriptContentStartTags.get(1); + assertEquals(HTMLElementName.P,scriptContentStartTag.getName()); + assertEquals("<p>This paragraph is recognised in both modes.</p>",scriptContentStartTag.getElement().toString()); + } +} From 0bbc6674056ae7f6288ee8b3641d9434a9948ce0 Mon Sep 17 00:00:00 2001 From: John Bessire Date: Tue, 30 Apr 2013 15:22:34 -0700 Subject: [PATCH 4/7] Added line private static final long serialVersionUID --- src/java/net/htmlparser/jericho/EndTag.java | 15 ++++++++++++--- .../htmlparser/jericho/HTMLElementNameSet.java | 2 ++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/java/net/htmlparser/jericho/EndTag.java b/src/java/net/htmlparser/jericho/EndTag.java index 1e5552f..763f790 100644 --- a/src/java/net/htmlparser/jericho/EndTag.java +++ b/src/java/net/htmlparser/jericho/EndTag.java @@ -128,13 +128,22 @@ public boolean isUnregistered() { /** * Returns an XML representation of this end tag. *

- * This method is included for symmetry with the {@link StartTag#tidy()} method and simply - * returns the {@linkplain Segment#toString() source text} of the tag. + * The tidying of the tag is carried out as follows: + *

    + *
  • if this end tag is a {@link EndTagType#NORMAL NORMAL} end tag then any {@linkplain CharacterReference#isWhiteSpace(char) white space} before the closing angle bracket is removed. + *
  • otherwise the original {@linkplain Segment#toString() source text} of the entire tag is returned. + *
* * @return an XML representation of this end tag. + * @see StartTag#tidy() */ public String tidy() { - return toString(); + final String string=toString(); + if (endTagType!=EndTagType.NORMAL) return string; + if (!CharacterReference.isWhiteSpace(string.charAt(string.length()-2))) return string; + int i=string.length()-3; + while (i>0 && CharacterReference.isWhiteSpace(string.charAt(i))) i--; + return string.substring(0,i+1)+'>'; } /** diff --git a/src/java/net/htmlparser/jericho/HTMLElementNameSet.java b/src/java/net/htmlparser/jericho/HTMLElementNameSet.java index e33c410..e44780f 100644 --- a/src/java/net/htmlparser/jericho/HTMLElementNameSet.java +++ b/src/java/net/htmlparser/jericho/HTMLElementNameSet.java @@ -23,6 +23,8 @@ import java.util.*; final class HTMLElementNameSet extends HashSet { + private static final long serialVersionUID=1L; + public HTMLElementNameSet() { super(1); } From 80f5457a041e61ff83934cb331d3be7f8c5a79e4 Mon Sep 17 00:00:00 2001 From: John Bessire Date: Tue, 30 Apr 2013 15:28:08 -0700 Subject: [PATCH 5/7] Add deprecated --- src/java/net/htmlparser/jericho/MicrosoftTagTypes.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/java/net/htmlparser/jericho/MicrosoftTagTypes.java b/src/java/net/htmlparser/jericho/MicrosoftTagTypes.java index 9240c83..2dcf7b8 100644 --- a/src/java/net/htmlparser/jericho/MicrosoftTagTypes.java +++ b/src/java/net/htmlparser/jericho/MicrosoftTagTypes.java @@ -68,6 +68,7 @@ public final class MicrosoftTagTypes { * * @deprecated Use {@link MicrosoftConditionalCommentTagTypes#DOWNLEVEL_REVEALED_IF} and {@link MicrosoftConditionalCommentTagTypes#DOWNLEVEL_REVEALED_ENDIF} instead. 
*/ + @Deprecated public static final StartTagType DOWNLEVEL_REVEALED_CONDITIONAL_COMMENT=StartTagTypeMicrosoftDownlevelRevealedConditionalComment.INSTANCE; private static final TagType[] TAG_TYPES={ From c74d9a36fd5c4ae7767fde7ea8229a3854a0f96d Mon Sep 17 00:00:00 2001 From: John Bessire Date: Tue, 30 Apr 2013 15:48:46 -0700 Subject: [PATCH 6/7] Segment for output document --- .../htmlparser/jericho/OutputDocument.java | 31 +++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/src/java/net/htmlparser/jericho/OutputDocument.java b/src/java/net/htmlparser/jericho/OutputDocument.java index 8d629c9..9423400 100644 --- a/src/java/net/htmlparser/jericho/OutputDocument.java +++ b/src/java/net/htmlparser/jericho/OutputDocument.java @@ -81,6 +81,7 @@ public final class OutputDocument implements CharStreamSource { private CharSequence sourceText; private ArrayList outputSegments=new ArrayList(); + private final Segment segment; /** * Constructs a new output document based on the specified source document. @@ -88,6 +89,7 @@ public final class OutputDocument implements CharStreamSource { */ public OutputDocument(final Source source) { if (source==null) throw new IllegalArgumentException("source argument must not be null"); + this.segment=source; this.sourceText=source; } @@ -97,14 +99,27 @@ public OutputDocument(final Source source) { */ public OutputDocument(final Segment segment) { if (segment==null) throw new IllegalArgumentException("segment argument must not be null"); + this.segment=segment; Source source=segment.source; this.sourceText=source; - if (segment.begin>0) remove(new Segment(source,0,segment.begin)); - if (segment.end<source.end) remove(new Segment(source,segment.end,source.end)); + if (segment.begin>0) remove(0,segment.begin); + if (segment.end<source.end) remove(segment.end,source.end); + } + + /** + * Returns the original segment upon which this output document is based. + * <p> + * If a {@link Source} was used to construct the output document, this returns the {@link Source} object. + * + * @return the original segment upon which this output document is based. 
+ */ + public Segment getSegment() { + return segment; } /** @@ -118,6 +133,18 @@ public CharSequence getSourceText() { return sourceText; } + /** + * Removes the specified segment of this output document. + *

+ * This is equivalent to {@link #replace(int,int,CharSequence) replace}(begin,end,null). + * + * @param begin the character position at which to begin the removal. + * @param end the character position at which to end the removal. + */ + public void remove(final int begin, final int end) { + register(new RemoveOutputSegment(begin,end)); + } + /** * Removes the specified {@linkplain Segment segment} from this output document. *

From d6143b7c3f04b9d161070078f9295a1bf6721064 Mon Sep 17 00:00:00 2001 From: John Bessire Date: Tue, 30 Apr 2013 16:17:43 -0700 Subject: [PATCH 7/7] Changed for checking HR line length and default line length --- src/java/net/htmlparser/.DS_Store | Bin 0 -> 6148 bytes src/java/net/htmlparser/jericho/Renderer.java | 2643 +++++++++-------- 2 files changed, 1357 insertions(+), 1286 deletions(-) create mode 100644 src/java/net/htmlparser/.DS_Store mode change 100644 => 100755 src/java/net/htmlparser/jericho/Renderer.java diff --git a/src/java/net/htmlparser/.DS_Store b/src/java/net/htmlparser/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 - * This provides a human readable version of the segment content that is modelled on the way - * Mozilla Thunderbird and other email clients provide an automatic conversion of - * HTML content to text in their alternative MIME encoding of emails. - *

- * The output using default settings complies with the "text/plain; format=flowed" (DelSp=No) protocol described in - * RFC3676. - *

- * Many properties are available to customise the output, possibly the most significant of which being {@link #setMaxLineLength(int) MaxLineLength}. - * See the individual property descriptions for details. - *

- * Use one of the following methods to obtain the output: - *

    - *
  • {@link #writeTo(Writer)}
  • - *
  • {@link #appendTo(Appendable)}
  • - *
  • {@link #toString()}
  • - *
  • {@link CharStreamSourceUtil#getReader(CharStreamSource) CharStreamSourceUtil.getReader(this)}
  • - *
- *

- * The rendering of some constructs, especially tables, is very rudimentary. - * No attempt is made to render nested tables properly, except to ensure that all of the text content is included in the output. - *

- * Rendering an entire {@link Source} object performs a {@linkplain Source#fullSequentialParse() full sequential parse} automatically. - *

- * Any aspect of the algorithm not specifically mentioned here is subject to change without notice in future versions. - *

- * To extract pure text without any rendering of the markup, use the {@link TextExtractor} class instead. - */ -public class Renderer implements CharStreamSource { - private final Segment rootSegment; - private int maxLineLength=76; - private String newLine="\r\n"; - private boolean includeHyperlinkURLs=true; - private boolean includeAlternateText=true; - private boolean decorateFontStyles=false; - private boolean convertNonBreakingSpaces=Config.ConvertNonBreakingSpaces; - private int blockIndentSize=4; - private int listIndentSize=6; - private char[] listBullets=new char[] {'*','o','+','#'}; - private boolean includeFirstElementTopMargin=false; - private String tableCellSeparator=" \t"; - - private static final int UNORDERED_LIST=-1; - - private static Map ELEMENT_HANDLERS=new HashMap(); - static { - ELEMENT_HANDLERS.put(HTMLElementName.A,A_ElementHandler.INSTANCE); - ELEMENT_HANDLERS.put(HTMLElementName.ADDRESS,StandardBlockElementHandler.INSTANCE_0_0); - ELEMENT_HANDLERS.put(HTMLElementName.APPLET,AlternateTextElementHandler.INSTANCE); - ELEMENT_HANDLERS.put(HTMLElementName.B,FontStyleElementHandler.INSTANCE_B); - ELEMENT_HANDLERS.put(HTMLElementName.BLOCKQUOTE,StandardBlockElementHandler.INSTANCE_1_1_INDENT); - ELEMENT_HANDLERS.put(HTMLElementName.BR,BR_ElementHandler.INSTANCE); - ELEMENT_HANDLERS.put(HTMLElementName.BUTTON,RemoveElementHandler.INSTANCE); - ELEMENT_HANDLERS.put(HTMLElementName.CAPTION,StandardBlockElementHandler.INSTANCE_0_0); - ELEMENT_HANDLERS.put(HTMLElementName.CENTER,StandardBlockElementHandler.INSTANCE_1_1); - ELEMENT_HANDLERS.put(HTMLElementName.CODE,FontStyleElementHandler.INSTANCE_CODE); - ELEMENT_HANDLERS.put(HTMLElementName.DD,StandardBlockElementHandler.INSTANCE_0_0_INDENT); - ELEMENT_HANDLERS.put(HTMLElementName.DIR,ListElementHandler.INSTANCE_UL); - ELEMENT_HANDLERS.put(HTMLElementName.DIV,StandardBlockElementHandler.INSTANCE_0_0); - ELEMENT_HANDLERS.put(HTMLElementName.DT,StandardBlockElementHandler.INSTANCE_0_0); - 
ELEMENT_HANDLERS.put(HTMLElementName.EM,FontStyleElementHandler.INSTANCE_I); - ELEMENT_HANDLERS.put(HTMLElementName.FIELDSET,StandardBlockElementHandler.INSTANCE_1_1); - ELEMENT_HANDLERS.put(HTMLElementName.FORM,StandardBlockElementHandler.INSTANCE_1_1); - ELEMENT_HANDLERS.put(HTMLElementName.H1,StandardBlockElementHandler.INSTANCE_2_1); - ELEMENT_HANDLERS.put(HTMLElementName.H2,StandardBlockElementHandler.INSTANCE_2_1); - ELEMENT_HANDLERS.put(HTMLElementName.H3,StandardBlockElementHandler.INSTANCE_2_1); - ELEMENT_HANDLERS.put(HTMLElementName.H4,StandardBlockElementHandler.INSTANCE_2_1); - ELEMENT_HANDLERS.put(HTMLElementName.H5,StandardBlockElementHandler.INSTANCE_2_1); - ELEMENT_HANDLERS.put(HTMLElementName.H6,StandardBlockElementHandler.INSTANCE_2_1); - ELEMENT_HANDLERS.put(HTMLElementName.HEAD,RemoveElementHandler.INSTANCE); - ELEMENT_HANDLERS.put(HTMLElementName.HR,HR_ElementHandler.INSTANCE); - ELEMENT_HANDLERS.put(HTMLElementName.I,FontStyleElementHandler.INSTANCE_I); - ELEMENT_HANDLERS.put(HTMLElementName.IMG,AlternateTextElementHandler.INSTANCE); - ELEMENT_HANDLERS.put(HTMLElementName.INPUT,AlternateTextElementHandler.INSTANCE); - ELEMENT_HANDLERS.put(HTMLElementName.LEGEND,StandardBlockElementHandler.INSTANCE_0_0); - ELEMENT_HANDLERS.put(HTMLElementName.LI,LI_ElementHandler.INSTANCE); - ELEMENT_HANDLERS.put(HTMLElementName.MENU,ListElementHandler.INSTANCE_UL); - ELEMENT_HANDLERS.put(HTMLElementName.MAP,RemoveElementHandler.INSTANCE); - ELEMENT_HANDLERS.put(HTMLElementName.NOFRAMES,RemoveElementHandler.INSTANCE); - ELEMENT_HANDLERS.put(HTMLElementName.NOSCRIPT,RemoveElementHandler.INSTANCE); - ELEMENT_HANDLERS.put(HTMLElementName.OL,ListElementHandler.INSTANCE_OL); - ELEMENT_HANDLERS.put(HTMLElementName.P,StandardBlockElementHandler.INSTANCE_1_1); - ELEMENT_HANDLERS.put(HTMLElementName.PRE,PRE_ElementHandler.INSTANCE); - ELEMENT_HANDLERS.put(HTMLElementName.SCRIPT,RemoveElementHandler.INSTANCE); - 
ELEMENT_HANDLERS.put(HTMLElementName.SELECT,RemoveElementHandler.INSTANCE); - ELEMENT_HANDLERS.put(HTMLElementName.STRONG,FontStyleElementHandler.INSTANCE_B); - ELEMENT_HANDLERS.put(HTMLElementName.STYLE,RemoveElementHandler.INSTANCE); - ELEMENT_HANDLERS.put(HTMLElementName.TEXTAREA,RemoveElementHandler.INSTANCE); - ELEMENT_HANDLERS.put(HTMLElementName.TD,TD_ElementHandler.INSTANCE); - ELEMENT_HANDLERS.put(HTMLElementName.TH,TD_ElementHandler.INSTANCE); - ELEMENT_HANDLERS.put(HTMLElementName.TR,StandardBlockElementHandler.INSTANCE_0_0); - ELEMENT_HANDLERS.put(HTMLElementName.U,FontStyleElementHandler.INSTANCE_U); - ELEMENT_HANDLERS.put(HTMLElementName.UL,ListElementHandler.INSTANCE_UL); - } - - /** - * Constructs a new Renderer based on the specified {@link Segment}. - * @param segment the segment containing the HTML to be rendered. - * @see Segment#getRenderer() - */ - public Renderer(final Segment segment) { - rootSegment=segment; - } - - // Documentation inherited from CharStreamSource - public void writeTo(final Writer writer) throws IOException { - appendTo(writer); - writer.flush(); - } - - // Documentation inherited from CharStreamSource - public void appendTo(final Appendable appendable) throws IOException { - new Processor(this,rootSegment,getMaxLineLength(),getNewLine(),getIncludeHyperlinkURLs(),getIncludeAlternateText(),getDecorateFontStyles(),getConvertNonBreakingSpaces(),getBlockIndentSize(),getListIndentSize(),getListBullets(),getTableCellSeparator()).appendTo(appendable); - } - - // Documentation inherited from CharStreamSource - public long getEstimatedMaximumOutputLength() { - return rootSegment.length(); - } - - // Documentation inherited from CharStreamSource - public String toString() { - return CharStreamSourceUtil.toString(this); - } - - /** - * Sets the column at which lines are to be wrapped. - *

- * Lines that would otherwise exceed this length are wrapped onto a new line at a word boundary. - *

- * A Line may still exceed this length if it consists of a single word, where the length of the word plus the line indent exceeds the maximum length. - * In this case the line is wrapped immediately after the end of the word. - *

- * The default value is 76, which reflects the maximum line length for sending - * email data specified in RFC2049 section 3.5. - * - * @param maxLineLength the column at which lines are to be wrapped. - * @return this Renderer instance, allowing multiple property setting methods to be chained in a single statement. - * @see #getMaxLineLength() - */ - public Renderer setMaxLineLength(final int maxLineLength) { - this.maxLineLength=maxLineLength; - return this; - } - - /** - * Returns the column at which lines are to be wrapped. - *

- * See the {@link #setMaxLineLength(int)} method for a full description of this property. - * - * @return the column at which lines are to be wrapped. - */ - public int getMaxLineLength() { - return maxLineLength; - } - - /** - * Sets the string to be used to represent a newline in the output. - *

- * The default value is "\r\n" (CR+LF) regardless of the platform on which the library is running. - * This is so that the default configuration produces valid - * MIME plain/text output, which mandates the use of CR+LF for line breaks. - *

- * Specifying a null argument causes the output to use same new line string as is used in the source document, which is - * determined via the {@link Source#getNewLine()} method. - * If the source document does not contain any new lines, a "best guess" is made by either taking the new line string of a previously parsed document, - * or using the value from the static {@link Config#NewLine} property. - * - * @param newLine the string to be used to represent a newline in the output, may be null. - * @return this Renderer instance, allowing multiple property setting methods to be chained in a single statement. - * @see #getNewLine() - */ - public Renderer setNewLine(final String newLine) { - this.newLine=newLine; - return this; - } - - /** - * Returns the string to be used to represent a newline in the output. - *

- * See the {@link #setNewLine(String)} method for a full description of this property. - * - * @return the string to be used to represent a newline in the output. - */ - public String getNewLine() { - if (newLine==null) newLine=rootSegment.source.getBestGuessNewLine(); - return newLine; - } - - /** - * Sets whether hyperlink URLs are included in the output. - *

- * The default value is true. - *

- * When this property is true, the URL of each hyperlink is included in the output as determined by the implementation of the - * {@link #renderHyperlinkURL(StartTag)} method. - *

- *

- *
Example:
- *
- *

- * Assuming the default implementation of {@link #renderHyperlinkURL(StartTag)}, when this property is true, the following HTML: - *

- * <a href="http://jericho.htmlparser.net/">Jericho HTML Parser</a> - *
- * produces the following output: - *
- * Jericho HTML Parser <http://jericho.htmlparser.net/> - *
- *
- *
- * - * @param includeHyperlinkURLs specifies whether hyperlink URLs are included in the output. - * @return this Renderer instance, allowing multiple property setting methods to be chained in a single statement. - * @see #getIncludeHyperlinkURLs() - */ - public Renderer setIncludeHyperlinkURLs(final boolean includeHyperlinkURLs) { - this.includeHyperlinkURLs=includeHyperlinkURLs; - return this; - } - - /** - * Indicates whether hyperlink URLs are included in the output. - *

- * See the {@link #setIncludeHyperlinkURLs(boolean)} method for a full description of this property. - * - * @return true if hyperlink URLs are included in the output, otherwise false. - */ - public boolean getIncludeHyperlinkURLs() { - return includeHyperlinkURLs; - } - - /** - * Renders the hyperlink URL from the specified {@link StartTag}. - *

- * A return value of null indicates that the hyperlink URL should not be rendered at all. - *

- * The default implementation of this method returns null if the href attribute of the specified start tag - * is '#', starts with "javascript:", or is missing. - * In all other cases it returns the value of the href attribute enclosed in angle brackets. - *

- * See the documentation of the {@link #setIncludeHyperlinkURLs(boolean)} method for an example of how a hyperlink is rendered by the default implementation. - *

- * This method can be overridden in a subclass to customise the rendering of hyperlink URLs. - *

- * Rendering of hyperlink URLs can be disabled completely without overriding this method by setting the - * {@link #setIncludeHyperlinkURLs(boolean) IncludeHyperlinkURLs} property to false. - *

- *

- *
Example:
- *
- * To render hyperlink URLs without the enclosing angle brackets:

- * - * Renderer renderer=new Renderer(segment) {
- *     public String renderHyperlinkURL(StartTag startTag) {
- *         String href=startTag.getAttributeValue("href");
- *         if (href==null || href.equals("#") || href.startsWith("javascript:")) return null;
- *         return href;
- *     }
- * };
- * String renderedSegment=renderer.toString(); - *
- *
- *
- * @param startTag the start tag of the hyperlink element, must not be null. - * @return The rendered hyperlink URL from the specified {@link StartTag}, or null if the hyperlink URL should not be rendered. - */ - public String renderHyperlinkURL(final StartTag startTag) { - final String href=startTag.getAttributeValue("href"); - if (href==null || href.equals("#") || href.startsWith("javascript:")) return null; - return '<'+href+'>'; - } - - /** - * Sets whether the alternate text of a tag that has an alt attribute is included in the output. - *

- * The default value is true. - * Note that this is not conistent with common email clients such as Mozilla Thunderbird which do not render alternate text at all, - * even when a tag specifies alternate text. - *

- * When this property is true, the alternate text is included in the output as determined by the implementation of the - * {@link #renderAlternateText(StartTag)} method. - *

- *

- *
Example:
- *
- *

- * Assuming the default implementation of {@link #renderAlternateText(StartTag)}, when this property is true, the following HTML: - *

- * <img src="smiley.png" alt="smiley face" /> - *
- * produces the following output: - *
- * [smiley face] - *
- *
- *
- * - * @param includeAlternateText specifies whether the alternate text of a tag that has an alt attribute is included in the output. - * @return this Renderer instance, allowing multiple property setting methods to be chained in a single statement. - * @see #getIncludeAlternateText() - */ - public Renderer setIncludeAlternateText(final boolean includeAlternateText) { - this.includeAlternateText=includeAlternateText; - return this; - } - - /** - * Indicates whether the alternate text of a tag that has an alt attribute is included in the output. - *

- * See the {@link #setIncludeAlternateText(boolean)} method for a full description of this property. - * - * @return true if the alternate text of a tag that has an alt attribute is included in the output, otherwise false. - */ - public boolean getIncludeAlternateText() { - return includeAlternateText; - } - - /** - * Renders the alternate text of the specified start tag. - *

- * A return value of null indicates that the alternate text is not to be rendered at all. - *

- * The default implementation of this method returns null if the alt attribute of the specified start tag is missing or empty, or if the - * specified start tag is from an {@link HTMLElementName#AREA AREA} element. - * In all other cases it returns the value of the alt attribute enclosed in square brackets […]. - *

- * See the documentation of the {@link #setIncludeAlternateText(boolean)} method for an example of how alternate text is rendered by the default implementation. - *

- * This method can be overridden in a subclass to customise the rendering of alternate text. - *

- * Rendering of alternate text can be disabled completely without overriding this method by setting the - * {@link #setIncludeAlternateText(boolean) IncludeAlternateText} property to false. - *

- *

- *
Example:
- *
- * To render alternate text with double angle quotation marks instead of square brackets:

- * - * Renderer renderer=new Renderer(segment) {
- *     public String renderAlternateText(StartTag startTag) {
- *         if (startTag.getName()==HTMLElementName.AREA) return null; - *         String alt=startTag.getAttributeValue("alt");
- *         if (alt==null || alt.length()==0) return null;
- *         return '«'+alt+'»';
- *     }
- * };
- * String renderedSegment=renderer.toString(); - *
- *
- *
- * @param startTag the start tag containing an alt attribute, must not be null. - * @return The rendered alternate text, or null if the alternate text should not be rendered. - */ - public String renderAlternateText(final StartTag startTag) { - if (startTag.getName()==HTMLElementName.AREA) return null; - final String alt=startTag.getAttributeValue("alt"); - if (alt==null || alt.length()==0) return null; - return '['+alt+']'; - } - - /** - * Sets whether decoration characters are to be included around the content of some - * font style elements and - * phrase elements. - *

- * The default value is false. - *

- * Below is a table summarising the decorated elements. - *

- * - * - * - * - * - * - * - *
ElementsCharacterExample Output
{@link HTMLElementName#B B} and {@link HTMLElementName#STRONG STRONG}**bold text*
{@link HTMLElementName#I I} and {@link HTMLElementName#EM EM}//italic text/
{@link HTMLElementName#U U}__underlined text_
{@link HTMLElementName#CODE CODE}||code|
- * - * @param decorateFontStyles specifies whether decoration characters are to be included around the content of some font style elements. - * @return this Renderer instance, allowing multiple property setting methods to be chained in a single statement. - * @see #getDecorateFontStyles() - */ - public Renderer setDecorateFontStyles(final boolean decorateFontStyles) { - this.decorateFontStyles=decorateFontStyles; - return this; - } - - /** - * Indicates whether decoration characters are to be included around the content of some - * font style elements and - * phrase elements. - *

- * See the {@link #setDecorateFontStyles(boolean)} method for a full description of this property. - * - * @return true if decoration characters are to be included around the content of some font style elements, otherwise false. - */ - public boolean getDecorateFontStyles() { - return decorateFontStyles; - } - - /** - * Sets whether non-breaking space ({@link CharacterEntityReference#_nbsp &nbsp;}) character entity references are converted to spaces. - *

- * The default value is that of the static {@link Config#ConvertNonBreakingSpaces} property at the time the Renderer is instantiated. - * - * @param convertNonBreakingSpaces specifies whether non-breaking space ({@link CharacterEntityReference#_nbsp &nbsp;}) character entity references are converted to spaces. - * @return this Renderer instance, allowing multiple property setting methods to be chained in a single statement. - * @see #getConvertNonBreakingSpaces() - */ - public Renderer setConvertNonBreakingSpaces(boolean convertNonBreakingSpaces) { - this.convertNonBreakingSpaces=convertNonBreakingSpaces; - return this; - } - - /** - * Indicates whether non-breaking space ({@link CharacterEntityReference#_nbsp &nbsp;}) character entity references are converted to spaces. - *

- * See the {@link #setConvertNonBreakingSpaces(boolean)} method for a full description of this property. - * - * @return true if non-breaking space ({@link CharacterEntityReference#_nbsp &nbsp;}) character entity references are converted to spaces, otherwise false. - */ - public boolean getConvertNonBreakingSpaces() { - return convertNonBreakingSpaces; - } - - /** - * Sets the size of the indent to be used for anything other than {@link HTMLElementName#LI LI} elements. - *

- * At present this applies to {@link HTMLElementName#BLOCKQUOTE BLOCKQUOTE} and {@link HTMLElementName#DD DD} elements. - *

- * The default value is 4. - * - * @param blockIndentSize the size of the indent. - * @return this Renderer instance, allowing multiple property setting methods to be chained in a single statement. - * @see #getBlockIndentSize() - */ - public Renderer setBlockIndentSize(final int blockIndentSize) { - this.blockIndentSize=blockIndentSize; - return this; - } - - /** - * Returns the size of the indent to be used for anything other than {@link HTMLElementName#LI LI} elements. - *

- * See the {@link #setBlockIndentSize(int)} method for a full description of this property. - * - * @return the size of the indent to be used for anything other than {@link HTMLElementName#LI LI} elements. - */ - public int getBlockIndentSize() { - return blockIndentSize; - } - - /** - * Sets the size of the indent to be used for {@link HTMLElementName#LI LI} elements. - *

- * The default value is 6. - *

- * This applies to {@link HTMLElementName#LI LI} elements inside both {@link HTMLElementName#UL UL} and {@link HTMLElementName#OL OL} elements. - *

- * The bullet or number of the list item is included as part of the indent. - * - * @param listIndentSize the size of the indent. - * @return this Renderer instance, allowing multiple property setting methods to be chained in a single statement. - * @see #getListIndentSize() - */ - public Renderer setListIndentSize(final int listIndentSize) { - this.listIndentSize=listIndentSize; - return this; - } - - /** - * Returns the size of the indent to be used for {@link HTMLElementName#LI LI} elements. - *

- * See the {@link #setListIndentSize(int)} method for a full description of this property. - * - * @return the size of the indent to be used for {@link HTMLElementName#LI LI} elements. - */ - public int getListIndentSize() { - return listIndentSize; - } - - /** - * Sets the bullet characters to use for list items inside {@link HTMLElementName#UL UL} elements. - *

- * The values in the default array are *, o, + and #. - *

- * If the nesting of rendered lists goes deeper than the length of this array, the bullet characters start repeating from the first in the array. - *

- * WARNING: If any of the characters in the default array are modified, this will affect all other instances of this class using the default array. - * - * @param listBullets an array of characters to be used as bullets, must have at least one entry. - * @return this Renderer instance, allowing multiple property setting methods to be chained in a single statement. - * @see #getListBullets() - */ - public Renderer setListBullets(final char[] listBullets) { - if (listBullets==null || listBullets.length==0) throw new IllegalArgumentException("listBullets argument must be an array of at least one character"); - this.listBullets=listBullets; - return this; - } - - /** - * Returns the bullet characters to use for list items inside {@link HTMLElementName#UL UL} elements. - *

- * See the {@link #setListBullets(char[])} method for a full description of this property. - * - * @return the bullet characters to use for list items inside {@link HTMLElementName#UL UL} elements. - */ - public char[] getListBullets() { - return listBullets; - } - - /** - * Sets whether the top margin of the first element is rendered. - *

- * The default value is false. - *

- * If this property is set to true, then the source "<h1>Heading</h1>" would be rendered as "\r\n\r\nHeading", - * assuming all other default settings. - * If this property is false, then the same source would be rendered as "Heading". - *

- * Note that the bottom margin of the last element is never rendered. - * - * @param includeFirstElementTopMargin specifies whether the top margin of the first element is rendered. - * @return this Renderer instance, allowing multiple property setting methods to be chained in a single statement. - * @see #getIncludeFirstElementTopMargin() - */ - public Renderer setIncludeFirstElementTopMargin(final boolean includeFirstElementTopMargin) { - this.includeFirstElementTopMargin=includeFirstElementTopMargin; - return this; - } - - /** - * Indicates whether the top margin of the first element is rendered. - *

- * See the {@link #setIncludeFirstElementTopMargin(boolean)} method for a full description of this property. - * - * @return true if the top margin of the first element is rendered, otherwise false. - */ - public boolean getIncludeFirstElementTopMargin() { - return includeFirstElementTopMargin; - } - - /** - * Sets the string that is to separate table cells. - *

- * The default value is " \t" (a space followed by a tab). - * - * @param tableCellSeparator the string that is to separate table cells. - * @return this Renderer instance, allowing multiple property setting methods to be chained in a single statement. - * @see #getTableCellSeparator() - */ - public Renderer setTableCellSeparator(final String tableCellSeparator) { - this.tableCellSeparator=tableCellSeparator; - return this; - } - - /** - * Returns the string that is to separate table cells. - *

- * See the {@link #setTableCellSeparator(String)} method for a full description of this property. - * - * @return the string that is to separate table cells. - */ - public String getTableCellSeparator() { - return tableCellSeparator; - } - - /** - * Sets the default top margin of an HTML block element with the specified name. - *

- * The top margin is the number of blank lines that are to be inserted above the rendered block. - *

- * As this is a static method, the setting affects all instances of the Renderer class. - *

- * The htmlElementName argument must be one of the following:
- * {@link HTMLElementName#ADDRESS ADDRESS}, - * {@link HTMLElementName#BLOCKQUOTE BLOCKQUOTE}, - * {@link HTMLElementName#CAPTION CAPTION}, - * {@link HTMLElementName#CENTER CENTER}, - * {@link HTMLElementName#DD DD}, - * {@link HTMLElementName#DIR DIR}, - * {@link HTMLElementName#DIV DIV}, - * {@link HTMLElementName#DT DT}, - * {@link HTMLElementName#FIELDSET FIELDSET}, - * {@link HTMLElementName#FORM FORM}, - * {@link HTMLElementName#H1 H1}, - * {@link HTMLElementName#H2 H2}, - * {@link HTMLElementName#H3 H3}, - * {@link HTMLElementName#H4 H4}, - * {@link HTMLElementName#H5 H5}, - * {@link HTMLElementName#H6 H6}, - * {@link HTMLElementName#HR HR}, - * {@link HTMLElementName#LEGEND LEGEND}, - * {@link HTMLElementName#LI LI}, - * {@link HTMLElementName#MENU MENU}, - * {@link HTMLElementName#OL OL}, - * {@link HTMLElementName#P P}, - * {@link HTMLElementName#PRE PRE}, - * {@link HTMLElementName#TR TR}, - * {@link HTMLElementName#UL UL} - * - * @param htmlElementName (required) the case insensitive name of a supported HTML block element. - * @param topMargin the new top margin of the specified element. - * @throws UnsupportedOperationException if an unsupported element name is specified. - */ - public static void setDefaultTopMargin(String htmlElementName, final int topMargin) { - htmlElementName=HTMLElements.getConstantElementName(htmlElementName.toLowerCase()); - ELEMENT_HANDLERS.put(htmlElementName,getAbstractBlockElementHandler(htmlElementName).newTopMargin(topMargin)); - } - - /** - * Returns the default top margin of an HTML block element with the specified name. - *

- * See the {@link #setDefaultTopMargin(String htmlElementName, int topMargin)} method for a full description of this property. - * - * @param htmlElementName (required) the case insensitive name of a supported HTML block element. - * @return the default top margin of an HTML block element with the specified name. - * @throws UnsupportedOperationException if an unsupported element name is specified. - */ - public static int getDefaultTopMargin(final String htmlElementName) { - return getAbstractBlockElementHandler(htmlElementName.toLowerCase()).getTopMargin(); - } - - /** - * Sets the default bottom margin of an HTML block element with the specified name. - *

- * The bottom margin is the number of blank lines that are to be inserted below the rendered block. - *

- * As this is a static method, the setting affects all instances of the Renderer class. - *

- * The htmlElementName argument must be one of the following:
- * {@link HTMLElementName#ADDRESS ADDRESS}, - * {@link HTMLElementName#BLOCKQUOTE BLOCKQUOTE}, - * {@link HTMLElementName#CAPTION CAPTION}, - * {@link HTMLElementName#CENTER CENTER}, - * {@link HTMLElementName#DD DD}, - * {@link HTMLElementName#DIR DIR}, - * {@link HTMLElementName#DIV DIV}, - * {@link HTMLElementName#DT DT}, - * {@link HTMLElementName#FIELDSET FIELDSET}, - * {@link HTMLElementName#FORM FORM}, - * {@link HTMLElementName#H1 H1}, - * {@link HTMLElementName#H2 H2}, - * {@link HTMLElementName#H3 H3}, - * {@link HTMLElementName#H4 H4}, - * {@link HTMLElementName#H5 H5}, - * {@link HTMLElementName#H6 H6}, - * {@link HTMLElementName#HR HR}, - * {@link HTMLElementName#LEGEND LEGEND}, - * {@link HTMLElementName#LI LI}, - * {@link HTMLElementName#MENU MENU}, - * {@link HTMLElementName#OL OL}, - * {@link HTMLElementName#P P}, - * {@link HTMLElementName#PRE PRE}, - * {@link HTMLElementName#TR TR}, - * {@link HTMLElementName#UL UL} - * - * @param htmlElementName (required) the case insensitive name of a supported HTML block element. - * @param bottomMargin the new bottom margin of the specified element. - * @throws UnsupportedOperationException if an unsupported element name is specified. - */ - public static void setDefaultBottomMargin(String htmlElementName, final int bottomMargin) { - htmlElementName=HTMLElements.getConstantElementName(htmlElementName.toLowerCase()); - ELEMENT_HANDLERS.put(htmlElementName,getAbstractBlockElementHandler(htmlElementName).newBottomMargin(bottomMargin)); - } - - /** - * Returns the default bottom margin of an HTML block element with the specified name. - *

- * See the {@link #setDefaultBottomMargin(String htmlElementName, int bottomMargin)} method for a full description of this property. - * - * @param htmlElementName (required) the case insensitive name of a supported HTML block element. - * @return the default bottom margin of an HTML block element with the specified name. - * @throws UnsupportedOperationException if an unsupported element name is specified. - */ - public static int getDefaultBottomMargin(final String htmlElementName) { - return getAbstractBlockElementHandler(htmlElementName.toLowerCase()).getBottomMargin(); - } - - /** - * Sets the default value of whether an HTML block element of the specified name is indented. - *

- * As this is a static method, the setting affects all instances of the Renderer class. - *

- * The htmlElementName argument must be one of the following:
- * {@link HTMLElementName#ADDRESS ADDRESS}, - * {@link HTMLElementName#BLOCKQUOTE BLOCKQUOTE}, - * {@link HTMLElementName#CAPTION CAPTION}, - * {@link HTMLElementName#CENTER CENTER}, - * {@link HTMLElementName#DD DD}, - * {@link HTMLElementName#DIR DIR}, - * {@link HTMLElementName#DIV DIV}, - * {@link HTMLElementName#DT DT}, - * {@link HTMLElementName#FIELDSET FIELDSET}, - * {@link HTMLElementName#FORM FORM}, - * {@link HTMLElementName#H1 H1}, - * {@link HTMLElementName#H2 H2}, - * {@link HTMLElementName#H3 H3}, - * {@link HTMLElementName#H4 H4}, - * {@link HTMLElementName#H5 H5}, - * {@link HTMLElementName#H6 H6}, - * {@link HTMLElementName#HR HR}, - * {@link HTMLElementName#LEGEND LEGEND}, - * {@link HTMLElementName#MENU MENU}, - * {@link HTMLElementName#OL OL}, - * {@link HTMLElementName#P P}, - * {@link HTMLElementName#PRE PRE}, - * {@link HTMLElementName#TR TR}, - * {@link HTMLElementName#UL UL} - * - * @param htmlElementName (required) the case insensitive name of a supported HTML block element. - * @param indent whether the the specified element is indented. - * @throws UnsupportedOperationException if an unsupported element name is specified. - */ - public static void setDefaultIndent(String htmlElementName, final boolean indent) { - htmlElementName=HTMLElements.getConstantElementName(htmlElementName.toLowerCase()); - if (htmlElementName==HTMLElementName.LI) throw new UnsupportedOperationException(); - ELEMENT_HANDLERS.put(htmlElementName,getAbstractBlockElementHandler(htmlElementName).newIndent(indent)); - } - - /** - * Returns the default value of whether an HTML block element of the specified name is indented. - *

- * See the {@link #setDefaultIndent(String htmlElementName, boolean indent)} method for a full description of this property. - * - * @param htmlElementName (required) the case insensitive name of a supported HTML block element. - * @return the default value of whether an HTML block element of the specified name is indented. - * @throws UnsupportedOperationException if an unsupported element name is specified. - */ - public static boolean isDefaultIndent(String htmlElementName) { - htmlElementName=HTMLElements.getConstantElementName(htmlElementName.toLowerCase()); - if (htmlElementName==HTMLElementName.LI) throw new UnsupportedOperationException(); - return getAbstractBlockElementHandler(htmlElementName.toLowerCase()).isIndent(); - } - - private static AbstractBlockElementHandler getAbstractBlockElementHandler(String htmlElementName) { - ElementHandler elementHandler=ELEMENT_HANDLERS.get(htmlElementName); - if (elementHandler==null || !(elementHandler instanceof AbstractBlockElementHandler)) throw new UnsupportedOperationException("Cannot set block properties on element "+htmlElementName); - return (AbstractBlockElementHandler)elementHandler; - } - - /** This class does the actual work, but is first passed final copies of all the parameters for efficiency. 
*/ - private static final class Processor { - private final Renderer renderer; - private final Segment rootSegment; - private final Source source; - private final int maxLineLength; - private final String newLine; - private final boolean includeHyperlinkURLs; - private final boolean includeAlternateText; - private final boolean decorateFontStyles; - private final boolean convertNonBreakingSpaces; - private final int blockIndentSize; - private final int listIndentSize; - private final char[] listBullets; - private final String tableCellSeparator; - - private Appendable appendable; - private int renderedIndex; // keeps track of where rendering is up to in case of overlapping elements - private boolean atStartOfLine; - private boolean skipInitialNewLines; - private int col; - private int listIndentLevel; - private int indentSize; - private int blockVerticalMargin; // minimum number of blank lines to output at the current block boundary, or NO_MARGIN (-1) if we are not currently at a block boundary. - private boolean preformatted; - private boolean lastCharWhiteSpace; - private final boolean ignoreInitialWhiteSpace=false; // can remove this at some stage once we're sure it won't be used. 
- private boolean bullet; - private int listBulletNumber; - - private static final int NO_MARGIN=-1; - - public Processor(final Renderer renderer, final Segment rootSegment, final int maxLineLength, final String newLine, final boolean includeHyperlinkURLs, final boolean includeAlternateText, final boolean decorateFontStyles, final boolean convertNonBreakingSpaces, final int blockIndentSize, final int listIndentSize, final char[] listBullets, final String tableCellSeparator) { - this.renderer=renderer; - this.rootSegment=rootSegment; - source=rootSegment.source; - this.maxLineLength=maxLineLength; - this.newLine=newLine; - this.includeHyperlinkURLs=includeHyperlinkURLs; - this.includeAlternateText=includeAlternateText; - this.decorateFontStyles=decorateFontStyles; - this.convertNonBreakingSpaces=convertNonBreakingSpaces; - this.blockIndentSize=blockIndentSize; - this.listIndentSize=listIndentSize; - this.listBullets=listBullets; - this.tableCellSeparator=tableCellSeparator; - } - - public void appendTo(final Appendable appendable) throws IOException { - reset(); - this.appendable=appendable; - appendSegmentProcessingChildElements(rootSegment.begin,rootSegment.end,rootSegment.getChildElements()); - } - - private void reset() { - renderedIndex=0; - atStartOfLine=true; - skipInitialNewLines=!renderer.includeFirstElementTopMargin; - col=0; - listIndentLevel=0; - indentSize=0; - blockVerticalMargin=NO_MARGIN; - preformatted=false; - lastCharWhiteSpace=false; - //ignoreInitialWhiteSpace=false; - bullet=false; - } - - private void appendElementContent(final Element element) throws IOException { - final int contentEnd=element.getContentEnd(); - if (element.isEmpty() || renderedIndex>=contentEnd) return; - final int contentBegin=element.getStartTag().end; - appendSegmentProcessingChildElements(Math.max(renderedIndex,contentBegin),contentEnd,element.getChildElements()); - } - - private void appendSegmentProcessingChildElements(final int begin, final int end, final List 
childElements) throws IOException { - int index=begin; - for (Element childElement : childElements) { - if (index>=childElement.end) continue; - if (index=end) break; - appendSegment(index,tag.begin); - index=tag.end; - } - appendSegment(index,end); - } - - private void appendSegment(int begin, final int end) throws IOException { - assert begin<=end; - if (begin=end) return; - try { - if (preformatted) - appendPreformattedSegment(begin,end); - else - appendNonPreformattedSegment(begin,end); - } finally { - if (renderedIndex=renderedIndex; - if (isBlockBoundary()) appendBlockVerticalMargin(); - final String text=CharacterReference.decode(source.subSequence(begin,end),false,convertNonBreakingSpaces); - for (int i=0; i=renderedIndex; - final String text=CharacterReference.decodeCollapseWhiteSpace(source.subSequence(begin,end),convertNonBreakingSpaces); - if (text.length()==0) { - // collapsed text is zero length but original segment wasn't, meaning it consists purely of white space. - if (!ignoreInitialWhiteSpace) lastCharWhiteSpace=true; - return; - } - appendNonPreformattedText(text,Segment.isWhiteSpace(source.charAt(begin)),Segment.isWhiteSpace(source.charAt(end-1))); - } - - private void appendText(final String text) throws IOException { - assert text.length()>0; - appendNonPreformattedText(text,Segment.isWhiteSpace(text.charAt(0)),Segment.isWhiteSpace(text.charAt(text.length()-1))); - } - - private void appendNonPreformattedText(final String text, final boolean isWhiteSpaceAtStart, final boolean isWhiteSpaceAtEnd) throws IOException { - if (isBlockBoundary()) { - appendBlockVerticalMargin(); - } else if (lastCharWhiteSpace || (isWhiteSpaceAtStart && !ignoreInitialWhiteSpace)) { - // output white space only if not on a block boundary - append(' '); - } - int textIndex=0; - int i=0; - lastCharWhiteSpace=false; - //ignoreInitialWhiteSpace=false; - while (true) { - for (; i" or "From ". 
- if (i+1') continue; - if (i+6=maxLineLength) { - if (lastCharWhiteSpace && (listIndentLevel|indentSize)==0) append(' '); - startNewLine(0); - } else if (lastCharWhiteSpace) { - append(' '); - } - append(text,textIndex,i); - if (i==text.length()) break; - lastCharWhiteSpace=true; - textIndex=++i; - } - lastCharWhiteSpace=isWhiteSpaceAtEnd; - } - - private boolean isBlockBoundary() { - return blockVerticalMargin!=NO_MARGIN; - } - - private void appendBlockVerticalMargin() throws IOException { - assert blockVerticalMargin!=NO_MARGIN; - if (skipInitialNewLines) { - // at first text after

  • element or start of document - skipInitialNewLines=false; - final int indentCol=indentSize+listIndentLevel*listIndentSize; - if (col==indentCol) { - atStartOfLine=false; // no need to call appendIndent() from appendTextInit(). - } else { - // there was an indenting block since the
  • or start of document - if (bullet || col>indentCol) { - // just start new line as normal if the last indenting block is another
  • , or if the current column is already past the required indent - startNewLine(0); - } else { - // just append spaces to get the column up to the required indent - while (indentCol>col) { - appendable.append(' '); - col++; - } - atStartOfLine=false; // make sure appendIndent() isn't called again from appendTextInit() - } - } - } else { - startNewLine(blockVerticalMargin); - } - blockVerticalMargin=NO_MARGIN; - } - - private void blockBoundary(final int verticalMargin) throws IOException { - // Set a block boundary with the given vertical margin. The vertical margin is the minimum number of blank lines to output between the blocks. - // This method can be called multiple times at a block boundary, and the next textual output will output the number of blank lines determined by the - // maximum vertical margin of all the method calls. - if (blockVerticalMargin0; i--) appendable.append(' '); - if (bullet) { - for (int i=(listIndentLevel-1)*listIndentSize; i>0; i--) appendable.append(' '); - if (listBulletNumber==UNORDERED_LIST) { - for (int i=listIndentSize-2; i>0; i--) appendable.append(' '); - appendable.append(listBullets[(listIndentLevel-1)%listBullets.length]).append(' '); - } else { - String bulletNumberString=Integer.toString(listBulletNumber); - for (int i=listIndentSize-bulletNumberString.length()-2; i>0; i--) appendable.append(' '); - appendable.append(bulletNumberString).append(". 
"); - } - bullet=false; - } else { - for (int i=listIndentLevel*listIndentSize; i>0; i--) appendable.append(' '); - } - col=indentSize+listIndentLevel*listIndentSize; - atStartOfLine=false; - } - - private Processor append(final char ch) throws IOException { - appendTextInit(); - appendable.append(ch); - col++; - return this; - } - - private Processor append(final String text) throws IOException { - appendTextInit(); - appendable.append(text); - col+=text.length(); - return this; - } - - private void append(final CharSequence text, final int begin, final int end) throws IOException { - appendTextInit(); - for (int i=begin; i=x.maxLineLength) { - x.startNewLine(0); - } else { - x.append(' '); - } - x.append(renderedHyperlinkURL); - x.lastCharWhiteSpace=true; - } - } - - private static final class BR_ElementHandler implements ElementHandler { - public static final ElementHandler INSTANCE=new BR_ElementHandler(); - public void process(Processor x, Element element) throws IOException { - if (x.isBlockBoundary() && !x.atStartOfLine && !x.skipInitialNewLines) x.newLine(); // add an extra new line if we're at a block boundary and aren't already at the start of the next line and it's not the first element after
  • - x.newLine(); - x.blockBoundary(0); - } - } - - private static final class HR_ElementHandler extends AbstractBlockElementHandler { - public static final ElementHandler INSTANCE=new HR_ElementHandler(); - private HR_ElementHandler() { - this(0,0,false); - } - private HR_ElementHandler(int topMargin, int bottomMargin, boolean indent) { - super(topMargin,bottomMargin,indent); - } - protected void processBlockContent(Processor x, Element element) throws IOException { - x.appendBlockVerticalMargin(); - final int maxCol=x.maxLineLength-4; - x.append('-'); - for (int i=x.col; i + * This provides a human readable version of the segment content that is modelled on the way + * Mozilla Thunderbird and other email clients provide an automatic conversion of + * HTML content to text in their alternative MIME encoding of emails. + *

    + * The output using default settings complies with the "text/plain; format=flowed" (DelSp=No) protocol described in + * RFC3676. + *

    + * Many properties are available to customise the output, possibly the most significant of which being {@link #setMaxLineLength(int) MaxLineLength}. + * See the individual property descriptions for details. + *

    + * Use one of the following methods to obtain the output: + *

      + *
    • {@link #writeTo(Writer)}
    • + *
    • {@link #appendTo(Appendable)}
    • + *
    • {@link #toString()}
    • + *
    • {@link CharStreamSourceUtil#getReader(CharStreamSource) CharStreamSourceUtil.getReader(this)}
    • + *
    + *

    + * The rendering of some constructs, especially tables, is very rudimentary. + * No attempt is made to render nested tables properly, except to ensure that all of the text content is included in the output. + *

    + * Rendering an entire {@link Source} object performs a {@linkplain Source#fullSequentialParse() full sequential parse} automatically. + *

    + * Any aspect of the algorithm not specifically mentioned here is subject to change without notice in future versions. + *

    + * To extract pure text without any rendering of the markup, use the {@link TextExtractor} class instead. + */ +public class Renderer implements CharStreamSource { + private final Segment rootSegment; + private int maxLineLength=DEFAULT_LINE_LENGTH; + private int hrLineLength=DEFAULT_LINE_LENGTH-4; + private String newLine="\r\n"; + private boolean includeHyperlinkURLs=true; + private boolean includeAlternateText=true; + private boolean decorateFontStyles=false; + private boolean convertNonBreakingSpaces=Config.ConvertNonBreakingSpaces; + private int blockIndentSize=4; + private int listIndentSize=6; + private char[] listBullets=new char[] {'*','o','+','#'}; + private boolean includeFirstElementTopMargin=false; + private String tableCellSeparator=" \t"; + + private static final int DEFAULT_LINE_LENGTH=76; + private static final int UNORDERED_LIST=-1; + + private static Map ELEMENT_HANDLERS=new HashMap(); + static { + ELEMENT_HANDLERS.put(HTMLElementName.A,A_ElementHandler.INSTANCE); + ELEMENT_HANDLERS.put(HTMLElementName.ADDRESS,StandardBlockElementHandler.INSTANCE_0_0); + ELEMENT_HANDLERS.put(HTMLElementName.APPLET,AlternateTextElementHandler.INSTANCE); + ELEMENT_HANDLERS.put(HTMLElementName.B,FontStyleElementHandler.INSTANCE_B); + ELEMENT_HANDLERS.put(HTMLElementName.BLOCKQUOTE,StandardBlockElementHandler.INSTANCE_1_1_INDENT); + ELEMENT_HANDLERS.put(HTMLElementName.BR,BR_ElementHandler.INSTANCE); + ELEMENT_HANDLERS.put(HTMLElementName.BUTTON,RemoveElementHandler.INSTANCE); + ELEMENT_HANDLERS.put(HTMLElementName.CAPTION,StandardBlockElementHandler.INSTANCE_0_0); + ELEMENT_HANDLERS.put(HTMLElementName.CENTER,StandardBlockElementHandler.INSTANCE_1_1); + ELEMENT_HANDLERS.put(HTMLElementName.CODE,FontStyleElementHandler.INSTANCE_CODE); + ELEMENT_HANDLERS.put(HTMLElementName.DD,StandardBlockElementHandler.INSTANCE_0_0_INDENT); + ELEMENT_HANDLERS.put(HTMLElementName.DIR,ListElementHandler.INSTANCE_UL); + 
ELEMENT_HANDLERS.put(HTMLElementName.DIV,StandardBlockElementHandler.INSTANCE_0_0); + ELEMENT_HANDLERS.put(HTMLElementName.DT,StandardBlockElementHandler.INSTANCE_0_0); + ELEMENT_HANDLERS.put(HTMLElementName.EM,FontStyleElementHandler.INSTANCE_I); + ELEMENT_HANDLERS.put(HTMLElementName.FIELDSET,StandardBlockElementHandler.INSTANCE_1_1); + ELEMENT_HANDLERS.put(HTMLElementName.FORM,StandardBlockElementHandler.INSTANCE_1_1); + ELEMENT_HANDLERS.put(HTMLElementName.H1,StandardBlockElementHandler.INSTANCE_2_1); + ELEMENT_HANDLERS.put(HTMLElementName.H2,StandardBlockElementHandler.INSTANCE_2_1); + ELEMENT_HANDLERS.put(HTMLElementName.H3,StandardBlockElementHandler.INSTANCE_2_1); + ELEMENT_HANDLERS.put(HTMLElementName.H4,StandardBlockElementHandler.INSTANCE_2_1); + ELEMENT_HANDLERS.put(HTMLElementName.H5,StandardBlockElementHandler.INSTANCE_2_1); + ELEMENT_HANDLERS.put(HTMLElementName.H6,StandardBlockElementHandler.INSTANCE_2_1); + ELEMENT_HANDLERS.put(HTMLElementName.HEAD,RemoveElementHandler.INSTANCE); + ELEMENT_HANDLERS.put(HTMLElementName.HR,HR_ElementHandler.INSTANCE); + ELEMENT_HANDLERS.put(HTMLElementName.I,FontStyleElementHandler.INSTANCE_I); + ELEMENT_HANDLERS.put(HTMLElementName.IMG,AlternateTextElementHandler.INSTANCE); + ELEMENT_HANDLERS.put(HTMLElementName.INPUT,AlternateTextElementHandler.INSTANCE); + ELEMENT_HANDLERS.put(HTMLElementName.LEGEND,StandardBlockElementHandler.INSTANCE_0_0); + ELEMENT_HANDLERS.put(HTMLElementName.LI,LI_ElementHandler.INSTANCE); + ELEMENT_HANDLERS.put(HTMLElementName.MENU,ListElementHandler.INSTANCE_UL); + ELEMENT_HANDLERS.put(HTMLElementName.MAP,RemoveElementHandler.INSTANCE); + ELEMENT_HANDLERS.put(HTMLElementName.NOFRAMES,RemoveElementHandler.INSTANCE); + ELEMENT_HANDLERS.put(HTMLElementName.NOSCRIPT,RemoveElementHandler.INSTANCE); + ELEMENT_HANDLERS.put(HTMLElementName.OL,ListElementHandler.INSTANCE_OL); + ELEMENT_HANDLERS.put(HTMLElementName.P,StandardBlockElementHandler.INSTANCE_1_1); + 
ELEMENT_HANDLERS.put(HTMLElementName.PRE,PRE_ElementHandler.INSTANCE); + ELEMENT_HANDLERS.put(HTMLElementName.SCRIPT,RemoveElementHandler.INSTANCE); + ELEMENT_HANDLERS.put(HTMLElementName.SELECT,RemoveElementHandler.INSTANCE); + ELEMENT_HANDLERS.put(HTMLElementName.STRONG,FontStyleElementHandler.INSTANCE_B); + ELEMENT_HANDLERS.put(HTMLElementName.STYLE,RemoveElementHandler.INSTANCE); + ELEMENT_HANDLERS.put(HTMLElementName.TEXTAREA,RemoveElementHandler.INSTANCE); + ELEMENT_HANDLERS.put(HTMLElementName.TD,TD_ElementHandler.INSTANCE); + ELEMENT_HANDLERS.put(HTMLElementName.TH,TD_ElementHandler.INSTANCE); + ELEMENT_HANDLERS.put(HTMLElementName.TR,StandardBlockElementHandler.INSTANCE_0_0); + ELEMENT_HANDLERS.put(HTMLElementName.U,FontStyleElementHandler.INSTANCE_U); + ELEMENT_HANDLERS.put(HTMLElementName.UL,ListElementHandler.INSTANCE_UL); + } + + /** + * Constructs a new Renderer based on the specified {@link Segment}. + * @param segment the segment containing the HTML to be rendered. + * @see Segment#getRenderer() + */ + public Renderer(final Segment segment) { + rootSegment=segment; + } + + // Documentation inherited from CharStreamSource + public void writeTo(final Writer writer) throws IOException { + appendTo(writer); + writer.flush(); + } + + // Documentation inherited from CharStreamSource + public void appendTo(final Appendable appendable) throws IOException { + new Processor(this,rootSegment,getMaxLineLength(),getHRLineLength(),getNewLine(),getIncludeHyperlinkURLs(),getIncludeAlternateText(),getDecorateFontStyles(),getConvertNonBreakingSpaces(),getBlockIndentSize(),getListIndentSize(),getListBullets(),getTableCellSeparator()).appendTo(appendable); + } + + // Documentation inherited from CharStreamSource + public long getEstimatedMaximumOutputLength() { + return rootSegment.length(); + } + + // Documentation inherited from CharStreamSource + public String toString() { + return CharStreamSourceUtil.toString(this); + } + + /** + * Sets the column at which lines 
are to be wrapped. + *

    + * Lines that would otherwise exceed this length are wrapped onto a new line at a word boundary. + *

    + * Setting this property automatically sets the {@link #setHRLineLength(int) HRLineLength} property to MaxLineLength - 4. + *

    + * Setting this property to zero disables line wrapping completely, and leaves the value of {@link #setHRLineLength(int) HRLineLength} unchanged. + *

    + * A Line may still exceed this length if it consists of a single word, where the length of the word plus the line indent exceeds the maximum length. + * In this case the line is wrapped immediately after the end of the word. + *

    + * The default value is 76, which reflects the maximum line length for sending + * email data specified in RFC2049 section 3.5. + * + * @param maxLineLength the column at which lines are to be wrapped. + * @return this Renderer instance, allowing multiple property setting methods to be chained in a single statement. + * @see #getMaxLineLength() + */ + public Renderer setMaxLineLength(final int maxLineLength) { + this.maxLineLength=maxLineLength; + if (maxLineLength>0) hrLineLength=Math.max(2,maxLineLength-4); + return this; + } + + /** + * Returns the column at which lines are to be wrapped. + *

    + * See the {@link #setMaxLineLength(int)} method for a full description of this property. + * + * @return the column at which lines are to be wrapped, or zero if line wrapping is disabled. + */ + public int getMaxLineLength() { + return maxLineLength; + } + + /** + * Sets the length of a horizontal line. + *

    + * The length determines the number of hyphen characters used to render {@link HTMLElementName#HR HR} elements. + *

    + * This property is set automatically to MaxLineLength - 4 when the {@link #setMaxLineLength(int) MaxLineLength} property is set. + * The default value is 72. + * + * @param hrLineLength the length of a horizontal line. + * @return this Renderer instance, allowing multiple property setting methods to be chained in a single statement. + * @see #getHRLineLength() + */ + public Renderer setHRLineLength(final int hrLineLength) { + this.hrLineLength=hrLineLength; + return this; + } + + /** + * Returns the length of a horizontal line. + *

    + * See the {@link #setHRLineLength(int)} method for a full description of this property. + * + * @return the length of a horizontal line. + */ + public int getHRLineLength() { + return hrLineLength; + } + + /** + * Sets the string to be used to represent a newline in the output. + *

    + * The default value is "\r\n" (CR+LF) regardless of the platform on which the library is running. + * This is so that the default configuration produces valid + * MIME text/plain output, which mandates the use of CR+LF for line breaks. + *

    + * Specifying a null argument causes the output to use the same new line string as is used in the source document, which is + * determined via the {@link Source#getNewLine()} method. + * If the source document does not contain any new lines, a "best guess" is made by either taking the new line string of a previously parsed document, + * or using the value from the static {@link Config#NewLine} property. + * + * @param newLine the string to be used to represent a newline in the output, may be null. + * @return this Renderer instance, allowing multiple property setting methods to be chained in a single statement. + * @see #getNewLine() + */ + public Renderer setNewLine(final String newLine) { + this.newLine=newLine; + return this; + } + + /** + * Returns the string to be used to represent a newline in the output. + *

    + * See the {@link #setNewLine(String)} method for a full description of this property. + * + * @return the string to be used to represent a newline in the output. + */ + public String getNewLine() { + if (newLine==null) newLine=rootSegment.source.getBestGuessNewLine(); + return newLine; + } + + /** + * Sets whether hyperlink URLs are included in the output. + *

    + * The default value is true. + *

    + * When this property is true, the URL of each hyperlink is included in the output as determined by the implementation of the + * {@link #renderHyperlinkURL(StartTag)} method. + *

    + *

    + *
    Example:
    + *
    + *

    + * Assuming the default implementation of {@link #renderHyperlinkURL(StartTag)}, when this property is true, the following HTML: + *

    + * <a href="http://jericho.htmlparser.net/">Jericho HTML Parser</a> + *
    + * produces the following output: + *
    + * Jericho HTML Parser <http://jericho.htmlparser.net/> + *
    + *
    + *
    + * + * @param includeHyperlinkURLs specifies whether hyperlink URLs are included in the output. + * @return this Renderer instance, allowing multiple property setting methods to be chained in a single statement. + * @see #getIncludeHyperlinkURLs() + */ + public Renderer setIncludeHyperlinkURLs(final boolean includeHyperlinkURLs) { + this.includeHyperlinkURLs=includeHyperlinkURLs; + return this; + } + + /** + * Indicates whether hyperlink URLs are included in the output. + *

    + * See the {@link #setIncludeHyperlinkURLs(boolean)} method for a full description of this property. + * + * @return true if hyperlink URLs are included in the output, otherwise false. + */ + public boolean getIncludeHyperlinkURLs() { + return includeHyperlinkURLs; + } + + /** + * Renders the hyperlink URL from the specified {@link StartTag}. + *

    + * A return value of null indicates that the hyperlink URL should not be rendered at all. + *

    + * The default implementation of this method returns null if the href attribute of the specified start tag + * starts with "javascript:", is a relative or invalid URI, or is missing completely. + * In all other cases it returns the value of the href attribute enclosed in angle brackets. + *

    + * See the documentation of the {@link #setIncludeHyperlinkURLs(boolean)} method for an example of how a hyperlink is rendered by the default implementation. + *

    + * This method can be overridden in a subclass to customise the rendering of hyperlink URLs. + *

    + * Rendering of hyperlink URLs can be disabled completely without overriding this method by setting the + * {@link #setIncludeHyperlinkURLs(boolean) IncludeHyperlinkURLs} property to false. + *

    + *

    + *
    Example:
    + *
    + * To render hyperlink URLs without the enclosing angle brackets:

    + * + * Renderer renderer=new Renderer(segment) {
    + *     public String renderHyperlinkURL(StartTag startTag) {
    + *         String href=startTag.getAttributeValue("href");
    + *         if (href==null || href.startsWith("javascript:")) return null;
    + *         try {
    + *           URI uri=new URI(href);
    + *           if (!uri.isAbsolute()) return null;
    + *         } catch (URISyntaxException ex) {
    + *           return null;
    + *         }
    + *         return href;
    + *     }
    + * };
    + * String renderedSegment=renderer.toString(); + *
    + *
    + *
    + * @param startTag the start tag of the hyperlink element, must not be null. + * @return The rendered hyperlink URL from the specified {@link StartTag}, or null if the hyperlink URL should not be rendered. + */ + public String renderHyperlinkURL(final StartTag startTag) { + final String href=startTag.getAttributeValue("href"); + if (href==null || href.startsWith("javascript:")) return null; + try { + URI uri=new URI(href); + if (!uri.isAbsolute()) return null; + } catch (URISyntaxException ex) { + return null; + } + return '<'+href+'>'; + } + + /** + * Sets whether the alternate text of a tag that has an alt attribute is included in the output. + *

    + * The default value is true. + * Note that this is not consistent with common email clients such as Mozilla Thunderbird which do not render alternate text at all, + * even when a tag specifies alternate text. + *

    + * When this property is true, the alternate text is included in the output as determined by the implementation of the + * {@link #renderAlternateText(StartTag)} method. + *

    + *

    + *
    Example:
    + *
    + *

    + * Assuming the default implementation of {@link #renderAlternateText(StartTag)}, when this property is true, the following HTML: + *

    + * <img src="smiley.png" alt="smiley face" /> + *
    + * produces the following output: + *
    + * [smiley face] + *
    + *
    + *
    + * + * @param includeAlternateText specifies whether the alternate text of a tag that has an alt attribute is included in the output. + * @return this Renderer instance, allowing multiple property setting methods to be chained in a single statement. + * @see #getIncludeAlternateText() + */ + public Renderer setIncludeAlternateText(final boolean includeAlternateText) { + this.includeAlternateText=includeAlternateText; + return this; + } + + /** + * Indicates whether the alternate text of a tag that has an alt attribute is included in the output. + *

    + * See the {@link #setIncludeAlternateText(boolean)} method for a full description of this property. + * + * @return true if the alternate text of a tag that has an alt attribute is included in the output, otherwise false. + */ + public boolean getIncludeAlternateText() { + return includeAlternateText; + } + + /** + * Renders the alternate text of the specified start tag. + *

    + * A return value of null indicates that the alternate text is not to be rendered at all. + *

    + * The default implementation of this method returns null if the alt attribute of the specified start tag is missing or empty, or if the + * specified start tag is from an {@link HTMLElementName#AREA AREA} element. + * In all other cases it returns the value of the alt attribute enclosed in square brackets […]. + *

    + * See the documentation of the {@link #setIncludeAlternateText(boolean)} method for an example of how alternate text is rendered by the default implementation. + *

    + * This method can be overridden in a subclass to customise the rendering of alternate text. + *

    + * Rendering of alternate text can be disabled completely without overriding this method by setting the + * {@link #setIncludeAlternateText(boolean) IncludeAlternateText} property to false. + *

    + *

    + *
    Example:
    + *
    + * To render alternate text with double angle quotation marks instead of square brackets:

    + * + * Renderer renderer=new Renderer(segment) {
    + *     public String renderAlternateText(StartTag startTag) {
    + *         if (startTag.getName()==HTMLElementName.AREA) return null; + *         String alt=startTag.getAttributeValue("alt");
    + *         if (alt==null || alt.length()==0) return null;
    + *         return '«'+alt+'»';
    + *     }
    + * };
    + * String renderedSegment=renderer.toString(); + *
    + *
    + *
    + * @param startTag the start tag containing an alt attribute, must not be null. + * @return The rendered alternate text, or null if the alternate text should not be rendered. + */ + public String renderAlternateText(final StartTag startTag) { + if (startTag.getName()==HTMLElementName.AREA) return null; + final String alt=startTag.getAttributeValue("alt"); + if (alt==null || alt.length()==0) return null; + return '['+alt+']'; + } + + /** + * Sets whether decoration characters are to be included around the content of some + * font style elements and + * phrase elements. + *

    + * The default value is false. + *

    + * Below is a table summarising the decorated elements. + *

    + * + * + * + * + * + * + * + *
    Elements   Character   Example Output
    {@link HTMLElementName#B B} and {@link HTMLElementName#STRONG STRONG}   *   *bold text*
    {@link HTMLElementName#I I} and {@link HTMLElementName#EM EM}   /   /italic text/
    {@link HTMLElementName#U U}   _   _underlined text_
    {@link HTMLElementName#CODE CODE}   |   |code|
    + * + * @param decorateFontStyles specifies whether decoration characters are to be included around the content of some font style elements. + * @return this Renderer instance, allowing multiple property setting methods to be chained in a single statement. + * @see #getDecorateFontStyles() + */ + public Renderer setDecorateFontStyles(final boolean decorateFontStyles) { + this.decorateFontStyles=decorateFontStyles; + return this; + } + + /** + * Indicates whether decoration characters are to be included around the content of some + * font style elements and + * phrase elements. + *

    + * See the {@link #setDecorateFontStyles(boolean)} method for a full description of this property. + * + * @return true if decoration characters are to be included around the content of some font style elements, otherwise false. + */ + public boolean getDecorateFontStyles() { + return decorateFontStyles; + } + + /** + * Sets whether non-breaking space ({@link CharacterEntityReference#_nbsp &nbsp;}) character entity references are converted to spaces. + *

    + * The default value is that of the static {@link Config#ConvertNonBreakingSpaces} property at the time the Renderer is instantiated. + * + * @param convertNonBreakingSpaces specifies whether non-breaking space ({@link CharacterEntityReference#_nbsp &nbsp;}) character entity references are converted to spaces. + * @return this Renderer instance, allowing multiple property setting methods to be chained in a single statement. + * @see #getConvertNonBreakingSpaces() + */ + public Renderer setConvertNonBreakingSpaces(boolean convertNonBreakingSpaces) { + this.convertNonBreakingSpaces=convertNonBreakingSpaces; + return this; + } + + /** + * Indicates whether non-breaking space ({@link CharacterEntityReference#_nbsp &nbsp;}) character entity references are converted to spaces. + *

    + * See the {@link #setConvertNonBreakingSpaces(boolean)} method for a full description of this property. + * + * @return true if non-breaking space ({@link CharacterEntityReference#_nbsp &nbsp;}) character entity references are converted to spaces, otherwise false. + */ + public boolean getConvertNonBreakingSpaces() { + return convertNonBreakingSpaces; + } + + /** + * Sets the size of the indent to be used for anything other than {@link HTMLElementName#LI LI} elements. + *

    + * At present this applies to {@link HTMLElementName#BLOCKQUOTE BLOCKQUOTE} and {@link HTMLElementName#DD DD} elements. + *

    + * The default value is 4. + * + * @param blockIndentSize the size of the indent. + * @return this Renderer instance, allowing multiple property setting methods to be chained in a single statement. + * @see #getBlockIndentSize() + */ + public Renderer setBlockIndentSize(final int blockIndentSize) { + this.blockIndentSize=blockIndentSize; + return this; + } + + /** + * Returns the size of the indent to be used for anything other than {@link HTMLElementName#LI LI} elements. + *

    + * See the {@link #setBlockIndentSize(int)} method for a full description of this property. + * + * @return the size of the indent to be used for anything other than {@link HTMLElementName#LI LI} elements. + */ + public int getBlockIndentSize() { + return blockIndentSize; + } + + /** + * Sets the size of the indent to be used for {@link HTMLElementName#LI LI} elements. + *

    + * The default value is 6. + *

    + * This applies to {@link HTMLElementName#LI LI} elements inside both {@link HTMLElementName#UL UL} and {@link HTMLElementName#OL OL} elements. + *

    + * The bullet or number of the list item is included as part of the indent. + * + * @param listIndentSize the size of the indent. + * @return this Renderer instance, allowing multiple property setting methods to be chained in a single statement. + * @see #getListIndentSize() + */ + public Renderer setListIndentSize(final int listIndentSize) { + this.listIndentSize=listIndentSize; + return this; + } + + /** + * Returns the size of the indent to be used for {@link HTMLElementName#LI LI} elements. + *

    + * See the {@link #setListIndentSize(int)} method for a full description of this property. + * + * @return the size of the indent to be used for {@link HTMLElementName#LI LI} elements. + */ + public int getListIndentSize() { + return listIndentSize; + } + + /** + * Sets the bullet characters to use for list items inside {@link HTMLElementName#UL UL} elements. + *

    + * The values in the default array are *, o, + and #. + *

    + * If the nesting of rendered lists goes deeper than the length of this array, the bullet characters start repeating from the first in the array. + *

    + * WARNING: If any of the characters in the default array are modified, this will affect all other instances of this class using the default array. + * + * @param listBullets an array of characters to be used as bullets, must have at least one entry. + * @return this Renderer instance, allowing multiple property setting methods to be chained in a single statement. + * @see #getListBullets() + */ + public Renderer setListBullets(final char[] listBullets) { + if (listBullets==null || listBullets.length==0) throw new IllegalArgumentException("listBullets argument must be an array of at least one character"); + this.listBullets=listBullets; + return this; + } + + /** + * Returns the bullet characters to use for list items inside {@link HTMLElementName#UL UL} elements. + *

    + * See the {@link #setListBullets(char[])} method for a full description of this property. + * + * @return the bullet characters to use for list items inside {@link HTMLElementName#UL UL} elements. + */ + public char[] getListBullets() { + return listBullets; + } + + /** + * Sets whether the top margin of the first element is rendered. + *

    + * The default value is false. + *

    + * If this property is set to true, then the source "<h1>Heading</h1>" would be rendered as "\r\n\r\nHeading", + * assuming all other default settings. + * If this property is false, then the same source would be rendered as "Heading". + *

    + * Note that the bottom margin of the last element is never rendered. + * + * @param includeFirstElementTopMargin specifies whether the top margin of the first element is rendered. + * @return this Renderer instance, allowing multiple property setting methods to be chained in a single statement. + * @see #getIncludeFirstElementTopMargin() + */ + public Renderer setIncludeFirstElementTopMargin(final boolean includeFirstElementTopMargin) { + this.includeFirstElementTopMargin=includeFirstElementTopMargin; + return this; + } + + /** + * Indicates whether the top margin of the first element is rendered. + *

    + * See the {@link #setIncludeFirstElementTopMargin(boolean)} method for a full description of this property. + * + * @return true if the top margin of the first element is rendered, otherwise false. + */ + public boolean getIncludeFirstElementTopMargin() { + return includeFirstElementTopMargin; + } + + /** + * Sets the string that is to separate table cells. + *

    + * The default value is " \t" (a space followed by a tab). + * + * @param tableCellSeparator the string that is to separate table cells. + * @return this Renderer instance, allowing multiple property setting methods to be chained in a single statement. + * @see #getTableCellSeparator() + */ + public Renderer setTableCellSeparator(final String tableCellSeparator) { + this.tableCellSeparator=tableCellSeparator; + return this; + } + + /** + * Returns the string that is to separate table cells. + *

    + * See the {@link #setTableCellSeparator(String)} method for a full description of this property. + * + * @return the string that is to separate table cells. + */ + public String getTableCellSeparator() { + return tableCellSeparator; + } + + /** + * Sets the default top margin of an HTML block element with the specified name. + *

    + * The top margin is the number of blank lines that are to be inserted above the rendered block. + *

    + * As this is a static method, the setting affects all instances of the Renderer class. + *

    + * The htmlElementName argument must be one of the following:
    + * {@link HTMLElementName#ADDRESS ADDRESS}, + * {@link HTMLElementName#BLOCKQUOTE BLOCKQUOTE}, + * {@link HTMLElementName#CAPTION CAPTION}, + * {@link HTMLElementName#CENTER CENTER}, + * {@link HTMLElementName#DD DD}, + * {@link HTMLElementName#DIR DIR}, + * {@link HTMLElementName#DIV DIV}, + * {@link HTMLElementName#DT DT}, + * {@link HTMLElementName#FIELDSET FIELDSET}, + * {@link HTMLElementName#FORM FORM}, + * {@link HTMLElementName#H1 H1}, + * {@link HTMLElementName#H2 H2}, + * {@link HTMLElementName#H3 H3}, + * {@link HTMLElementName#H4 H4}, + * {@link HTMLElementName#H5 H5}, + * {@link HTMLElementName#H6 H6}, + * {@link HTMLElementName#HR HR}, + * {@link HTMLElementName#LEGEND LEGEND}, + * {@link HTMLElementName#LI LI}, + * {@link HTMLElementName#MENU MENU}, + * {@link HTMLElementName#OL OL}, + * {@link HTMLElementName#P P}, + * {@link HTMLElementName#PRE PRE}, + * {@link HTMLElementName#TR TR}, + * {@link HTMLElementName#UL UL} + * + * @param htmlElementName (required) the case insensitive name of a supported HTML block element. + * @param topMargin the new top margin of the specified element. + * @throws UnsupportedOperationException if an unsupported element name is specified. + */ + public static void setDefaultTopMargin(String htmlElementName, final int topMargin) { + htmlElementName=HTMLElements.getConstantElementName(htmlElementName.toLowerCase()); + ELEMENT_HANDLERS.put(htmlElementName,getAbstractBlockElementHandler(htmlElementName).newTopMargin(topMargin)); + } + + /** + * Returns the default top margin of an HTML block element with the specified name. + *

    + * See the {@link #setDefaultTopMargin(String htmlElementName, int topMargin)} method for a full description of this property. + * + * @param htmlElementName (required) the case insensitive name of a supported HTML block element. + * @return the default top margin of an HTML block element with the specified name. + * @throws UnsupportedOperationException if an unsupported element name is specified. + */ + public static int getDefaultTopMargin(final String htmlElementName) { + return getAbstractBlockElementHandler(htmlElementName.toLowerCase()).getTopMargin(); + } + + /** + * Sets the default bottom margin of an HTML block element with the specified name. + *

    + * The bottom margin is the number of blank lines that are to be inserted below the rendered block. + *

    + * As this is a static method, the setting affects all instances of the Renderer class. + *

    + * The htmlElementName argument must be one of the following:
    + * {@link HTMLElementName#ADDRESS ADDRESS}, + * {@link HTMLElementName#BLOCKQUOTE BLOCKQUOTE}, + * {@link HTMLElementName#CAPTION CAPTION}, + * {@link HTMLElementName#CENTER CENTER}, + * {@link HTMLElementName#DD DD}, + * {@link HTMLElementName#DIR DIR}, + * {@link HTMLElementName#DIV DIV}, + * {@link HTMLElementName#DT DT}, + * {@link HTMLElementName#FIELDSET FIELDSET}, + * {@link HTMLElementName#FORM FORM}, + * {@link HTMLElementName#H1 H1}, + * {@link HTMLElementName#H2 H2}, + * {@link HTMLElementName#H3 H3}, + * {@link HTMLElementName#H4 H4}, + * {@link HTMLElementName#H5 H5}, + * {@link HTMLElementName#H6 H6}, + * {@link HTMLElementName#HR HR}, + * {@link HTMLElementName#LEGEND LEGEND}, + * {@link HTMLElementName#LI LI}, + * {@link HTMLElementName#MENU MENU}, + * {@link HTMLElementName#OL OL}, + * {@link HTMLElementName#P P}, + * {@link HTMLElementName#PRE PRE}, + * {@link HTMLElementName#TR TR}, + * {@link HTMLElementName#UL UL} + * + * @param htmlElementName (required) the case insensitive name of a supported HTML block element. + * @param bottomMargin the new bottom margin of the specified element. + * @throws UnsupportedOperationException if an unsupported element name is specified. + */ + public static void setDefaultBottomMargin(String htmlElementName, final int bottomMargin) { + htmlElementName=HTMLElements.getConstantElementName(htmlElementName.toLowerCase()); + ELEMENT_HANDLERS.put(htmlElementName,getAbstractBlockElementHandler(htmlElementName).newBottomMargin(bottomMargin)); + } + + /** + * Returns the default bottom margin of an HTML block element with the specified name. + *

    + * See the {@link #setDefaultBottomMargin(String htmlElementName, int bottomMargin)} method for a full description of this property. + * + * @param htmlElementName (required) the case insensitive name of a supported HTML block element. + * @return the default bottom margin of an HTML block element with the specified name. + * @throws UnsupportedOperationException if an unsupported element name is specified. + */ + public static int getDefaultBottomMargin(final String htmlElementName) { + return getAbstractBlockElementHandler(htmlElementName.toLowerCase()).getBottomMargin(); + } + + /** + * Sets the default value of whether an HTML block element of the specified name is indented. + *

    + * As this is a static method, the setting affects all instances of the Renderer class. + *

    + * The htmlElementName argument must be one of the following:
    + * {@link HTMLElementName#ADDRESS ADDRESS}, + * {@link HTMLElementName#BLOCKQUOTE BLOCKQUOTE}, + * {@link HTMLElementName#CAPTION CAPTION}, + * {@link HTMLElementName#CENTER CENTER}, + * {@link HTMLElementName#DD DD}, + * {@link HTMLElementName#DIR DIR}, + * {@link HTMLElementName#DIV DIV}, + * {@link HTMLElementName#DT DT}, + * {@link HTMLElementName#FIELDSET FIELDSET}, + * {@link HTMLElementName#FORM FORM}, + * {@link HTMLElementName#H1 H1}, + * {@link HTMLElementName#H2 H2}, + * {@link HTMLElementName#H3 H3}, + * {@link HTMLElementName#H4 H4}, + * {@link HTMLElementName#H5 H5}, + * {@link HTMLElementName#H6 H6}, + * {@link HTMLElementName#HR HR}, + * {@link HTMLElementName#LEGEND LEGEND}, + * {@link HTMLElementName#MENU MENU}, + * {@link HTMLElementName#OL OL}, + * {@link HTMLElementName#P P}, + * {@link HTMLElementName#PRE PRE}, + * {@link HTMLElementName#TR TR}, + * {@link HTMLElementName#UL UL} + * + * @param htmlElementName (required) the case insensitive name of a supported HTML block element. + * @param indent whether the specified element is indented. + * @throws UnsupportedOperationException if an unsupported element name is specified. + */ + public static void setDefaultIndent(String htmlElementName, final boolean indent) { + htmlElementName=HTMLElements.getConstantElementName(htmlElementName.toLowerCase()); + if (htmlElementName==HTMLElementName.LI) throw new UnsupportedOperationException(); + ELEMENT_HANDLERS.put(htmlElementName,getAbstractBlockElementHandler(htmlElementName).newIndent(indent)); + } + + /** + * Returns the default value of whether an HTML block element of the specified name is indented. + *

    + * See the {@link #setDefaultIndent(String htmlElementName, boolean indent)} method for a full description of this property. + * + * @param htmlElementName (required) the case insensitive name of a supported HTML block element. + * @return the default value of whether an HTML block element of the specified name is indented. + * @throws UnsupportedOperationException if an unsupported element name is specified. + */ + public static boolean isDefaultIndent(String htmlElementName) { + htmlElementName=HTMLElements.getConstantElementName(htmlElementName.toLowerCase()); + if (htmlElementName==HTMLElementName.LI) throw new UnsupportedOperationException(); + return getAbstractBlockElementHandler(htmlElementName.toLowerCase()).isIndent(); + } + + private static AbstractBlockElementHandler getAbstractBlockElementHandler(String htmlElementName) { + ElementHandler elementHandler=ELEMENT_HANDLERS.get(htmlElementName); + if (elementHandler==null || !(elementHandler instanceof AbstractBlockElementHandler)) throw new UnsupportedOperationException("Cannot set block properties on element "+htmlElementName); + return (AbstractBlockElementHandler)elementHandler; + } + + /** This class does the actual work, but is first passed final copies of all the parameters for efficiency. 
*/ + private static final class Processor { + private final Renderer renderer; + private final Segment rootSegment; + private final Source source; + private final int maxLineLength; + private final int hrLineLength; + private final String newLine; + private final boolean includeHyperlinkURLs; + private final boolean includeAlternateText; + private final boolean decorateFontStyles; + private final boolean convertNonBreakingSpaces; + private final int blockIndentSize; + private final int listIndentSize; + private final char[] listBullets; + private final String tableCellSeparator; + + private Appendable appendable; + private int renderedIndex; // keeps track of where rendering is up to in case of overlapping elements + private boolean atStartOfLine; + private boolean skipInitialNewLines; + private int col; + private int listIndentLevel; + private int indentSize; + private int blockVerticalMargin; // minimum number of blank lines to output at the current block boundary, or NO_MARGIN (-1) if we are not currently at a block boundary. + private boolean preformatted; + private boolean lastCharWhiteSpace; + private final boolean ignoreInitialWhiteSpace=false; // can remove this at some stage once we're sure it won't be used. 
+ private boolean bullet; + private int listBulletNumber; + + private static final int NO_MARGIN=-1; + + public Processor(final Renderer renderer, final Segment rootSegment, final int maxLineLength, final int hrLineLength, final String newLine, final boolean includeHyperlinkURLs, final boolean includeAlternateText, final boolean decorateFontStyles, final boolean convertNonBreakingSpaces, final int blockIndentSize, final int listIndentSize, final char[] listBullets, final String tableCellSeparator) { + this.renderer=renderer; + this.rootSegment=rootSegment; + source=rootSegment.source; + this.maxLineLength=maxLineLength; + this.hrLineLength=hrLineLength; + this.newLine=newLine; + this.includeHyperlinkURLs=includeHyperlinkURLs; + this.includeAlternateText=includeAlternateText; + this.decorateFontStyles=decorateFontStyles; + this.convertNonBreakingSpaces=convertNonBreakingSpaces; + this.blockIndentSize=blockIndentSize; + this.listIndentSize=listIndentSize; + this.listBullets=listBullets; + this.tableCellSeparator=tableCellSeparator; + } + + public void appendTo(final Appendable appendable) throws IOException { + reset(); + this.appendable=appendable; + List elements=rootSegment instanceof Element ? 
Collections.singletonList((Element)rootSegment) : rootSegment.getChildElements(); + appendSegmentProcessingChildElements(rootSegment.begin,rootSegment.end,elements); + } + + private void reset() { + renderedIndex=0; + atStartOfLine=true; + skipInitialNewLines=!renderer.includeFirstElementTopMargin; + col=0; + listIndentLevel=0; + indentSize=0; + blockVerticalMargin=NO_MARGIN; + preformatted=false; + lastCharWhiteSpace=false; + //ignoreInitialWhiteSpace=false; + bullet=false; + } + + private void appendElementContent(final Element element) throws IOException { + final int contentEnd=element.getContentEnd(); + if (element.isEmpty() || renderedIndex>=contentEnd) return; + final int contentBegin=element.getStartTag().end; + appendSegmentProcessingChildElements(Math.max(renderedIndex,contentBegin),contentEnd,element.getChildElements()); + } + + private void appendSegmentProcessingChildElements(final int begin, final int end, final List childElements) throws IOException { + int index=begin; + for (Element childElement : childElements) { + if (index>=childElement.end) continue; + if (index=end) break; + appendSegment(index,tag.begin); + index=tag.end; + } + appendSegment(index,end); + } + + private void appendSegment(int begin, final int end) throws IOException { + assert begin<=end; + if (begin=end) return; + try { + if (preformatted) + appendPreformattedSegment(begin,end); + else + appendNonPreformattedSegment(begin,end); + } finally { + if (renderedIndex=renderedIndex; + if (isBlockBoundary()) appendBlockVerticalMargin(); + final String text=CharacterReference.decode(source.subSequence(begin,end),false,convertNonBreakingSpaces); + for (int i=0; i=renderedIndex; + final String text=CharacterReference.decodeCollapseWhiteSpace(source.subSequence(begin,end),convertNonBreakingSpaces); + if (text.length()==0) { + // collapsed text is zero length but original segment wasn't, meaning it consists purely of white space. 
+ if (!ignoreInitialWhiteSpace) lastCharWhiteSpace=true; + return; + } + appendNonPreformattedText(text,Segment.isWhiteSpace(source.charAt(begin)),Segment.isWhiteSpace(source.charAt(end-1))); + } + + private void appendText(final String text) throws IOException { + assert text.length()>0; + appendNonPreformattedText(text,Segment.isWhiteSpace(text.charAt(0)),Segment.isWhiteSpace(text.charAt(text.length()-1))); + } + + private void appendNonPreformattedText(final String text, final boolean isWhiteSpaceAtStart, final boolean isWhiteSpaceAtEnd) throws IOException { + if (isBlockBoundary()) { + appendBlockVerticalMargin(); + } else if (lastCharWhiteSpace || (isWhiteSpaceAtStart && !ignoreInitialWhiteSpace)) { + // output white space only if not on a block boundary + append(' '); + } + int textIndex=0; + int i=0; + lastCharWhiteSpace=false; + //ignoreInitialWhiteSpace=false; + while (true) { + for (; i" or "From ". + if (i+1') continue; + if (i+60 && col+i-textIndex+1>=maxLineLength) { + if (lastCharWhiteSpace && (listIndentLevel|indentSize)==0) append(' '); + startNewLine(0); + } else if (lastCharWhiteSpace) { + append(' '); + } + append(text,textIndex,i); + if (i==text.length()) break; + lastCharWhiteSpace=true; + textIndex=++i; + } + lastCharWhiteSpace=isWhiteSpaceAtEnd; + } + + private boolean isBlockBoundary() { + return blockVerticalMargin!=NO_MARGIN; + } + + private void appendBlockVerticalMargin() throws IOException { + assert blockVerticalMargin!=NO_MARGIN; + if (skipInitialNewLines) { + // at first text after

  • element or start of document + skipInitialNewLines=false; + final int indentCol=indentSize+listIndentLevel*listIndentSize; + if (col==indentCol) { + atStartOfLine=false; // no need to call appendIndent() from appendTextInit(). + } else { + // there was an indenting block since the
  • or start of document + if (bullet || col>indentCol) { + // just start new line as normal if the last indenting block is another
  • , or if the current column is already past the required indent + startNewLine(0); + } else { + // just append spaces to get the column up to the required indent + while (indentCol>col) { + appendable.append(' '); + col++; + } + atStartOfLine=false; // make sure appendIndent() isn't called again from appendTextInit() + } + } + } else { + startNewLine(blockVerticalMargin); + } + blockVerticalMargin=NO_MARGIN; + } + + private void blockBoundary(final int verticalMargin) throws IOException { + // Set a block boundary with the given vertical margin. The vertical margin is the minimum number of blank lines to output between the blocks. + // This method can be called multiple times at a block boundary, and the next textual output will output the number of blank lines determined by the + // maximum vertical margin of all the method calls. + if (blockVerticalMargin0; i--) appendable.append(' '); + if (bullet) { + for (int i=(listIndentLevel-1)*listIndentSize; i>0; i--) appendable.append(' '); + if (listBulletNumber==UNORDERED_LIST) { + for (int i=listIndentSize-2; i>0; i--) appendable.append(' '); + appendable.append(listBullets[(listIndentLevel-1)%listBullets.length]).append(' '); + } else { + String bulletNumberString=Integer.toString(listBulletNumber); + for (int i=listIndentSize-bulletNumberString.length()-2; i>0; i--) appendable.append(' '); + appendable.append(bulletNumberString).append(". 
"); + } + bullet=false; + } else { + for (int i=listIndentLevel*listIndentSize; i>0; i--) appendable.append(' '); + } + col=indentSize+listIndentLevel*listIndentSize; + atStartOfLine=false; + } + + private Processor append(final char ch) throws IOException { + appendTextInit(); + appendable.append(ch); + col++; + return this; + } + + private Processor append(final String text) throws IOException { + appendTextInit(); + appendable.append(text); + col+=text.length(); + return this; + } + + private void append(final CharSequence text, final int begin, final int end) throws IOException { + appendTextInit(); + for (int i=begin; i0 && x.col+linkLength>=x.maxLineLength) { + x.startNewLine(0); + } else if (displayContent) { + x.append(' '); + } + x.append(renderedHyperlinkURL); + x.lastCharWhiteSpace=true; + } + } + + private static final String getInformalURL(String url) { + if (url.startsWith("http://")) url=url.substring(7); + if (url.endsWith("/")) url=url.substring(0,url.length()-1); + return url; + } + + private static final class BR_ElementHandler implements ElementHandler { + public static final ElementHandler INSTANCE=new BR_ElementHandler(); + public void process(Processor x, Element element) throws IOException { + if (x.isBlockBoundary() && !x.atStartOfLine && !x.skipInitialNewLines) x.newLine(); // add an extra new line if we're at a block boundary and aren't already at the start of the next line and it's not the first element after
  • + x.newLine(); + x.blockBoundary(0); + } + } + + private static final class HR_ElementHandler extends AbstractBlockElementHandler { + public static final ElementHandler INSTANCE=new HR_ElementHandler(); + private HR_ElementHandler() { + this(0,0,false); + } + private HR_ElementHandler(int topMargin, int bottomMargin, boolean indent) { + super(topMargin,bottomMargin,indent); + } + protected void processBlockContent(Processor x, Element element) throws IOException { + x.appendBlockVerticalMargin(); + x.append('-'); + for (int i=x.col; i