Skip to content

Commit

Permalink
Modified Element.text() to build text by traversing child nodes rather than recursing.
Browse files Browse the repository at this point in the history

Fixes jhy#271
  • Loading branch information
jhy committed Dec 26, 2012
1 parent 2d56df2 commit 1fd0a61
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 26 deletions.
4 changes: 4 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ jsoup changelog
parse when the HTML stack is hopelessly deep.
<https://github.com/jhy/jsoup/issues/234>

* Modified Element.text() to build text by traversing child nodes rather than recursing. This avoids stack overflow
errors when the DOM is very deep and the JVM stack size is low.
<https://github.com/jhy/jsoup/issues/271>

*** Release 1.7.1 [2012-Sep-23]
* Improved parse time, now 2.3x faster than previous release, with lower memory consumption.

Expand Down
53 changes: 28 additions & 25 deletions src/main/java/org/jsoup/nodes/Element.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,7 @@
import org.jsoup.helper.Validate;
import org.jsoup.parser.Parser;
import org.jsoup.parser.Tag;
import org.jsoup.select.Collector;
import org.jsoup.select.Elements;
import org.jsoup.select.Evaluator;
import org.jsoup.select.Selector;
import org.jsoup.select.*;

import java.util.*;
import java.util.regex.Pattern;
Expand Down Expand Up @@ -818,25 +815,25 @@ public Elements getAllElements() {
* @see #textNodes()
*/
public String text() {
StringBuilder sb = new StringBuilder();
text(sb);
return sb.toString().trim();
}
final StringBuilder accum = new StringBuilder();
new NodeTraversor(new NodeVisitor() {
public void head(Node node, int depth) {
if (node instanceof TextNode) {
TextNode textNode = (TextNode) node;
appendNormalisedText(accum, textNode);
} else if (node instanceof Element) {
Element element = (Element) node;
if (accum.length() > 0 &&
(element.isBlock() || element.tag.getName().equals("br")) &&
!TextNode.lastCharIsWhitespace(accum))
accum.append(" ");
}
}

private void text(StringBuilder accum) {
appendWhitespaceIfBr(this, accum);

for (Node child : childNodes) {
if (child instanceof TextNode) {
TextNode textNode = (TextNode) child;
appendNormalisedText(accum, textNode);
} else if (child instanceof Element) {
Element element = (Element) child;
if (accum.length() > 0 && element.isBlock() && !TextNode.lastCharIsWhitespace(accum))
accum.append(" ");
element.text(accum);
public void tail(Node node, int depth) {
}
}
}).traverse(this);
return accum.toString().trim();
}

/**
Expand Down Expand Up @@ -867,10 +864,10 @@ private void ownText(StringBuilder accum) {
}
}

private void appendNormalisedText(StringBuilder accum, TextNode textNode) {
private static void appendNormalisedText(StringBuilder accum, TextNode textNode) {
String text = textNode.getWholeText();

if (!preserveWhitespace()) {
if (!preserveWhitespace(textNode.parent())) {
text = TextNode.normaliseWhitespace(text);
if (TextNode.lastCharIsWhitespace(accum))
text = TextNode.stripLeadingWhitespace(text);
Expand All @@ -883,8 +880,14 @@ private static void appendWhitespaceIfBr(Element element, StringBuilder accum) {
accum.append(" ");
}

boolean preserveWhitespace() {
return tag.preserveWhitespace() || parent() != null && parent().preserveWhitespace();
static boolean preserveWhitespace(Node node) {
// looks only at this element and one level up, to prevent recursion & needless stack searches
if (node != null && node instanceof Element) {
Element element = (Element) node;
return element.tag.preserveWhitespace() ||
element.parent() != null && element.parent().tag.preserveWhitespace();
}
return false;
}

/**
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/jsoup/nodes/TextNode.java
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ public TextNode splitText(int offset) {

void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) {
String html = Entities.escape(getWholeText(), out);
if (out.prettyPrint() && parent() instanceof Element && !((Element) parent()).preserveWhitespace()) {
if (out.prettyPrint() && parent() instanceof Element && !Element.preserveWhitespace((Element) parent())) {
html = normaliseWhitespace(html);
}

Expand Down

0 comments on commit 1fd0a61

Please sign in to comment.