You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@james.apache.org by rc...@apache.org on 2023/02/07 09:23:52 UTC
[james-project] branch master updated: [PERF] JsoupHtmlTextExtractor without recursion (#1422)
This is an automated email from the ASF dual-hosted git repository.
rcordier pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/james-project.git
The following commit(s) were added to refs/heads/master by this push:
new 537ae380f9 [PERF] JsoupHtmlTextExtractor without recursion (#1422)
537ae380f9 is described below
commit 537ae380f9837f74c075f0ed2b625affa9b20122
Author: Benoit TELLIER <bt...@linagora.com>
AuthorDate: Tue Feb 7 16:23:47 2023 +0700
[PERF] JsoupHtmlTextExtractor without recursion (#1422)
---
.../jmap/draft/utils/JsoupHtmlTextExtractor.java | 57 +++++++++++++++-------
.../draft/utils/JsoupHtmlTextExtractorTest.java | 10 ++++
2 files changed, 50 insertions(+), 17 deletions(-)
diff --git a/server/protocols/jmap-draft/src/main/java/org/apache/james/jmap/draft/utils/JsoupHtmlTextExtractor.java b/server/protocols/jmap-draft/src/main/java/org/apache/james/jmap/draft/utils/JsoupHtmlTextExtractor.java
index 4fb4204e7c..41fbe88b65 100644
--- a/server/protocols/jmap-draft/src/main/java/org/apache/james/jmap/draft/utils/JsoupHtmlTextExtractor.java
+++ b/server/protocols/jmap-draft/src/main/java/org/apache/james/jmap/draft/utils/JsoupHtmlTextExtractor.java
@@ -19,13 +19,16 @@
package org.apache.james.jmap.draft.utils;
+import java.util.Deque;
import java.util.Optional;
+import java.util.concurrent.ConcurrentLinkedDeque;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import org.apache.james.util.html.HtmlTextExtractor;
+import org.apache.james.util.streams.Iterators;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
@@ -55,7 +58,7 @@ public class JsoupHtmlTextExtractor implements HtmlTextExtractor {
Element body = Optional.ofNullable(document.body()).orElse(document);
- return flatten(body, INITIAL_LIST_NESTED_LEVEL)
+ return flatten(body)
.map(this::convertNodeToText)
.collect(Collectors.joining());
} catch (Exception e) {
@@ -110,23 +113,31 @@ public class JsoupHtmlTextExtractor implements HtmlTextExtractor {
}
}
- Stream<HTMLNode> flatten(Node base, int listNestedLevel) {
- Position position = getPosition(base);
- int nextElementLevel = getNewNestedLevel(listNestedLevel, base);
-
- Stream<HTMLNode> baseStream = Stream.of(new HTMLNode(base, listNestedLevel));
- Stream<HTMLNode> flatChildren = base.childNodes()
- .stream()
- .flatMap(node -> flatten(node, nextElementLevel));
-
- switch (position) {
- case PREFIX:
- return Stream.concat(baseStream, flatChildren);
- case SUFFIX:
- return Stream.concat(flatChildren, baseStream);
- default:
- throw new RuntimeException("Unexpected POSITION for node element: " + position);
+ Stream<HTMLNode> flatten(Node base) {
+ Deque<HTMLNode> in = new ConcurrentLinkedDeque<>();
+ in.addFirst(new HTMLNode(base, JsoupHtmlTextExtractor.INITIAL_LIST_NESTED_LEVEL));
+ Deque<HTMLNode> out = new ConcurrentLinkedDeque<>();
+
+ while (!in.isEmpty()) {
+ HTMLNode node = in.removeFirst();
+ if (node.isDone) {
+ out.addLast(node);
+ continue;
+ }
+ int nextElementLevel = getNewNestedLevel(node.listNestedLevel, node.underlyingNode);
+ Position position = getPosition(node.underlyingNode);
+
+ if (position == Position.SUFFIX) {
+ node.underlyingNode.childNodes()
+ .forEach(child -> in.addFirst(new HTMLNode(child, nextElementLevel)));
+ out.addLast(node);
+ } else {
+ in.addFirst(node.done());
+ node.underlyingNode.childNodes()
+ .forEach(child -> in.addFirst(new HTMLNode(child, nextElementLevel)));
+ }
}
+ return Iterators.toStream(out.descendingIterator());
}
private int getNewNestedLevel(int listNestedLevel, Node node) {
@@ -161,10 +172,22 @@ public class JsoupHtmlTextExtractor implements HtmlTextExtractor {
private static class HTMLNode {
private final Node underlyingNode;
private final int listNestedLevel;
+ private final boolean isDone;
+
+ public HTMLNode(Node underlyingNode, int listNestedLevel, boolean isDone) {
+ this.underlyingNode = underlyingNode;
+ this.listNestedLevel = listNestedLevel;
+ this.isDone = isDone;
+ }
public HTMLNode(Node underlyingNode, int listNestedLevel) {
this.underlyingNode = underlyingNode;
this.listNestedLevel = listNestedLevel;
+ this.isDone = false;
+ }
+
+ public HTMLNode done() {
+ return new HTMLNode(underlyingNode, listNestedLevel, true);
}
}
diff --git a/server/protocols/jmap-draft/src/test/java/org/apache/james/jmap/draft/utils/JsoupHtmlTextExtractorTest.java b/server/protocols/jmap-draft/src/test/java/org/apache/james/jmap/draft/utils/JsoupHtmlTextExtractorTest.java
index 86197a3b93..4829c00c43 100644
--- a/server/protocols/jmap-draft/src/test/java/org/apache/james/jmap/draft/utils/JsoupHtmlTextExtractorTest.java
+++ b/server/protocols/jmap-draft/src/test/java/org/apache/james/jmap/draft/utils/JsoupHtmlTextExtractorTest.java
@@ -27,6 +27,8 @@ import org.apache.commons.io.IOUtils;
import org.junit.Before;
import org.junit.Test;
+import com.google.common.base.Strings;
+
public class JsoupHtmlTextExtractorTest {
private JsoupHtmlTextExtractor textExtractor;
@@ -63,6 +65,14 @@ public class JsoupHtmlTextExtractorTest {
assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText);
}
+ @Test
+ public void deeplyNestedHtmlShouldNotThrowStackOverflow() {
+ final int count = 2048;
+ String html = Strings.repeat("<div>", count) + "<p>para1</p><p>para2</p>" + Strings.repeat("</div>", count);
+ String expectedPlainText = "para1\n\npara2\n\n";
+ assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText);
+ }
+
@Test
public void toPlainTextShouldConciderUpperCaseLabelsAsLowerCase() {
String html = "<P>para1</P><p>para2</p>";
---------------------------------------------------------------------
To unsubscribe, e-mail: notifications-unsubscribe@james.apache.org
For additional commands, e-mail: notifications-help@james.apache.org