You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@james.apache.org by rc...@apache.org on 2023/02/07 09:23:52 UTC

[james-project] branch master updated: [PERF] JsoupHtmlTextExtractor without recursion (#1422)

This is an automated email from the ASF dual-hosted git repository.

rcordier pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/james-project.git


The following commit(s) were added to refs/heads/master by this push:
     new 537ae380f9 [PERF] JsoupHtmlTextExtractor without recursion (#1422)
537ae380f9 is described below

commit 537ae380f9837f74c075f0ed2b625affa9b20122
Author: Benoit TELLIER <bt...@linagora.com>
AuthorDate: Tue Feb 7 16:23:47 2023 +0700

    [PERF] JsoupHtmlTextExtractor without recursion (#1422)
---
 .../jmap/draft/utils/JsoupHtmlTextExtractor.java   | 57 +++++++++++++++-------
 .../draft/utils/JsoupHtmlTextExtractorTest.java    | 10 ++++
 2 files changed, 50 insertions(+), 17 deletions(-)

diff --git a/server/protocols/jmap-draft/src/main/java/org/apache/james/jmap/draft/utils/JsoupHtmlTextExtractor.java b/server/protocols/jmap-draft/src/main/java/org/apache/james/jmap/draft/utils/JsoupHtmlTextExtractor.java
index 4fb4204e7c..41fbe88b65 100644
--- a/server/protocols/jmap-draft/src/main/java/org/apache/james/jmap/draft/utils/JsoupHtmlTextExtractor.java
+++ b/server/protocols/jmap-draft/src/main/java/org/apache/james/jmap/draft/utils/JsoupHtmlTextExtractor.java
@@ -19,13 +19,16 @@
 
 package org.apache.james.jmap.draft.utils;
 
+import java.util.Deque;
 import java.util.Optional;
+import java.util.concurrent.ConcurrentLinkedDeque;
 import java.util.function.Predicate;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 
 import org.apache.commons.lang3.StringUtils;
 import org.apache.james.util.html.HtmlTextExtractor;
+import org.apache.james.util.streams.Iterators;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
@@ -55,7 +58,7 @@ public class JsoupHtmlTextExtractor implements HtmlTextExtractor {
 
             Element body = Optional.ofNullable(document.body()).orElse(document);
 
-            return flatten(body, INITIAL_LIST_NESTED_LEVEL)
+            return flatten(body)
                 .map(this::convertNodeToText)
                 .collect(Collectors.joining());
         } catch (Exception e) {
@@ -110,23 +113,31 @@ public class JsoupHtmlTextExtractor implements HtmlTextExtractor {
         }
     }
 
-    Stream<HTMLNode> flatten(Node base, int listNestedLevel) {
-        Position position = getPosition(base);
-        int nextElementLevel = getNewNestedLevel(listNestedLevel, base);
-
-        Stream<HTMLNode> baseStream = Stream.of(new HTMLNode(base, listNestedLevel));
-        Stream<HTMLNode> flatChildren = base.childNodes()
-            .stream()
-            .flatMap(node -> flatten(node, nextElementLevel));
-        
-        switch (position) {
-            case PREFIX:
-                return Stream.concat(baseStream, flatChildren);
-            case SUFFIX:
-                return Stream.concat(flatChildren, baseStream);
-            default:
-                throw new RuntimeException("Unexpected POSITION for node element: " + position);
+    Stream<HTMLNode> flatten(Node base) {
+        Deque<HTMLNode> in = new ConcurrentLinkedDeque<>();
+        in.addFirst(new HTMLNode(base, JsoupHtmlTextExtractor.INITIAL_LIST_NESTED_LEVEL));
+        Deque<HTMLNode> out = new ConcurrentLinkedDeque<>();
+
+        while (!in.isEmpty()) {
+            HTMLNode node = in.removeFirst();
+            if (node.isDone) {
+                out.addLast(node);
+                continue;
+            }
+            int nextElementLevel = getNewNestedLevel(node.listNestedLevel, node.underlyingNode);
+            Position position = getPosition(node.underlyingNode);
+
+            if (position == Position.SUFFIX) {
+                node.underlyingNode.childNodes()
+                    .forEach(child -> in.addFirst(new HTMLNode(child, nextElementLevel)));
+                out.addLast(node);
+            } else {
+                in.addFirst(node.done());
+                node.underlyingNode.childNodes()
+                    .forEach(child -> in.addFirst(new HTMLNode(child, nextElementLevel)));
+            }
         }
+        return Iterators.toStream(out.descendingIterator());
     }
 
     private int getNewNestedLevel(int listNestedLevel, Node node) {
@@ -161,10 +172,22 @@ public class JsoupHtmlTextExtractor implements HtmlTextExtractor {
     private static class HTMLNode {
         private final Node underlyingNode;
         private final int listNestedLevel;
+        private final boolean isDone;
+
+        public HTMLNode(Node underlyingNode, int listNestedLevel, boolean isDone) {
+            this.underlyingNode = underlyingNode;
+            this.listNestedLevel = listNestedLevel;
+            this.isDone = isDone;
+        }
 
         public HTMLNode(Node underlyingNode, int listNestedLevel) {
             this.underlyingNode = underlyingNode;
             this.listNestedLevel = listNestedLevel;
+            this.isDone = false;
+        }
+
+        public HTMLNode done() {
+            return new HTMLNode(underlyingNode, listNestedLevel, true);
         }
     }
 
diff --git a/server/protocols/jmap-draft/src/test/java/org/apache/james/jmap/draft/utils/JsoupHtmlTextExtractorTest.java b/server/protocols/jmap-draft/src/test/java/org/apache/james/jmap/draft/utils/JsoupHtmlTextExtractorTest.java
index 86197a3b93..4829c00c43 100644
--- a/server/protocols/jmap-draft/src/test/java/org/apache/james/jmap/draft/utils/JsoupHtmlTextExtractorTest.java
+++ b/server/protocols/jmap-draft/src/test/java/org/apache/james/jmap/draft/utils/JsoupHtmlTextExtractorTest.java
@@ -27,6 +27,8 @@ import org.apache.commons.io.IOUtils;
 import org.junit.Before;
 import org.junit.Test;
 
+import com.google.common.base.Strings;
+
 public class JsoupHtmlTextExtractorTest {
 
     private JsoupHtmlTextExtractor textExtractor;
@@ -63,6 +65,14 @@ public class JsoupHtmlTextExtractorTest {
         assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText);
     }
 
+    @Test
+    public void deeplyNestedHtmlShouldNotThrowStackOverflow() {
+        final int count = 2048;
+        String html = Strings.repeat("<div>", count) +  "<p>para1</p><p>para2</p>" + Strings.repeat("</div>", count);
+        String expectedPlainText = "para1\n\npara2\n\n";
+        assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText);
+    }
+
     @Test
     public void toPlainTextShouldConciderUpperCaseLabelsAsLowerCase() {
         String html = "<P>para1</P><p>para2</p>";


---------------------------------------------------------------------
To unsubscribe, e-mail: notifications-unsubscribe@james.apache.org
For additional commands, e-mail: notifications-help@james.apache.org