You are viewing a plain text version of this content. The canonical link for it is here.
Posted to server-dev@james.apache.org by rc...@apache.org on 2020/02/12 04:14:31 UTC
[james-project] 04/09: JAMES-3044 Test to prove JsoupTextExtractor fails on null characters
This is an automated email from the ASF dual-hosted git repository.
rcordier pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/james-project.git
commit 2958ee9f6f0d34f61f91950cb907156d7f39dc32
Author: Tran Tien Duc <dt...@linagora.com>
AuthorDate: Fri Feb 7 15:31:31 2020 +0700
JAMES-3044 Test to prove JsoupTextExtractor fails on null characters
---
.../store/extractor/JsoupTextExtractorTest.java | 30 ++++++++++++++++++++--
1 file changed, 28 insertions(+), 2 deletions(-)
diff --git a/mailbox/store/src/test/java/org/apache/james/mailbox/store/extractor/JsoupTextExtractorTest.java b/mailbox/store/src/test/java/org/apache/james/mailbox/store/extractor/JsoupTextExtractorTest.java
index 64bd9b9..2a8ec70 100644
--- a/mailbox/store/src/test/java/org/apache/james/mailbox/store/extractor/JsoupTextExtractorTest.java
+++ b/mailbox/store/src/test/java/org/apache/james/mailbox/store/extractor/JsoupTextExtractorTest.java
@@ -20,6 +20,7 @@
package org.apache.james.mailbox.store.extractor;
import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatCode;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
@@ -28,9 +29,13 @@ import java.nio.charset.StandardCharsets;
import org.apache.james.mailbox.extractor.ParsedContent;
import org.apache.james.mailbox.extractor.TextExtractor;
import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
class JsoupTextExtractorTest {
+
+ private static final String TEXT_HTML_CONTENT_TYPE = "text/html";
+
TextExtractor textExtractor;
@BeforeEach
@@ -42,7 +47,7 @@ class JsoupTextExtractorTest {
void extractedTextFromHtmlShouldNotContainTheContentOfTitleTag() throws Exception {
InputStream inputStream = ClassLoader.getSystemResourceAsStream("documents/html.txt");
- assertThat(textExtractor.extractContent(inputStream, "text/html").getTextualContent().get())
+ assertThat(textExtractor.extractContent(inputStream, TEXT_HTML_CONTENT_TYPE).getTextualContent().get())
.doesNotContain("*|MC:SUBJECT|*");
}
@@ -64,7 +69,7 @@ class JsoupTextExtractorTest {
@Test
void extractContentShouldReturnEmptyWhenNullData() throws Exception {
- assertThat(textExtractor.extractContent(null, "text/html"))
+ assertThat(textExtractor.extractContent(null, TEXT_HTML_CONTENT_TYPE))
.isEqualTo(ParsedContent.empty());
}
@@ -76,4 +81,25 @@ class JsoupTextExtractorTest {
.isEqualTo(ParsedContent.empty());
}
+ @Disabled("JAMES-3044 java.io.IOException: Input is binary and unsupported")
+ @Test
+ void extractContentShouldNotThrowWhenContainingNullCharacters() {
+ InputStream inputStream = textContentWithManyNullCharacters();
+
+ assertThatCode(() -> textExtractor.extractContent(inputStream, TEXT_HTML_CONTENT_TYPE))
+ .doesNotThrowAnyException();
+ }
+
+ private InputStream textContentWithManyNullCharacters() {
+ String htmlTextContent = "HTML pages can include a lot of null '\0' character. But still expecting the content can be parsed." +
+ "Jsoup 1.21.1 thinks a file containing more than 10 null characters can be a binary file";
+ byte[] htmlBytesContent = htmlTextContent.getBytes(StandardCharsets.UTF_8);
+ byte[] nullCharacters = {'\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0'};
+
+ byte[] fullContent = new byte[htmlBytesContent.length + nullCharacters.length];
+ System.arraycopy(htmlBytesContent, 0, fullContent, 0, htmlBytesContent.length);
+ System.arraycopy(nullCharacters, 0, fullContent, htmlBytesContent.length, nullCharacters.length);
+
+ return new ByteArrayInputStream(fullContent);
+ }
}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: server-dev-unsubscribe@james.apache.org
For additional commands, e-mail: server-dev-help@james.apache.org