You are viewing a plain text version of this content. The canonical link for it is here.
Posted to server-dev@james.apache.org by ad...@apache.org on 2017/09/08 20:14:52 UTC

[4/7] james-project git commit: JAMES-2137 Sanitize and test PDFTextExtractor

JAMES-2137 Sanitize and test PDFTextExtractor


Project: http://git-wip-us.apache.org/repos/asf/james-project/repo
Commit: http://git-wip-us.apache.org/repos/asf/james-project/commit/11e336a3
Tree: http://git-wip-us.apache.org/repos/asf/james-project/tree/11e336a3
Diff: http://git-wip-us.apache.org/repos/asf/james-project/diff/11e336a3

Branch: refs/heads/master
Commit: 11e336a32c23e7dad43ef4d373d9df04c03c6935
Parents: ad33cef
Author: benwa <bt...@linagora.com>
Authored: Fri Sep 8 10:03:38 2017 +0700
Committer: Antoine Duprat <ad...@linagora.com>
Committed: Fri Sep 8 21:56:59 2017 +0200

----------------------------------------------------------------------
 .../mailbox/store/search/PDFTextExtractor.java  |  26 +++----
 .../store/search/PDFTextExtractorTest.java      |  75 +++++++++++++++++++
 .../scanning-search/src/test/resources/pdf.pdf  | Bin 0 -> 14707 bytes
 .../mailbox/store/search/MessageSearches.java   |  21 ++++--
 4 files changed, 101 insertions(+), 21 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/james-project/blob/11e336a3/mailbox/scanning-search/src/test/java/org/apache/james/mailbox/store/search/PDFTextExtractor.java
----------------------------------------------------------------------
diff --git a/mailbox/scanning-search/src/test/java/org/apache/james/mailbox/store/search/PDFTextExtractor.java b/mailbox/scanning-search/src/test/java/org/apache/james/mailbox/store/search/PDFTextExtractor.java
index 1a5b5eb..1e21b7e 100644
--- a/mailbox/scanning-search/src/test/java/org/apache/james/mailbox/store/search/PDFTextExtractor.java
+++ b/mailbox/scanning-search/src/test/java/org/apache/james/mailbox/store/search/PDFTextExtractor.java
@@ -28,36 +28,32 @@ import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.text.PDFTextStripper;
 
 import com.google.common.base.Charsets;
+import com.google.common.base.Preconditions;
 import com.google.common.collect.ImmutableMap;
 
 public class PDFTextExtractor implements TextExtractor {
 
-    private static final String PDF_TYPE = "application/pdf";
+    static final String PDF_TYPE = "application/pdf";
 
     @Override
     public ParsedContent extractContent(InputStream inputStream, String contentType) throws Exception {
+        Preconditions.checkNotNull(inputStream);
+        Preconditions.checkNotNull(contentType);
+
         if (isPDF(contentType)) {
             return extractTextFromPDF(inputStream);
         }
-        try {
-            return new ParsedContent(IOUtils.toString(inputStream, Charsets.UTF_8), ImmutableMap.of());
-        } catch (IOException e) {
-            return new ParsedContent(null, ImmutableMap.of());
-        }
+        return new ParsedContent(IOUtils.toString(inputStream, Charsets.UTF_8), ImmutableMap.of());
     }
 
     private boolean isPDF(String contentType) {
         return contentType.equals(PDF_TYPE);
     }
 
-    private ParsedContent extractTextFromPDF(InputStream inputStream) {
-        try {
-            return new ParsedContent(
-                    new PDFTextStripper().getText(
-                            PDDocument.load(inputStream)),
-                    ImmutableMap.of());
-        } catch (IOException e) {
-            return new ParsedContent(null, ImmutableMap.of());
-        }
+    private ParsedContent extractTextFromPDF(InputStream inputStream) throws IOException {
+        return new ParsedContent(
+            new PDFTextStripper().getText(
+                PDDocument.load(inputStream)),
+            ImmutableMap.of());
     }
 }

http://git-wip-us.apache.org/repos/asf/james-project/blob/11e336a3/mailbox/scanning-search/src/test/java/org/apache/james/mailbox/store/search/PDFTextExtractorTest.java
----------------------------------------------------------------------
diff --git a/mailbox/scanning-search/src/test/java/org/apache/james/mailbox/store/search/PDFTextExtractorTest.java b/mailbox/scanning-search/src/test/java/org/apache/james/mailbox/store/search/PDFTextExtractorTest.java
new file mode 100644
index 0000000..df52009
--- /dev/null
+++ b/mailbox/scanning-search/src/test/java/org/apache/james/mailbox/store/search/PDFTextExtractorTest.java
@@ -0,0 +1,75 @@
+/****************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one   *
+ * or more contributor license agreements.  See the NOTICE file *
+ * distributed with this work for additional information        *
+ * regarding copyright ownership.  The ASF licenses this file   *
+ * to you under the Apache License, Version 2.0 (the            *
+ * "License"); you may not use this file except in compliance   *
+ * with the License.  You may obtain a copy of the License at   *
+ *                                                              *
+ *   http://www.apache.org/licenses/LICENSE-2.0                 *
+ *                                                              *
+ * Unless required by applicable law or agreed to in writing,   *
+ * software distributed under the License is distributed on an  *
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
+ * KIND, either express or implied.  See the License for the    *
+ * specific language governing permissions and limitations      *
+ * under the License.                                           *
+ ****************************************************************/
+
+package org.apache.james.mailbox.store.search;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+
+import org.junit.Before;
+import org.junit.Test;
+
+public class PDFTextExtractorTest {
+
+    private PDFTextExtractor testee;
+
+    @Before
+    public void setUp() {
+        testee = new PDFTextExtractor();
+    }
+
+    @Test
+    public void extractContentShouldThrowWhenNullInputStream() throws Exception {
+        assertThatThrownBy(() ->
+            testee.extractContent(null, "any/any"))
+            .isInstanceOf(NullPointerException.class);
+    }
+
+    @Test
+    public void extractContentShouldThrowWhenNullContentType() throws Exception {
+        InputStream inputStream = new ByteArrayInputStream("content".getBytes(StandardCharsets.UTF_8));
+        assertThatThrownBy(() -> testee.extractContent(inputStream, null))
+            .isInstanceOf(NullPointerException.class);
+    }
+
+    @Test
+    public void extractContentShouldExtractPlainText() throws Exception {
+        String content = "content";
+        InputStream inputStream = new ByteArrayInputStream(content.getBytes(StandardCharsets.UTF_8));
+
+        assertThat(testee.extractContent(inputStream, "text/plain")
+            .getTextualContent())
+            .isEqualTo(content);
+    }
+
+    @Test
+    public void extractContentShouldExtractPDF() throws Exception {
+        String content = "Little PDF";
+        InputStream inputStream = ClassLoader.getSystemResourceAsStream("pdf.pdf");
+
+        assertThat(testee.extractContent(inputStream, PDFTextExtractor.PDF_TYPE)
+            .getTextualContent())
+            .contains(content);
+    }
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/james-project/blob/11e336a3/mailbox/scanning-search/src/test/resources/pdf.pdf
----------------------------------------------------------------------
diff --git a/mailbox/scanning-search/src/test/resources/pdf.pdf b/mailbox/scanning-search/src/test/resources/pdf.pdf
new file mode 100644
index 0000000..5388d4a
Binary files /dev/null and b/mailbox/scanning-search/src/test/resources/pdf.pdf differ

http://git-wip-us.apache.org/repos/asf/james-project/blob/11e336a3/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/MessageSearches.java
----------------------------------------------------------------------
diff --git a/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/MessageSearches.java b/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/MessageSearches.java
index d34c522..dc35559 100644
--- a/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/MessageSearches.java
+++ b/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/MessageSearches.java
@@ -35,13 +35,13 @@ import java.util.List;
 import java.util.Locale;
 import java.util.Optional;
 import java.util.TimeZone;
+import java.util.stream.Stream;
 
 import javax.mail.Flags;
 
 import org.apache.james.mailbox.MessageUid;
 import org.apache.james.mailbox.exception.MailboxException;
 import org.apache.james.mailbox.exception.UnsupportedSearchException;
-import org.apache.james.mailbox.extractor.ParsedContent;
 import org.apache.james.mailbox.extractor.TextExtractor;
 import org.apache.james.mailbox.model.Attachment;
 import org.apache.james.mailbox.model.MessageAttachment;
@@ -77,9 +77,7 @@ import org.apache.james.mime4j.utils.search.MessageMatcher;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import com.github.fge.lambdas.Throwing;
 import com.google.common.collect.ImmutableList;
-import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.Lists;
 
 /**
@@ -252,12 +250,23 @@ public class MessageSearches implements Iterable<SimpleMessageSearchIndex.Search
     private boolean isInAttachments(String value, List<MessageAttachment> attachments) {
         return attachments.stream()
             .map(MessageAttachment::getAttachment)
-            .map(Throwing.function((Attachment attachment) -> textExtractor.extractContent(attachment.getStream(), attachment.getType()))
-                    .orReturn(new ParsedContent(null, ImmutableMap.of())))
-            .map(ParsedContent::getTextualContent)
+            .flatMap(this::toAttachmentContent)
             .anyMatch(string -> string.contains(value));
     }
 
+    private Stream<String> toAttachmentContent(Attachment attachment) {
+        try {
+            return Stream.of(textExtractor
+                .extractContent(
+                    attachment.getStream(),
+                    attachment.getType())
+                .getTextualContent());
+        } catch (Exception e) {
+            LOGGER.error("Error while parsing attachment content", e);
+            return Stream.of();
+        }
+    }
+
     private InputStream textHeaders(MailboxMessage message) throws MimeIOException, IOException {
         ByteArrayOutputStream out = new ByteArrayOutputStream();
         new DefaultMessageWriter()


---------------------------------------------------------------------
To unsubscribe, e-mail: server-dev-unsubscribe@james.apache.org
For additional commands, e-mail: server-dev-help@james.apache.org