You are viewing a plain text version of this content. The canonical link for it is here.
Posted to server-dev@james.apache.org by ad...@apache.org on 2017/09/08 20:14:52 UTC
[4/7] james-project git commit: JAMES-2137 Sanitize and test PDFTextExtractor
JAMES-2137 Sanitize and test PDFTextExtractor
Project: http://git-wip-us.apache.org/repos/asf/james-project/repo
Commit: http://git-wip-us.apache.org/repos/asf/james-project/commit/11e336a3
Tree: http://git-wip-us.apache.org/repos/asf/james-project/tree/11e336a3
Diff: http://git-wip-us.apache.org/repos/asf/james-project/diff/11e336a3
Branch: refs/heads/master
Commit: 11e336a32c23e7dad43ef4d373d9df04c03c6935
Parents: ad33cef
Author: benwa <bt...@linagora.com>
Authored: Fri Sep 8 10:03:38 2017 +0700
Committer: Antoine Duprat <ad...@linagora.com>
Committed: Fri Sep 8 21:56:59 2017 +0200
----------------------------------------------------------------------
.../mailbox/store/search/PDFTextExtractor.java | 26 +++----
.../store/search/PDFTextExtractorTest.java | 75 +++++++++++++++++++
.../scanning-search/src/test/resources/pdf.pdf | Bin 0 -> 14707 bytes
.../mailbox/store/search/MessageSearches.java | 21 ++++--
4 files changed, 101 insertions(+), 21 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/james-project/blob/11e336a3/mailbox/scanning-search/src/test/java/org/apache/james/mailbox/store/search/PDFTextExtractor.java
----------------------------------------------------------------------
diff --git a/mailbox/scanning-search/src/test/java/org/apache/james/mailbox/store/search/PDFTextExtractor.java b/mailbox/scanning-search/src/test/java/org/apache/james/mailbox/store/search/PDFTextExtractor.java
index 1a5b5eb..1e21b7e 100644
--- a/mailbox/scanning-search/src/test/java/org/apache/james/mailbox/store/search/PDFTextExtractor.java
+++ b/mailbox/scanning-search/src/test/java/org/apache/james/mailbox/store/search/PDFTextExtractor.java
@@ -28,36 +28,32 @@ import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import com.google.common.base.Charsets;
+import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
public class PDFTextExtractor implements TextExtractor {
- private static final String PDF_TYPE = "application/pdf";
+ static final String PDF_TYPE = "application/pdf";
@Override
public ParsedContent extractContent(InputStream inputStream, String contentType) throws Exception {
+ Preconditions.checkNotNull(inputStream);
+ Preconditions.checkNotNull(contentType);
+
if (isPDF(contentType)) {
return extractTextFromPDF(inputStream);
}
- try {
- return new ParsedContent(IOUtils.toString(inputStream, Charsets.UTF_8), ImmutableMap.of());
- } catch (IOException e) {
- return new ParsedContent(null, ImmutableMap.of());
- }
+ return new ParsedContent(IOUtils.toString(inputStream, Charsets.UTF_8), ImmutableMap.of());
}
private boolean isPDF(String contentType) {
return contentType.equals(PDF_TYPE);
}
- private ParsedContent extractTextFromPDF(InputStream inputStream) {
- try {
- return new ParsedContent(
- new PDFTextStripper().getText(
- PDDocument.load(inputStream)),
- ImmutableMap.of());
- } catch (IOException e) {
- return new ParsedContent(null, ImmutableMap.of());
- }
+ private ParsedContent extractTextFromPDF(InputStream inputStream) throws IOException {
+ return new ParsedContent(
+ new PDFTextStripper().getText(
+ PDDocument.load(inputStream)),
+ ImmutableMap.of());
}
}
http://git-wip-us.apache.org/repos/asf/james-project/blob/11e336a3/mailbox/scanning-search/src/test/java/org/apache/james/mailbox/store/search/PDFTextExtractorTest.java
----------------------------------------------------------------------
diff --git a/mailbox/scanning-search/src/test/java/org/apache/james/mailbox/store/search/PDFTextExtractorTest.java b/mailbox/scanning-search/src/test/java/org/apache/james/mailbox/store/search/PDFTextExtractorTest.java
new file mode 100644
index 0000000..df52009
--- /dev/null
+++ b/mailbox/scanning-search/src/test/java/org/apache/james/mailbox/store/search/PDFTextExtractorTest.java
@@ -0,0 +1,75 @@
+/****************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one *
+ * or more contributor license agreements. See the NOTICE file *
+ * distributed with this work for additional information *
+ * regarding copyright ownership. The ASF licenses this file *
+ * to you under the Apache License, Version 2.0 (the *
+ * "License"); you may not use this file except in compliance *
+ * with the License. You may obtain a copy of the License at *
+ * *
+ * http://www.apache.org/licenses/LICENSE-2.0 *
+ * *
+ * Unless required by applicable law or agreed to in writing, *
+ * software distributed under the License is distributed on an *
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
+ * KIND, either express or implied. See the License for the *
+ * specific language governing permissions and limitations *
+ * under the License. *
+ ****************************************************************/
+
+package org.apache.james.mailbox.store.search;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+
+import org.junit.Before;
+import org.junit.Test;
+
+public class PDFTextExtractorTest {
+
+ private PDFTextExtractor testee;
+
+ @Before
+ public void setUp() {
+ testee = new PDFTextExtractor();
+ }
+
+ @Test
+ public void extractContentShouldThrowWhenNullInputStream() throws Exception {
+ assertThatThrownBy(() ->
+ testee.extractContent(null, "any/any"))
+ .isInstanceOf(NullPointerException.class);
+ }
+
+ @Test
+ public void extractContentShouldThrowWhenNullContentType() throws Exception {
+ InputStream inputStream = new ByteArrayInputStream("content".getBytes(StandardCharsets.UTF_8));
+ assertThatThrownBy(() -> testee.extractContent(inputStream, null))
+ .isInstanceOf(NullPointerException.class);
+ }
+
+ @Test
+ public void extractContentShouldExtractPlainText() throws Exception {
+ String content = "content";
+ InputStream inputStream = new ByteArrayInputStream(content.getBytes(StandardCharsets.UTF_8));
+
+ assertThat(testee.extractContent(inputStream, "text/plain")
+ .getTextualContent())
+ .isEqualTo(content);
+ }
+
+ @Test
+ public void extractContentShouldExtractPDF() throws Exception {
+ String content = "Little PDF";
+ InputStream inputStream = ClassLoader.getSystemResourceAsStream("pdf.pdf");
+
+ assertThat(testee.extractContent(inputStream, PDFTextExtractor.PDF_TYPE)
+ .getTextualContent())
+ .contains(content);
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/james-project/blob/11e336a3/mailbox/scanning-search/src/test/resources/pdf.pdf
----------------------------------------------------------------------
diff --git a/mailbox/scanning-search/src/test/resources/pdf.pdf b/mailbox/scanning-search/src/test/resources/pdf.pdf
new file mode 100644
index 0000000..5388d4a
Binary files /dev/null and b/mailbox/scanning-search/src/test/resources/pdf.pdf differ
http://git-wip-us.apache.org/repos/asf/james-project/blob/11e336a3/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/MessageSearches.java
----------------------------------------------------------------------
diff --git a/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/MessageSearches.java b/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/MessageSearches.java
index d34c522..dc35559 100644
--- a/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/MessageSearches.java
+++ b/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/MessageSearches.java
@@ -35,13 +35,13 @@ import java.util.List;
import java.util.Locale;
import java.util.Optional;
import java.util.TimeZone;
+import java.util.stream.Stream;
import javax.mail.Flags;
import org.apache.james.mailbox.MessageUid;
import org.apache.james.mailbox.exception.MailboxException;
import org.apache.james.mailbox.exception.UnsupportedSearchException;
-import org.apache.james.mailbox.extractor.ParsedContent;
import org.apache.james.mailbox.extractor.TextExtractor;
import org.apache.james.mailbox.model.Attachment;
import org.apache.james.mailbox.model.MessageAttachment;
@@ -77,9 +77,7 @@ import org.apache.james.mime4j.utils.search.MessageMatcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import com.github.fge.lambdas.Throwing;
import com.google.common.collect.ImmutableList;
-import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
/**
@@ -252,12 +250,23 @@ public class MessageSearches implements Iterable<SimpleMessageSearchIndex.Search
private boolean isInAttachments(String value, List<MessageAttachment> attachments) {
return attachments.stream()
.map(MessageAttachment::getAttachment)
- .map(Throwing.function((Attachment attachment) -> textExtractor.extractContent(attachment.getStream(), attachment.getType()))
- .orReturn(new ParsedContent(null, ImmutableMap.of())))
- .map(ParsedContent::getTextualContent)
+ .flatMap(this::toAttachmentContent)
.anyMatch(string -> string.contains(value));
}
+ private Stream<String> toAttachmentContent(Attachment attachment) {
+ try {
+ return Stream.of(textExtractor
+ .extractContent(
+ attachment.getStream(),
+ attachment.getType())
+ .getTextualContent());
+ } catch (Exception e) {
+ LOGGER.error("Error while parsing attachment content", e);
+ return Stream.of();
+ }
+ }
+
private InputStream textHeaders(MailboxMessage message) throws MimeIOException, IOException {
ByteArrayOutputStream out = new ByteArrayOutputStream();
new DefaultMessageWriter()
---------------------------------------------------------------------
To unsubscribe, e-mail: server-dev-unsubscribe@james.apache.org
For additional commands, e-mail: server-dev-help@james.apache.org