You are viewing a plain text version of this content. The canonical link for it is here.
Posted to server-dev@james.apache.org by bt...@apache.org on 2018/07/11 03:34:36 UTC

[07/10] james-project git commit: JAMES-2456 Upgrade Tika / Tika client should not throw

JAMES-2456 Upgrade Tika / Tika client should not throw


Project: http://git-wip-us.apache.org/repos/asf/james-project/repo
Commit: http://git-wip-us.apache.org/repos/asf/james-project/commit/c8bd682a
Tree: http://git-wip-us.apache.org/repos/asf/james-project/tree/c8bd682a
Diff: http://git-wip-us.apache.org/repos/asf/james-project/diff/c8bd682a

Branch: refs/heads/master
Commit: c8bd682adfadeb3b68c2da7624f44c5d25d09b7f
Parents: 6ef1a29
Author: Antoine Duprat <ad...@linagora.com>
Authored: Fri Jul 6 08:45:36 2018 +0200
Committer: benwa <bt...@linagora.com>
Committed: Wed Jul 11 10:29:00 2018 +0700

----------------------------------------------------------------------
 .../james/mailbox/tika/TikaException.java       | 26 --------------------
 .../james/mailbox/tika/TikaHttpClient.java      |  3 ++-
 .../james/mailbox/tika/TikaHttpClientImpl.java  | 20 ++++++++-------
 .../james/mailbox/tika/TikaTextExtractor.java   | 26 ++++++++++++++------
 .../mailbox/tika/TikaTextExtractorTest.java     | 17 +++++++++----
 .../org/apache/james/util/docker/Images.java    |  2 +-
 6 files changed, 45 insertions(+), 49 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/james-project/blob/c8bd682a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaException.java
----------------------------------------------------------------------
diff --git a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaException.java b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaException.java
deleted file mode 100644
index ecdc742..0000000
--- a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaException.java
+++ /dev/null
@@ -1,26 +0,0 @@
-/****************************************************************
- * Licensed to the Apache Software Foundation (ASF) under one   *
- * or more contributor license agreements.  See the NOTICE file *
- * distributed with this work for additional information        *
- * regarding copyright ownership.  The ASF licenses this file   *
- * to you under the Apache License, Version 2.0 (the            *
- * "License"); you may not use this file except in compliance   *
- * with the License.  You may obtain a copy of the License at   *
- *                                                              *
- *   http://www.apache.org/licenses/LICENSE-2.0                 *
- *                                                              *
- * Unless required by applicable law or agreed to in writing,   *
- * software distributed under the License is distributed on an  *
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
- * KIND, either express or implied.  See the License for the    *
- * specific language governing permissions and limitations      *
- * under the License.                                           *
- ****************************************************************/
-package org.apache.james.mailbox.tika;
-
-public class TikaException extends RuntimeException {
-
-    public TikaException(Exception exception) {
-        super(exception);
-    }
-}

http://git-wip-us.apache.org/repos/asf/james-project/blob/c8bd682a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClient.java
----------------------------------------------------------------------
diff --git a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClient.java b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClient.java
index e736d72..9e490db 100644
--- a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClient.java
+++ b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClient.java
@@ -19,8 +19,9 @@
 package org.apache.james.mailbox.tika;
 
 import java.io.InputStream;
+import java.util.Optional;
 
 public interface TikaHttpClient {
 
-    InputStream recursiveMetaDataAsJson(InputStream inputStream, String contentType) throws TikaException;
+    Optional<InputStream> recursiveMetaDataAsJson(InputStream inputStream, String contentType);
 }

http://git-wip-us.apache.org/repos/asf/james-project/blob/c8bd682a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClientImpl.java
----------------------------------------------------------------------
diff --git a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClientImpl.java b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClientImpl.java
index a8d9df4..32ee7e6 100644
--- a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClientImpl.java
+++ b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClientImpl.java
@@ -22,6 +22,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.net.URI;
 import java.net.URISyntaxException;
+import java.util.Optional;
 
 import org.apache.http.client.fluent.Request;
 import org.apache.http.client.utils.URIBuilder;
@@ -51,17 +52,18 @@ public class TikaHttpClientImpl implements TikaHttpClient {
     }
 
     @Override
-    public InputStream recursiveMetaDataAsJson(InputStream inputStream, String contentType) throws TikaException {
+    public Optional<InputStream> recursiveMetaDataAsJson(InputStream inputStream, String contentType) {
         try {
-            return Request.Put(recursiveMetaData)
-                    .socketTimeout(tikaConfiguration.getTimeoutInMillis())
-                    .bodyStream(inputStream, ContentType.create(contentType))
-                    .execute()
-                    .returnContent()
-                    .asStream();
+            return Optional.ofNullable(
+                    Request.Put(recursiveMetaData)
+                        .socketTimeout(tikaConfiguration.getTimeoutInMillis())
+                        .bodyStream(inputStream, ContentType.create(contentType))
+                        .execute()
+                        .returnContent()
+                        .asStream());
         } catch (IOException e) {
-            LOGGER.error("Failing to call Tika", e);
-            throw new TikaException(e);
+            LOGGER.warn("Failing to call Tika", e);
+            return Optional.empty();
         }
     }
 

http://git-wip-us.apache.org/repos/asf/james-project/blob/c8bd682a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaTextExtractor.java
----------------------------------------------------------------------
diff --git a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaTextExtractor.java b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaTextExtractor.java
index 955647e..305e2a1 100644
--- a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaTextExtractor.java
+++ b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaTextExtractor.java
@@ -25,6 +25,7 @@ import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
 import java.util.Objects;
+import java.util.Optional;
 import java.util.function.Predicate;
 
 import javax.inject.Inject;
@@ -51,6 +52,7 @@ import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.MoreObjects;
 import com.google.common.base.Preconditions;
 import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
 
 public class TikaTextExtractor implements TextExtractor {
 
@@ -82,11 +84,13 @@ public class TikaTextExtractor implements TextExtractor {
 
     public ParsedContent performContentExtraction(InputStream inputStream, String contentType) throws IOException {
         ContentAndMetadata contentAndMetadata = convert(tikaHttpClient.recursiveMetaDataAsJson(inputStream, contentType));
-        return new ParsedContent(contentAndMetadata.getContent(), contentAndMetadata.getMetadata());
+        return new ParsedContent(contentAndMetadata.getContent().orElse(null), contentAndMetadata.getMetadata());
     }
 
-    private ContentAndMetadata convert(InputStream json) throws IOException, JsonParseException, JsonMappingException {
-        return objectMapper.readValue(json, ContentAndMetadata.class);
+    private ContentAndMetadata convert(Optional<InputStream> maybeInputStream) throws IOException, JsonParseException, JsonMappingException {
+        return maybeInputStream
+                .map(Throwing.function(inputStream -> objectMapper.readValue(inputStream, ContentAndMetadata.class)))
+                .orElse(ContentAndMetadata.empty());
     }
 
     @VisibleForTesting
@@ -119,8 +123,12 @@ public class TikaTextExtractor implements TextExtractor {
         private static final String TIKA_HEADER = "X-TIKA";
         private static final String CONTENT_METADATA_HEADER_NAME = TIKA_HEADER + ":content";
 
+        public static ContentAndMetadata empty() {
+            return new ContentAndMetadata();
+        }
+
         public static ContentAndMetadata from(Map<String, List<String>> contentAndMetadataMap) {
-            return new ContentAndMetadata(content(contentAndMetadataMap),
+            return new ContentAndMetadata(Optional.ofNullable(content(contentAndMetadataMap)),
                     contentAndMetadataMap.entrySet().stream()
                         .filter(allHeadersButTika())
                         .collect(Guavate.toImmutableMap(Entry::getKey, Entry::getValue)));
@@ -139,15 +147,19 @@ public class TikaTextExtractor implements TextExtractor {
             return StringUtils.stripStart(content.get(0), onlySpaces);
         }
 
-        private final String content;
+        private final Optional<String> content;
         private final Map<String, List<String>> metadata;
 
-        private ContentAndMetadata(String content, Map<String, List<String>> metadata) {
+        private ContentAndMetadata() {
+            this(Optional.empty(), ImmutableMap.of());
+        }
+
+        private ContentAndMetadata(Optional<String> content, Map<String, List<String>> metadata) {
             this.content = content;
             this.metadata = metadata;
         }
 
-        public String getContent() {
+        public Optional<String> getContent() {
             return content;
         }
 

http://git-wip-us.apache.org/repos/asf/james-project/blob/c8bd682a/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TikaTextExtractorTest.java
----------------------------------------------------------------------
diff --git a/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TikaTextExtractorTest.java b/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TikaTextExtractorTest.java
index 455a275..a78821f 100644
--- a/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TikaTextExtractorTest.java
+++ b/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TikaTextExtractorTest.java
@@ -27,6 +27,7 @@ import java.io.ByteArrayInputStream;
 import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
 import java.util.List;
+import java.util.Optional;
 
 import org.apache.commons.io.IOUtils;
 import org.apache.james.mailbox.extractor.ParsedContent;
@@ -157,8 +158,10 @@ public class TikaTextExtractorTest {
     @Test
     public void deserializerShouldNotThrowWhenMoreThanOneNode() throws Exception {
         TikaTextExtractor textExtractor = new TikaTextExtractor(
-            new NoopMetricFactory(), (inputStream, contentType) -> new ByteArrayInputStream(("[{\"X-TIKA:content\": \"This is an awesome LibreOffice document !\"}, " +
-                "{\"Chroma BlackIsZero\": \"true\"}]").getBytes(StandardCharsets.UTF_8)));
+            new NoopMetricFactory(), 
+            (inputStream, contentType) -> Optional.of(new ByteArrayInputStream(("[{\"X-TIKA:content\": \"This is an awesome LibreOffice document !\"}, " +
+                                                            "{\"Chroma BlackIsZero\": \"true\"}]")
+                                                        .getBytes(StandardCharsets.UTF_8))));
 
         InputStream inputStream = null;
         textExtractor.extractContent(inputStream, "text/plain");
@@ -168,8 +171,10 @@ public class TikaTextExtractorTest {
     public void deserializerShouldTakeFirstNodeWhenSeveral() throws Exception {
         String expectedExtractedContent = "content A";
         TikaTextExtractor textExtractor = new TikaTextExtractor(
-            new NoopMetricFactory(), (inputStream, contentType) -> new ByteArrayInputStream(("[{\"X-TIKA:content\": \"" + expectedExtractedContent + "\"}, " +
-                "{\"X-TIKA:content\": \"content B\"}]").getBytes(StandardCharsets.UTF_8)));
+            new NoopMetricFactory(), 
+            (inputStream, contentType) -> Optional.of(new ByteArrayInputStream(("[{\"X-TIKA:content\": \"" + expectedExtractedContent + "\"}, " +
+                                                            "{\"X-TIKA:content\": \"content B\"}]")
+                                                        .getBytes(StandardCharsets.UTF_8))));
 
         InputStream inputStream = null;
         ParsedContent parsedContent = textExtractor.extractContent(inputStream, "text/plain");
@@ -183,7 +188,9 @@ public class TikaTextExtractorTest {
         expectedException.expectMessage("The element should be a Json object");
 
         TikaTextExtractor textExtractor = new TikaTextExtractor(
-            new NoopMetricFactory(), (inputStream, contentType) -> new ByteArrayInputStream("[\"value1\"]".getBytes(StandardCharsets.UTF_8)));
+            new NoopMetricFactory(), 
+            (inputStream, contentType) -> Optional.of(new ByteArrayInputStream("[\"value1\"]"
+                                                        .getBytes(StandardCharsets.UTF_8))));
 
         InputStream inputStream = null;
         textExtractor.extractContent(inputStream, "text/plain");

http://git-wip-us.apache.org/repos/asf/james-project/blob/c8bd682a/server/container/util-java8/src/test/java/org/apache/james/util/docker/Images.java
----------------------------------------------------------------------
diff --git a/server/container/util-java8/src/test/java/org/apache/james/util/docker/Images.java b/server/container/util-java8/src/test/java/org/apache/james/util/docker/Images.java
index 549cf02..d0da2aa 100644
--- a/server/container/util-java8/src/test/java/org/apache/james/util/docker/Images.java
+++ b/server/container/util-java8/src/test/java/org/apache/james/util/docker/Images.java
@@ -24,6 +24,6 @@ public interface Images {
     String RABBITMQ = "rabbitmq:3.7.5";
     String ELASTICSEARCH = "elasticsearch:2.2.2";
     String NGINX = "nginx:1.7.1";
-    String TIKA = "logicalspark/docker-tikaserver:1.15rc2";
+    String TIKA = "linagora/docker-tikaserver:1.18-SNAPSHOT-plus-TIKA-2520";
     String SPAMASSASSIN = "dinkel/spamassassin:3.4.0";
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: server-dev-unsubscribe@james.apache.org
For additional commands, e-mail: server-dev-help@james.apache.org