You are viewing a plain text version of this content. The canonical link for it is here.
Posted to server-dev@james.apache.org by bt...@apache.org on 2018/07/11 03:34:36 UTC
[07/10] james-project git commit: JAMES-2456 Upgrade Tika / Tika client should not throw
JAMES-2456 Upgrade Tika / Tika client should not throw
Project: http://git-wip-us.apache.org/repos/asf/james-project/repo
Commit: http://git-wip-us.apache.org/repos/asf/james-project/commit/c8bd682a
Tree: http://git-wip-us.apache.org/repos/asf/james-project/tree/c8bd682a
Diff: http://git-wip-us.apache.org/repos/asf/james-project/diff/c8bd682a
Branch: refs/heads/master
Commit: c8bd682adfadeb3b68c2da7624f44c5d25d09b7f
Parents: 6ef1a29
Author: Antoine Duprat <ad...@linagora.com>
Authored: Fri Jul 6 08:45:36 2018 +0200
Committer: benwa <bt...@linagora.com>
Committed: Wed Jul 11 10:29:00 2018 +0700
----------------------------------------------------------------------
.../james/mailbox/tika/TikaException.java | 26 --------------------
.../james/mailbox/tika/TikaHttpClient.java | 3 ++-
.../james/mailbox/tika/TikaHttpClientImpl.java | 20 ++++++++-------
.../james/mailbox/tika/TikaTextExtractor.java | 26 ++++++++++++++------
.../mailbox/tika/TikaTextExtractorTest.java | 17 +++++++++----
.../org/apache/james/util/docker/Images.java | 2 +-
6 files changed, 45 insertions(+), 49 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/james-project/blob/c8bd682a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaException.java
----------------------------------------------------------------------
diff --git a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaException.java b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaException.java
deleted file mode 100644
index ecdc742..0000000
--- a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaException.java
+++ /dev/null
@@ -1,26 +0,0 @@
-/****************************************************************
- * Licensed to the Apache Software Foundation (ASF) under one *
- * or more contributor license agreements. See the NOTICE file *
- * distributed with this work for additional information *
- * regarding copyright ownership. The ASF licenses this file *
- * to you under the Apache License, Version 2.0 (the *
- * "License"); you may not use this file except in compliance *
- * with the License. You may obtain a copy of the License at *
- * *
- * http://www.apache.org/licenses/LICENSE-2.0 *
- * *
- * Unless required by applicable law or agreed to in writing, *
- * software distributed under the License is distributed on an *
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
- * KIND, either express or implied. See the License for the *
- * specific language governing permissions and limitations *
- * under the License. *
- ****************************************************************/
-package org.apache.james.mailbox.tika;
-
-public class TikaException extends RuntimeException {
-
- public TikaException(Exception exception) {
- super(exception);
- }
-}
http://git-wip-us.apache.org/repos/asf/james-project/blob/c8bd682a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClient.java
----------------------------------------------------------------------
diff --git a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClient.java b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClient.java
index e736d72..9e490db 100644
--- a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClient.java
+++ b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClient.java
@@ -19,8 +19,9 @@
package org.apache.james.mailbox.tika;
import java.io.InputStream;
+import java.util.Optional;
public interface TikaHttpClient {
- InputStream recursiveMetaDataAsJson(InputStream inputStream, String contentType) throws TikaException;
+ Optional<InputStream> recursiveMetaDataAsJson(InputStream inputStream, String contentType);
}
http://git-wip-us.apache.org/repos/asf/james-project/blob/c8bd682a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClientImpl.java
----------------------------------------------------------------------
diff --git a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClientImpl.java b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClientImpl.java
index a8d9df4..32ee7e6 100644
--- a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClientImpl.java
+++ b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClientImpl.java
@@ -22,6 +22,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
+import java.util.Optional;
import org.apache.http.client.fluent.Request;
import org.apache.http.client.utils.URIBuilder;
@@ -51,17 +52,18 @@ public class TikaHttpClientImpl implements TikaHttpClient {
}
@Override
- public InputStream recursiveMetaDataAsJson(InputStream inputStream, String contentType) throws TikaException {
+ public Optional<InputStream> recursiveMetaDataAsJson(InputStream inputStream, String contentType) {
try {
- return Request.Put(recursiveMetaData)
- .socketTimeout(tikaConfiguration.getTimeoutInMillis())
- .bodyStream(inputStream, ContentType.create(contentType))
- .execute()
- .returnContent()
- .asStream();
+ return Optional.ofNullable(
+ Request.Put(recursiveMetaData)
+ .socketTimeout(tikaConfiguration.getTimeoutInMillis())
+ .bodyStream(inputStream, ContentType.create(contentType))
+ .execute()
+ .returnContent()
+ .asStream());
} catch (IOException e) {
- LOGGER.error("Failing to call Tika", e);
- throw new TikaException(e);
+ LOGGER.warn("Failing to call Tika", e);
+ return Optional.empty();
}
}
http://git-wip-us.apache.org/repos/asf/james-project/blob/c8bd682a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaTextExtractor.java
----------------------------------------------------------------------
diff --git a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaTextExtractor.java b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaTextExtractor.java
index 955647e..305e2a1 100644
--- a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaTextExtractor.java
+++ b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaTextExtractor.java
@@ -25,6 +25,7 @@ import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
+import java.util.Optional;
import java.util.function.Predicate;
import javax.inject.Inject;
@@ -51,6 +52,7 @@ import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.MoreObjects;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
public class TikaTextExtractor implements TextExtractor {
@@ -82,11 +84,13 @@ public class TikaTextExtractor implements TextExtractor {
public ParsedContent performContentExtraction(InputStream inputStream, String contentType) throws IOException {
ContentAndMetadata contentAndMetadata = convert(tikaHttpClient.recursiveMetaDataAsJson(inputStream, contentType));
- return new ParsedContent(contentAndMetadata.getContent(), contentAndMetadata.getMetadata());
+ return new ParsedContent(contentAndMetadata.getContent().orElse(null), contentAndMetadata.getMetadata());
}
- private ContentAndMetadata convert(InputStream json) throws IOException, JsonParseException, JsonMappingException {
- return objectMapper.readValue(json, ContentAndMetadata.class);
+ private ContentAndMetadata convert(Optional<InputStream> maybeInputStream) throws IOException, JsonParseException, JsonMappingException {
+ return maybeInputStream
+ .map(Throwing.function(inputStream -> objectMapper.readValue(inputStream, ContentAndMetadata.class)))
+ .orElse(ContentAndMetadata.empty());
}
@VisibleForTesting
@@ -119,8 +123,12 @@ public class TikaTextExtractor implements TextExtractor {
private static final String TIKA_HEADER = "X-TIKA";
private static final String CONTENT_METADATA_HEADER_NAME = TIKA_HEADER + ":content";
+ public static ContentAndMetadata empty() {
+ return new ContentAndMetadata();
+ }
+
public static ContentAndMetadata from(Map<String, List<String>> contentAndMetadataMap) {
- return new ContentAndMetadata(content(contentAndMetadataMap),
+ return new ContentAndMetadata(Optional.ofNullable(content(contentAndMetadataMap)),
contentAndMetadataMap.entrySet().stream()
.filter(allHeadersButTika())
.collect(Guavate.toImmutableMap(Entry::getKey, Entry::getValue)));
@@ -139,15 +147,19 @@ public class TikaTextExtractor implements TextExtractor {
return StringUtils.stripStart(content.get(0), onlySpaces);
}
- private final String content;
+ private final Optional<String> content;
private final Map<String, List<String>> metadata;
- private ContentAndMetadata(String content, Map<String, List<String>> metadata) {
+ private ContentAndMetadata() {
+ this(Optional.empty(), ImmutableMap.of());
+ }
+
+ private ContentAndMetadata(Optional<String> content, Map<String, List<String>> metadata) {
this.content = content;
this.metadata = metadata;
}
- public String getContent() {
+ public Optional<String> getContent() {
return content;
}
http://git-wip-us.apache.org/repos/asf/james-project/blob/c8bd682a/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TikaTextExtractorTest.java
----------------------------------------------------------------------
diff --git a/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TikaTextExtractorTest.java b/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TikaTextExtractorTest.java
index 455a275..a78821f 100644
--- a/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TikaTextExtractorTest.java
+++ b/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TikaTextExtractorTest.java
@@ -27,6 +27,7 @@ import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.List;
+import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.james.mailbox.extractor.ParsedContent;
@@ -157,8 +158,10 @@ public class TikaTextExtractorTest {
@Test
public void deserializerShouldNotThrowWhenMoreThanOneNode() throws Exception {
TikaTextExtractor textExtractor = new TikaTextExtractor(
- new NoopMetricFactory(), (inputStream, contentType) -> new ByteArrayInputStream(("[{\"X-TIKA:content\": \"This is an awesome LibreOffice document !\"}, " +
- "{\"Chroma BlackIsZero\": \"true\"}]").getBytes(StandardCharsets.UTF_8)));
+ new NoopMetricFactory(),
+ (inputStream, contentType) -> Optional.of(new ByteArrayInputStream(("[{\"X-TIKA:content\": \"This is an awesome LibreOffice document !\"}, " +
+ "{\"Chroma BlackIsZero\": \"true\"}]")
+ .getBytes(StandardCharsets.UTF_8))));
InputStream inputStream = null;
textExtractor.extractContent(inputStream, "text/plain");
@@ -168,8 +171,10 @@ public class TikaTextExtractorTest {
public void deserializerShouldTakeFirstNodeWhenSeveral() throws Exception {
String expectedExtractedContent = "content A";
TikaTextExtractor textExtractor = new TikaTextExtractor(
- new NoopMetricFactory(), (inputStream, contentType) -> new ByteArrayInputStream(("[{\"X-TIKA:content\": \"" + expectedExtractedContent + "\"}, " +
- "{\"X-TIKA:content\": \"content B\"}]").getBytes(StandardCharsets.UTF_8)));
+ new NoopMetricFactory(),
+ (inputStream, contentType) -> Optional.of(new ByteArrayInputStream(("[{\"X-TIKA:content\": \"" + expectedExtractedContent + "\"}, " +
+ "{\"X-TIKA:content\": \"content B\"}]")
+ .getBytes(StandardCharsets.UTF_8))));
InputStream inputStream = null;
ParsedContent parsedContent = textExtractor.extractContent(inputStream, "text/plain");
@@ -183,7 +188,9 @@ public class TikaTextExtractorTest {
expectedException.expectMessage("The element should be a Json object");
TikaTextExtractor textExtractor = new TikaTextExtractor(
- new NoopMetricFactory(), (inputStream, contentType) -> new ByteArrayInputStream("[\"value1\"]".getBytes(StandardCharsets.UTF_8)));
+ new NoopMetricFactory(),
+ (inputStream, contentType) -> Optional.of(new ByteArrayInputStream("[\"value1\"]"
+ .getBytes(StandardCharsets.UTF_8))));
InputStream inputStream = null;
textExtractor.extractContent(inputStream, "text/plain");
http://git-wip-us.apache.org/repos/asf/james-project/blob/c8bd682a/server/container/util-java8/src/test/java/org/apache/james/util/docker/Images.java
----------------------------------------------------------------------
diff --git a/server/container/util-java8/src/test/java/org/apache/james/util/docker/Images.java b/server/container/util-java8/src/test/java/org/apache/james/util/docker/Images.java
index 549cf02..d0da2aa 100644
--- a/server/container/util-java8/src/test/java/org/apache/james/util/docker/Images.java
+++ b/server/container/util-java8/src/test/java/org/apache/james/util/docker/Images.java
@@ -24,6 +24,6 @@ public interface Images {
String RABBITMQ = "rabbitmq:3.7.5";
String ELASTICSEARCH = "elasticsearch:2.2.2";
String NGINX = "nginx:1.7.1";
- String TIKA = "logicalspark/docker-tikaserver:1.15rc2";
+ String TIKA = "linagora/docker-tikaserver:1.18-SNAPSHOT-plus-TIKA-2520";
String SPAMASSASSIN = "dinkel/spamassassin:3.4.0";
}
---------------------------------------------------------------------
To unsubscribe, e-mail: server-dev-unsubscribe@james.apache.org
For additional commands, e-mail: server-dev-help@james.apache.org