You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/03/19 14:05:18 UTC
[tika] 01/02: TIKA-3073 -- allow gz compression of input and output streams in tika-server
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 4bab6a885b48e1d2b41b2fb305b761145aa92fbc
Author: tallison <ta...@apache.org>
AuthorDate: Thu Mar 19 10:03:54 2020 -0400
TIKA-3073 -- allow gz compression of input and output streams in tika-server
---
CHANGES.txt | 4 ++
.../java/org/apache/tika/server/TikaServerCli.java | 9 ++++
.../apache/tika/server/resource/TikaResource.java | 1 -
.../java/org/apache/tika/server/CXFTestBase.java | 26 ++++++++++-
.../tika/server/RecursiveMetadataResourceTest.java | 42 +++++++++++++++++-
.../org/apache/tika/server/TikaResourceTest.java | 51 ++++++++++++++++++++++
6 files changed, 130 insertions(+), 3 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 4b1eddb..19d1985 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -5,6 +5,10 @@ Release 2.0.0 - ???
Other changes
+Release 1.25 - ???
+
+ * Allow gzip compression of input and output streams for tika-server (TIKA-3073).
+
Release 1.24 - 3/11/2019
* Add scripts to run tika-server as a service via Eric Pugh,
diff --git a/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java b/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
index a049373..10616cd 100644
--- a/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
+++ b/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
@@ -22,6 +22,7 @@ import java.io.InputStream;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
@@ -37,6 +38,8 @@ import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
import org.apache.cxf.jaxrs.lifecycle.ResourceProvider;
import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
import org.apache.cxf.rs.security.cors.CrossOriginResourceSharingFilter;
+import org.apache.cxf.transport.common.gzip.GZIPInInterceptor;
+import org.apache.cxf.transport.common.gzip.GZIPOutInterceptor;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.parser.DigestingParser;
@@ -323,6 +326,12 @@ public class TikaServerCli {
}
sf.setProviders(providers);
+ //set compression interceptors
+ sf.setOutInterceptors(
+ Collections.singletonList(new GZIPOutInterceptor())
+ );
+ sf.setInInterceptors(
+ Collections.singletonList(new GZIPInInterceptor()));
String url = "http://" + host + ":" + port + "/";
sf.setAddress(url);
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
index c5bfa8f..0275b7e 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
@@ -75,7 +75,6 @@ import java.io.Writer;
import java.lang.reflect.Field;
import java.lang.reflect.Method;
import java.util.Locale;
-import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
diff --git a/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java b/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
index 32dd235..92c9d34 100644
--- a/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
+++ b/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
@@ -25,11 +25,15 @@ import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
+import java.util.ArrayList;
+import java.util.Collections;
import java.util.Enumeration;
import java.util.HashMap;
+import java.util.List;
import java.util.Map;
import org.apache.commons.codec.digest.DigestUtils;
@@ -37,11 +41,16 @@ import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipFile;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
import org.apache.commons.io.IOUtils;
import org.apache.cxf.binding.BindingFactoryManager;
import org.apache.cxf.endpoint.Server;
+import org.apache.cxf.interceptor.Interceptor;
import org.apache.cxf.jaxrs.JAXRSBindingFactory;
import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
+import org.apache.cxf.message.Message;
+import org.apache.cxf.transport.common.gzip.GZIPInInterceptor;
+import org.apache.cxf.transport.common.gzip.GZIPOutInterceptor;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.parser.utils.CommonsDigester;
import org.apache.tika.server.resource.TikaResource;
@@ -86,6 +95,13 @@ public abstract class CXFTestBase {
new CommonsDigester(DIGESTER_READ_LIMIT, "md5,sha1:32"),
new DefaultInputStreamFactory(), new ServerStatus(true));
JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean();
+ //set compression interceptors
+ sf.setOutInterceptors(
+ Collections.singletonList(new GZIPOutInterceptor())
+ );
+ sf.setInInterceptors(
+ Collections.singletonList(new GZIPInInterceptor()));
+
setUpResources(sf);
setUpProviders(sf);
sf.setAddress(endPoint + "/");
@@ -101,7 +117,6 @@ public abstract class CXFTestBase {
JAXRSBindingFactory.JAXRS_BINDING_ID,
factory
);
-
server = sf.create();
}
@@ -176,4 +191,13 @@ public abstract class CXFTestBase {
return tmp;
}
+ public static InputStream gzip(InputStream is) throws IOException {
+ ByteArrayOutputStream bos = new ByteArrayOutputStream();
+ OutputStream gz = new GzipCompressorOutputStream(bos);
+ IOUtils.copy(is, gz);
+ gz.flush();
+ gz.close();
+ return new ByteArrayInputStream(bos.toByteArray());
+ }
+
}
diff --git a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
index ec7e389..b878f47 100644
--- a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
@@ -25,21 +25,25 @@ import static org.junit.Assert.assertTrue;
import javax.ws.rs.core.Response;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
import org.apache.cxf.jaxrs.client.WebClient;
import org.apache.cxf.jaxrs.ext.multipart.Attachment;
import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
+import org.apache.tika.io.IOUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.serialization.JsonMetadataList;
-import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
import org.apache.tika.sax.RecursiveParserWrapperHandler;
import org.apache.tika.server.resource.RecursiveMetadataResource;
@@ -73,6 +77,42 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
}
@Test
+ public void testGZOut() throws Exception {
+ Response response = WebClient
+ .create(endPoint + META_PATH)
+ .accept("application/json")
+ .acceptEncoding("gzip")
+ .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+
+ Reader reader = new InputStreamReader(new GzipCompressorInputStream((InputStream) response.getEntity()), UTF_8);
+ List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
+ assertEquals(12, metadataList.size());
+ assertEquals("Microsoft Office Word", metadataList.get(0).get(OfficeOpenXMLExtended.APPLICATION));
+ assertContains("plundered our seas", metadataList.get(6).get("X-TIKA:content"));
+
+ assertEquals("a38e6c7b38541af87148dee9634cb811", metadataList.get(10).get("X-TIKA:digest:MD5"));
+ }
+
+ @Test
+ public void testGZIn() throws Exception {
+
+ Response response = WebClient
+ .create(endPoint + META_PATH)
+ .accept("application/json")
+ .encoding("gzip")
+ .put(gzip(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC)));
+
+ Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+ List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
+ assertEquals(12, metadataList.size());
+ assertEquals("Microsoft Office Word", metadataList.get(0).get(OfficeOpenXMLExtended.APPLICATION));
+ assertContains("plundered our seas", metadataList.get(6).get("X-TIKA:content"));
+
+ assertEquals("a38e6c7b38541af87148dee9634cb811", metadataList.get(10).get("X-TIKA:digest:MD5"));
+
+ }
+
+ @Test
public void testSimpleWord() throws Exception {
Response response = WebClient
.create(endPoint + META_PATH)
diff --git a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
index 3f65418..6b6fa23 100644
--- a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
@@ -18,6 +18,8 @@
package org.apache.tika.server;
import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
+import org.apache.commons.io.IOUtils;
import org.apache.cxf.attachment.AttachmentUtil;
import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
import org.apache.cxf.jaxrs.client.WebClient;
@@ -30,12 +32,17 @@ import org.junit.Ignore;
import org.junit.Test;
import javax.ws.rs.ProcessingException;
+import javax.ws.rs.core.MultivaluedMap;
import javax.ws.rs.core.Response;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
+import java.util.Set;
+import static org.apache.cxf.helpers.HttpHeaderHelper.CONTENT_ENCODING;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
@@ -85,6 +92,50 @@ public class TikaResourceTest extends CXFTestBase {
}
@Test
+ public void testWordGzipIn() throws Exception {
+ Response response = WebClient.create(endPoint + TIKA_PATH)
+ .type("application/msword")
+ .accept("text/plain")
+ .encoding("gzip")
+ .put(gzip(ClassLoader.getSystemResourceAsStream(TEST_DOC)));
+ String responseMsg = getStringFromInputStream((InputStream) response
+ .getEntity());
+ assertTrue(responseMsg.contains("test"));
+ }
+
+ @Test
+ public void testLongGzipOut() throws Exception {
+ //if the output is long enough, the CXF GZIPOutInterceptor will compress it, otherwise it won't
+ //this output is long enough, and should be compressed
+ Response response = WebClient.create(endPoint + TIKA_PATH)
+ .accept("text/plain")
+ .acceptEncoding("gzip")
+ .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+ assertTrue(response.getHeaders().containsKey(CONTENT_ENCODING));
+ assertEquals("gzip", response.getHeaderString(CONTENT_ENCODING));
+ String responseMsg = getStringFromInputStream(
+ new GzipCompressorInputStream((InputStream) response
+ .getEntity()));
+ assertTrue(responseMsg.contains("Course of human"));
+ }
+
+ @Test
+ public void testShortGzipOut() throws Exception {
+ //if the output is long enough, the CXF GZIPOutInterceptor will compress it, otherwise it won't
+ //this output is short enough, and should not be compressed
+ Response response = WebClient.create(endPoint + TIKA_PATH)
+ .accept("text/plain")
+ .acceptEncoding("gzip")
+ .put(ClassLoader.getSystemResourceAsStream(TEST_DOC));
+ assertFalse(response.getHeaders().containsKey(CONTENT_ENCODING));
+
+ String responseMsg = getStringFromInputStream(
+ (InputStream) response
+ .getEntity());
+ assertTrue(responseMsg.contains("test"));
+ }
+
+ @Test
public void testTextMain() throws Exception {
//boilerpipe
Response response = WebClient.create(endPoint + TIKA_PATH + "/main")