You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/03/19 14:05:18 UTC

[tika] 01/02: TIKA-3073 -- allow gz compression of input and output streams in tika-server

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 4bab6a885b48e1d2b41b2fb305b761145aa92fbc
Author: tallison <ta...@apache.org>
AuthorDate: Thu Mar 19 10:03:54 2020 -0400

    TIKA-3073 -- allow gz compression of input and output streams in tika-server
---
 CHANGES.txt                                        |  4 ++
 .../java/org/apache/tika/server/TikaServerCli.java |  9 ++++
 .../apache/tika/server/resource/TikaResource.java  |  1 -
 .../java/org/apache/tika/server/CXFTestBase.java   | 26 ++++++++++-
 .../tika/server/RecursiveMetadataResourceTest.java | 42 +++++++++++++++++-
 .../org/apache/tika/server/TikaResourceTest.java   | 51 ++++++++++++++++++++++
 6 files changed, 130 insertions(+), 3 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 4b1eddb..19d1985 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -5,6 +5,10 @@ Release 2.0.0 - ???
 
    Other changes
 
+Release 1.25 - ???
+
+   * Allow gzip compression of input and output streams for tika-server (TIKA-3073).
+
 Release 1.24 - 3/11/2019
 
    * Add scripts to run tika-server as a service via Eric Pugh,
diff --git a/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java b/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
index a049373..10616cd 100644
--- a/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
+++ b/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
@@ -22,6 +22,7 @@ import java.io.InputStream;
 import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
@@ -37,6 +38,8 @@ import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
 import org.apache.cxf.jaxrs.lifecycle.ResourceProvider;
 import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
 import org.apache.cxf.rs.security.cors.CrossOriginResourceSharingFilter;
+import org.apache.cxf.transport.common.gzip.GZIPInInterceptor;
+import org.apache.cxf.transport.common.gzip.GZIPOutInterceptor;
 import org.apache.tika.Tika;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.parser.DigestingParser;
@@ -323,6 +326,12 @@ public class TikaServerCli {
             }
             sf.setProviders(providers);
 
+            //set compression interceptors
+            sf.setOutInterceptors(
+                    Collections.singletonList(new GZIPOutInterceptor())
+            );
+            sf.setInInterceptors(
+                    Collections.singletonList(new GZIPInInterceptor()));
 
             String url = "http://" + host + ":" + port + "/";
             sf.setAddress(url);
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
index c5bfa8f..0275b7e 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
@@ -75,7 +75,6 @@ import java.io.Writer;
 import java.lang.reflect.Field;
 import java.lang.reflect.Method;
 import java.util.Locale;
-import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
diff --git a/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java b/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
index 32dd235..92c9d34 100644
--- a/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
+++ b/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
@@ -25,11 +25,15 @@ import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.OutputStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardCopyOption;
+import java.util.ArrayList;
+import java.util.Collections;
 import java.util.Enumeration;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 
 import org.apache.commons.codec.digest.DigestUtils;
@@ -37,11 +41,16 @@ import org.apache.commons.compress.archivers.ArchiveEntry;
 import org.apache.commons.compress.archivers.ArchiveInputStream;
 import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
 import org.apache.commons.compress.archivers.zip.ZipFile;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
 import org.apache.commons.io.IOUtils;
 import org.apache.cxf.binding.BindingFactoryManager;
 import org.apache.cxf.endpoint.Server;
+import org.apache.cxf.interceptor.Interceptor;
 import org.apache.cxf.jaxrs.JAXRSBindingFactory;
 import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
+import org.apache.cxf.message.Message;
+import org.apache.cxf.transport.common.gzip.GZIPInInterceptor;
+import org.apache.cxf.transport.common.gzip.GZIPOutInterceptor;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.parser.utils.CommonsDigester;
 import org.apache.tika.server.resource.TikaResource;
@@ -86,6 +95,13 @@ public abstract class CXFTestBase {
                 new CommonsDigester(DIGESTER_READ_LIMIT, "md5,sha1:32"),
                 new DefaultInputStreamFactory(), new ServerStatus(true));
         JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean();
+        //set compression interceptors
+        sf.setOutInterceptors(
+                Collections.singletonList(new GZIPOutInterceptor())
+        );
+        sf.setInInterceptors(
+                Collections.singletonList(new GZIPInInterceptor()));
+
         setUpResources(sf);
         setUpProviders(sf);
         sf.setAddress(endPoint + "/");
@@ -101,7 +117,6 @@ public abstract class CXFTestBase {
                 JAXRSBindingFactory.JAXRS_BINDING_ID,
                 factory
         );
-
         server = sf.create();
     }
 
@@ -176,4 +191,13 @@ public abstract class CXFTestBase {
         return tmp;
     }
 
+    public static InputStream gzip(InputStream is) throws IOException {
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        OutputStream gz = new GzipCompressorOutputStream(bos);
+        IOUtils.copy(is, gz);
+        gz.flush();
+        gz.close();
+        return new ByteArrayInputStream(bos.toByteArray());
+    }
+
 }
diff --git a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
index ec7e389..b878f47 100644
--- a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
@@ -25,21 +25,25 @@ import static org.junit.Assert.assertTrue;
 
 import javax.ws.rs.core.Response;
 
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.util.ArrayList;
 import java.util.List;
 
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
 import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
 import org.apache.cxf.jaxrs.client.WebClient;
 import org.apache.cxf.jaxrs.ext.multipart.Attachment;
 import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
+import org.apache.tika.io.IOUtils;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.OfficeOpenXMLExtended;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.metadata.serialization.JsonMetadataList;
-import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
 import org.apache.tika.sax.RecursiveParserWrapperHandler;
 import org.apache.tika.server.resource.RecursiveMetadataResource;
@@ -73,6 +77,42 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
     }
 
     @Test
+    public void testGZOut() throws Exception {
+        Response response = WebClient
+                .create(endPoint + META_PATH)
+                .accept("application/json")
+                .acceptEncoding("gzip")
+                .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+
+        Reader reader = new InputStreamReader(new GzipCompressorInputStream((InputStream) response.getEntity()), UTF_8);
+        List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
+        assertEquals(12, metadataList.size());
+        assertEquals("Microsoft Office Word", metadataList.get(0).get(OfficeOpenXMLExtended.APPLICATION));
+        assertContains("plundered our seas", metadataList.get(6).get("X-TIKA:content"));
+
+        assertEquals("a38e6c7b38541af87148dee9634cb811", metadataList.get(10).get("X-TIKA:digest:MD5"));
+    }
+
+    @Test
+    public void testGZIn() throws Exception {
+
+        Response response = WebClient
+                .create(endPoint + META_PATH)
+                .accept("application/json")
+                .encoding("gzip")
+                .put(gzip(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC)));
+
+        Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+        List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
+        assertEquals(12, metadataList.size());
+        assertEquals("Microsoft Office Word", metadataList.get(0).get(OfficeOpenXMLExtended.APPLICATION));
+        assertContains("plundered our seas", metadataList.get(6).get("X-TIKA:content"));
+
+        assertEquals("a38e6c7b38541af87148dee9634cb811", metadataList.get(10).get("X-TIKA:digest:MD5"));
+
+    }
+
+    @Test
     public void testSimpleWord() throws Exception {
         Response response = WebClient
                 .create(endPoint + META_PATH)
diff --git a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
index 3f65418..6b6fa23 100644
--- a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
@@ -18,6 +18,8 @@
 package org.apache.tika.server;
 
 import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
+import org.apache.commons.io.IOUtils;
 import org.apache.cxf.attachment.AttachmentUtil;
 import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
 import org.apache.cxf.jaxrs.client.WebClient;
@@ -30,12 +32,17 @@ import org.junit.Ignore;
 import org.junit.Test;
 
 import javax.ws.rs.ProcessingException;
+import javax.ws.rs.core.MultivaluedMap;
 import javax.ws.rs.core.Response;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
 import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Set;
 
+import static org.apache.cxf.helpers.HttpHeaderHelper.CONTENT_ENCODING;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
@@ -85,6 +92,50 @@ public class TikaResourceTest extends CXFTestBase {
     }
 
     @Test
+    public void testWordGzipIn() throws Exception {
+        Response response = WebClient.create(endPoint + TIKA_PATH)
+                .type("application/msword")
+                .accept("text/plain")
+                .encoding("gzip")
+                .put(gzip(ClassLoader.getSystemResourceAsStream(TEST_DOC)));
+        String responseMsg = getStringFromInputStream((InputStream) response
+                .getEntity());
+        assertTrue(responseMsg.contains("test"));
+    }
+
+    @Test
+    public void testLongGzipOut() throws Exception {
+        //if the output is long enough, jax-rs will compress it, otherwise it won't
+        //this output is long enough, and should be compressed
+        Response response = WebClient.create(endPoint + TIKA_PATH)
+                .accept("text/plain")
+                .acceptEncoding("gzip")
+                .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+        assertTrue(response.getHeaders().containsKey(CONTENT_ENCODING));
+        assertEquals("gzip", response.getHeaderString(CONTENT_ENCODING));
+        String responseMsg = getStringFromInputStream(
+                new GzipCompressorInputStream((InputStream) response
+                        .getEntity()));
+        assertTrue(responseMsg.contains("Course of human"));
+    }
+
+    @Test
+    public void testShortGzipOut() throws Exception {
+        //if the output is long enough, jax-rs will compress it, otherwise it won't
+        //this output is short enough, and should not be compressed
+        Response response = WebClient.create(endPoint + TIKA_PATH)
+                .accept("text/plain")
+                .acceptEncoding("gzip")
+                .put(ClassLoader.getSystemResourceAsStream(TEST_DOC));
+        assertFalse(response.getHeaders().containsKey(CONTENT_ENCODING));
+
+        String responseMsg = getStringFromInputStream(
+                (InputStream) response
+                        .getEntity());
+        assertTrue(responseMsg.contains("test"));
+    }
+
+    @Test
     public void testTextMain() throws Exception {
         //boilerpipe
         Response response = WebClient.create(endPoint + TIKA_PATH + "/main")