You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/12/02 14:53:00 UTC

[tika] branch main updated: TIKA-3609 -- fix: PipesServer is not concatenating embedded content as expected

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 26976b6  TIKA-3609 -- fix: PipesServer is not concatenating embedded content as expected
26976b6 is described below

commit 26976b6c7ed3386aecd476dd84afb1b69c020970
Author: tballison <ta...@apache.org>
AuthorDate: Thu Dec 2 09:52:47 2021 -0500

    TIKA-3609 -- fix: PipesServer is not concatenating embedded content as expected
---
 CHANGES.txt                                        |   3 +
 .../java/org/apache/tika/pipes/PipesServer.java    |   2 +-
 .../apache/tika/server/standard/TikaPipesTest.java | 216 +++++++++++++++++++++
 3 files changed, 220 insertions(+), 1 deletion(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index f3bcdf2..3d2e248 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 2.2.0 - ???
 
+   * Fix logic bug in PipesServer that prevented concatenation of
+     content from attachments (TIKA-3609).
+
    * Added back ability to ignore load errors in TikaConfig (TIKA-3575).
 
    * Make SecureContentHandler and other parameters configurable in
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
index 91ebae6..52c620f 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
@@ -492,7 +492,7 @@ public class PipesServer implements Runnable {
                 if (maxEmbedded < 0) {
                     return true;
                 }
-                return embedded++ > maxEmbedded;
+                return embedded++ < maxEmbedded;
             }
         });
 
diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java
new file mode 100644
index 0000000..282098c
--- /dev/null
+++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java
@@ -0,0 +1,216 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.server.standard;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.io.StringWriter;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
+import java.util.ArrayList;
+import java.util.List;
+import javax.ws.rs.core.Response;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.lifecycle.ResourceProvider;
+import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.serialization.JsonFetchEmitTuple;
+import org.apache.tika.metadata.serialization.JsonMetadataList;
+import org.apache.tika.pipes.FetchEmitTuple;
+import org.apache.tika.pipes.HandlerConfig;
+import org.apache.tika.pipes.emitter.EmitKey;
+import org.apache.tika.pipes.fetcher.FetchKey;
+import org.apache.tika.pipes.fetcher.FetcherManager;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.server.core.CXFTestBase;
+import org.apache.tika.server.core.FetcherStreamFactory;
+import org.apache.tika.server.core.InputStreamFactory;
+import org.apache.tika.server.core.TikaServerParseExceptionMapper;
+import org.apache.tika.server.core.resource.PipesResource;
+import org.apache.tika.server.core.writer.JSONObjWriter;
+import org.apache.tika.utils.ProcessUtils;
+
+/**
+ * This offers basic integration tests with fetchers and emitters.
+ * We use file system fetchers and emitters.
+ */
+public class TikaPipesTest extends CXFTestBase {
+
+    private static final String PIPES_PATH = "/pipes"; // appended to endPoint to build the POST url
+    private static Path TMP_DIR; // root temp dir; created in setUpBeforeClass, deleted in tearDownAfterClass
+    private static Path TMP_OUTPUT_DIR; // TMP_DIR/output; used as the "fse" emitter basePath
+    private static Path TMP_OUTPUT_FILE; // TMP_OUTPUT_DIR/<TEST_RECURSIVE_DOC>.json; read back by the tests
+    private static Path TIKA_PIPES_LOG4j2_PATH; // temp copy of /log4j2.xml handed to the forked JVM args
+    private static Path TIKA_CONFIG_PATH; // temp file that TIKA_CONFIG_XML is written to
+    private static String TIKA_CONFIG_XML; // config text assembled in setUpBeforeClass
+    private static final String TEST_RECURSIVE_DOC = "test_recursive_embedded.docx";
+    private static FetcherManager FETCHER_MANAGER; // NOTE(review): never assigned in this class, so it stays null
+
+    @BeforeAll
+    public static void setUpBeforeClass() throws Exception { // one-time setup: temp dirs, test input, config files
+        TMP_DIR = Files.createTempDirectory("tika-pipes-test-");
+        Path inputDir = TMP_DIR.resolve("input");
+        TMP_OUTPUT_DIR = TMP_DIR.resolve("output");
+        TMP_OUTPUT_FILE = TMP_OUTPUT_DIR.resolve(TEST_RECURSIVE_DOC + ".json");
+
+        Files.createDirectories(inputDir);
+        Files.createDirectories(TMP_OUTPUT_DIR);
+        Files.copy(TikaPipesTest.class.getResourceAsStream("/test-documents/" + TEST_RECURSIVE_DOC),
+                inputDir.resolve("test_recursive_embedded.docx"), // same literal as TEST_RECURSIVE_DOC
+                StandardCopyOption.REPLACE_EXISTING); // NOTE(review): the resource stream is not closed here
+
+        TIKA_CONFIG_PATH = Files.createTempFile(TMP_DIR, "tika-pipes-", ".xml");
+        TIKA_PIPES_LOG4j2_PATH = Files.createTempFile(TMP_DIR, "log4j2-", ".xml");
+        Files.copy(TikaPipesTest.class.getResourceAsStream("/log4j2.xml"), TIKA_PIPES_LOG4j2_PATH,
+                StandardCopyOption.REPLACE_EXISTING); // NOTE(review): the resource stream is not closed here
+        TIKA_CONFIG_XML = // one fetcher "fsf" (reads inputDir), one emitter "fse" (writes TMP_OUTPUT_DIR), pipes params
+                "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + "<properties>" + "<fetchers>" +
+                        "<fetcher class=\"org.apache.tika.pipes.fetcher.fs.FileSystemFetcher\">" +
+                        "<params>" + "<name>fsf</name>" +
+                        "<basePath>" + inputDir.toAbsolutePath() +
+                        "</basePath>" + "</params>" + "</fetcher>" + "</fetchers>" + "<emitters>" +
+                        "<emitter class=\"org.apache.tika.pipes.emitter.fs.FileSystemEmitter\">" +
+                        "<params>" + "<name>fse</name>" +
+                        "<basePath>" +
+                        TMP_OUTPUT_DIR.toAbsolutePath() + "</basePath>" + "</params>" +
+                        "</emitter>" +
+                        "</emitters>" + "<pipes><params><tikaConfig>" +
+                ProcessUtils.escapeCommandLine(TIKA_CONFIG_PATH.toAbsolutePath().toString()) + // the file this XML is written to below
+                        "</tikaConfig><numClients>10</numClients>" +
+                        "<forkedJvmArgs>" +
+                        "<arg>-Xmx256m</arg>" +
+                        "<arg>-Dlog4j.configurationFile=file:" +
+                        ProcessUtils.escapeCommandLine(TIKA_PIPES_LOG4j2_PATH.toAbsolutePath().toString()) + "</arg>" +
+                        "</forkedJvmArgs>" +
+                        "</params></pipes>" + "</properties>";
+        Files.write(TIKA_CONFIG_PATH, TIKA_CONFIG_XML.getBytes(StandardCharsets.UTF_8));
+    }
+
+    @AfterAll
+    public static void tearDownAfterClass() throws Exception { // one-time cleanup after all tests
+        FileUtils.deleteDirectory(TMP_DIR.toFile()); // removes TMP_DIR and everything created under it
+    }
+
+    @BeforeEach
+    public void setUpEachTest() throws Exception { // every test starts with no emitted output file
+        if (Files.exists(TMP_OUTPUT_FILE)) {
+            Files.delete(TMP_OUTPUT_FILE); // drop output left behind by a previous test
+        }
+
+        assertFalse(Files.isRegularFile(TMP_OUTPUT_FILE)); // guard: the file read later must come from this test
+    }
+
+    @Override
+    protected void setUpResources(JAXRSServerFactoryBean sf) { // registers PipesResource as the only resource
+        List<ResourceProvider> rCoreProviders = new ArrayList<>();
+        try {
+            rCoreProviders.add(new SingletonResourceProvider(new PipesResource(TIKA_CONFIG_PATH)));
+        } catch (IOException | TikaConfigException e) {
+            throw new RuntimeException(e); // rethrown unchecked, cause preserved
+        }
+        sf.setResourceProviders(rCoreProviders);
+    }
+
+    @Override
+    protected void setUpProviders(JAXRSServerFactoryBean sf) { // registers the exception mapper and JSON writer
+        List<Object> providers = new ArrayList<>();
+        providers.add(new TikaServerParseExceptionMapper(true)); // NOTE(review): meaning of 'true' is not visible here
+        providers.add(new JSONObjWriter());
+        sf.setProviders(providers);
+    }
+
+    @Override
+    protected InputStream getTikaConfigInputStream() { // serves the in-memory config text as a stream
+        return new ByteArrayInputStream(TIKA_CONFIG_XML.getBytes(StandardCharsets.UTF_8)); // same bytes as written to TIKA_CONFIG_PATH
+    }
+
+    @Override
+    protected InputStreamFactory getInputStreamFactory(TikaConfig tikaConfig) { // tikaConfig is not used here
+        return new FetcherStreamFactory(FETCHER_MANAGER); // NOTE(review): FETCHER_MANAGER is never assigned in this class (null)
+    }
+
+
+    @Test
+    public void testBasic() throws Exception { // POSTs a tuple without a HandlerConfig, then checks the emitted JSON
+
+        FetchEmitTuple t =
+                new FetchEmitTuple("myId",
+                        new FetchKey("fsf", "test_recursive_embedded.docx"), // "fsf" is the fetcher name from the config
+                        new EmitKey("fse", "")); // "fse" is the emitter name; the emit key is left empty
+        StringWriter writer = new StringWriter();
+        JsonFetchEmitTuple.toJson(t, writer);
+
+        String getUrl = endPoint + PIPES_PATH; // despite the name, this url is POSTed to
+        Response response =
+                WebClient.create(getUrl).accept("application/json").post(writer.toString());
+        assertEquals(200, response.getStatus());
+
+        List<Metadata> metadataList = null;
+        try (Reader reader = Files.newBufferedReader(TMP_OUTPUT_FILE)) { // read back what was emitted
+            metadataList = JsonMetadataList.fromJson(reader);
+        }
+        assertEquals(12, metadataList.size()); // presumably the container plus its embedded documents -- confirm
+        assertContains("When in the Course",
+                metadataList.get(6).get(TikaCoreProperties.TIKA_CONTENT));
+    }
+
+    @Test
+    public void testConcatenated() throws Exception { // same request as testBasic but with PARSE_MODE.CONCATENATE
+
+        FetchEmitTuple t =
+                new FetchEmitTuple("myId",
+                        new FetchKey("fsf", "test_recursive_embedded.docx"),
+                        new EmitKey("fse", ""),
+                        new Metadata(),
+                        new HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
+                                HandlerConfig.PARSE_MODE.CONCATENATE, -1, -1000), // NOTE(review): presumably write/embedded limits, negative = none -- confirm
+                        FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT);
+        StringWriter writer = new StringWriter();
+        JsonFetchEmitTuple.toJson(t, writer);
+
+        String getUrl = endPoint + PIPES_PATH; // despite the name, this url is POSTed to
+        Response response =
+                WebClient.create(getUrl).accept("application/json").post(writer.toString());
+        assertEquals(200, response.getStatus());
+
+        List<Metadata> metadataList = null;
+        try (Reader reader = Files.newBufferedReader(TMP_OUTPUT_FILE)) { // read back what was emitted
+            metadataList = JsonMetadataList.fromJson(reader);
+        }
+        assertEquals(1, metadataList.size()); // a single entry is expected in concatenate mode
+        assertContains("When in the Course", // text also asserted at index 6 in testBasic, now expected in entry 0
+                metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
+    }
+}