You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/07/22 14:34:58 UTC

[tika] branch main updated: TIKA-3490 -- fix serialization of embedded docs in opensearch emitter

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new bf82c4d  TIKA-3490 -- fix serialization of embedded docs in opensearch emitter
     new 018a32f  Merge remote-tracking branch 'origin/main' into main
bf82c4d is described below

commit bf82c4d8fb86efc1b9743c89584ed2f4104f3715
Author: tallison <ta...@apache.org>
AuthorDate: Thu Jul 22 10:34:25 2021 -0400

    TIKA-3490 -- fix serialization of embedded docs in opensearch emitter
---
 CHANGES.txt                                        |   4 ++
 .../opensearch/tests/TikaPipesOpenSearchTest.java  |  45 ++++++++++++------
 .../test-documents/test_recursive_embedded.docx    | Bin 0 -> 27082 bytes
 .../tika-emitters/tika-emitter-opensearch/pom.xml  |   7 +++
 .../pipes/emitter/opensearch/OpenSearchClient.java |  22 +++++----
 .../emitter/opensearch/OpenSearchEmitter.java      |  24 ++++++++++
 .../emitter/opensearch/OpenSearchClientTest.java   |  51 +++++++++++++++++++++
 7 files changed, 131 insertions(+), 22 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 91e3335..2a82d28 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,7 @@
+Release 2.0.1 - ???
+
+   * Fix serialization of embedded docs in OpenSearch emitter (TIKA-3490).
+
 Release 2.0.0 - 07/07/2021
 
    * Cleanup of fetcher integration with tika-server.
diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/TikaPipesOpenSearchTest.java b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/TikaPipesOpenSearchTest.java
index 63d3254..f6dfb43 100644
--- a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/TikaPipesOpenSearchTest.java
+++ b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/TikaPipesOpenSearchTest.java
@@ -22,6 +22,9 @@ import static org.junit.Assert.assertTrue;
 import java.io.File;
 import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.util.regex.Matcher;
 
 import org.apache.commons.io.IOUtils;
@@ -41,9 +44,10 @@ import org.apache.tika.pipes.emitter.opensearch.OpenSearchEmitter;
 
 public class TikaPipesOpenSearchTest {
 
-    private static final String collection = "testcol";
-    private static final File testFileFolder = new File("target", "test-files");
-    private final int numDocs = 42;
+    private static final String COLLECTION = "testcol";
+    private static final File TEST_FILE_FOLDER = new File("target", "test-files");
+    private final int numHtmlDocs = 42;
+    private int numTestDocs = 0;
     protected GenericContainer<?> openSearch;
     private String openSearchHost;
     private int openSearchPort;
@@ -68,7 +72,7 @@ public class TikaPipesOpenSearchTest {
 
     @AfterClass
     public static void tearDown() throws Exception {
-        FileUtils.deleteDirectory(testFileFolder);
+        FileUtils.deleteDirectory(TEST_FILE_FOLDER);
     }
 
     @Test
@@ -94,17 +98,25 @@ public class TikaPipesOpenSearchTest {
         assertTrue(response.getJson().get("acknowledged").asBoolean());
         assertEquals("testcol", response.getJson().get("index").asText());
 
-        runPipes(OpenSearchEmitter.AttachmentStrategy.CONCATENATE_CONTENT);
+        runPipes(OpenSearchEmitter.AttachmentStrategy.SKIP);
         //refresh to make sure the content is searchable
         JsonResponse refresh = client.getJson(openSearchEndpoint + "/_refresh");
 
         String query = "{ \"track_total_hits\": true, \"query\": { \"match\": { \"content\": { " +
-                "\"query\": \"initial\" } } } }";
+                "\"query\": \"happiness\" } } } }";
 
         JsonResponse results = client.postJson(openSearchEndpoint + "/_search", query);
         assertEquals(200, results.getStatus());
+        //assertEquals(numHtmlDocs + numTestDocs,
+          //      results.getJson().get("hits").get("total").get("value").asInt());
+
+        //now try match all
+        query = "{ \"track_total_hits\": true, \"query\": { \"match_all\": {} } }";
+        results = client.postJson(openSearchEndpoint + "/_search", query);
+        assertEquals(200, results.getStatus());
+        assertEquals(numHtmlDocs + numTestDocs,
+                results.getJson().get("hits").get("total").get("value").asInt());
 
-        assertEquals(numDocs, results.getJson().get("hits").get("total").get("value").asInt());
 
     }
 
@@ -140,7 +152,7 @@ public class TikaPipesOpenSearchTest {
                         .replace("{ATTACHMENT_STRATEGY}", attachmentStrategy.toString())
                         .replace("{LOG4J_PROPERTIES_FILE}", log4jPropFile.getAbsolutePath())
                         .replaceAll("\\{PATH_TO_DOCS\\}", 
-                                Matcher.quoteReplacement(testFileFolder.getAbsolutePath()));
+                                Matcher.quoteReplacement(TEST_FILE_FOLDER.getAbsolutePath()));
 
         res = res.replace("{OPENSEARCH_CONNECTION}", openSearchEndpoint);
 
@@ -149,11 +161,11 @@ public class TikaPipesOpenSearchTest {
     }
 
     private void setupOpenSearch(GenericContainer<?> openSearchContainer) throws Exception {
-        createTestHtmlFiles("initial");
+        createTestHtmlFiles("Happiness");
         this.openSearch = openSearchContainer;
         openSearchHost = openSearch.getHost();
         openSearchPort = openSearch.getMappedPort(9200);
-        openSearchEndpoint = "https://" + openSearchHost + ":" + openSearchPort + "/" + collection;
+        openSearchEndpoint = "https://" + openSearchHost + ":" + openSearchPort + "/" + COLLECTION;
         HttpClientFactory httpClientFactory = new HttpClientFactory();
         httpClientFactory.setUserName("admin");
         httpClientFactory.setPassword("admin");
@@ -163,11 +175,18 @@ public class TikaPipesOpenSearchTest {
     }
 
     private void createTestHtmlFiles(String bodyContent) throws Exception {
-        testFileFolder.mkdirs();
-        for (int i = 0; i < numDocs; ++i) {
-            FileUtils.writeStringToFile(new File(testFileFolder, "test-" + i + ".html"),
+        TEST_FILE_FOLDER.mkdirs();
+        for (int i = 0; i < numHtmlDocs; ++i) {
+            FileUtils.writeStringToFile(new File(TEST_FILE_FOLDER, "test-" + i + ".html"),
                     "<html><body>" + bodyContent +  "</body></html>", StandardCharsets.UTF_8);
         }
+        File testDocuments =
+                Paths.get(TikaPipesOpenSearchTest.class.getResource("/test-documents").toURI()).toFile();
+        for (File f : testDocuments.listFiles()) {
+            Path targ = TEST_FILE_FOLDER.toPath().resolve(f.getName());
+            Files.copy(f.toPath(), targ);
+            numTestDocs++;
+        }
     }
 
 
diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/test-documents/test_recursive_embedded.docx b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/test-documents/test_recursive_embedded.docx
new file mode 100644
index 0000000..cd562cb
Binary files /dev/null and b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/test-documents/test_recursive_embedded.docx differ
diff --git a/tika-pipes/tika-emitters/tika-emitter-opensearch/pom.xml b/tika-pipes/tika-emitters/tika-emitter-opensearch/pom.xml
index 365d2cd..e0d1f4c 100644
--- a/tika-pipes/tika-emitters/tika-emitter-opensearch/pom.xml
+++ b/tika-pipes/tika-emitters/tika-emitter-opensearch/pom.xml
@@ -71,6 +71,13 @@
       <artifactId>junit</artifactId>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
   </dependencies>
 
   <build>
diff --git a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java
index 6f98474..ab348ad 100644
--- a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java
+++ b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java
@@ -74,9 +74,9 @@ public class OpenSearchClient {
             String indexJson = getBulkIndexJson(id, routing);
             sb.append(indexJson).append("\n");
             if (i == 0) {
-                sb.append(metadataToJsonContainer(metadata));
+                sb.append(metadataToJsonContainer(metadata, attachmentStrategy));
             } else {
-                sb.append(metadataToJsonEmbedded(metadata, emitKey));
+                sb.append(metadataToJsonEmbedded(metadata, attachmentStrategy, emitKey));
             }
             sb.append("\n");
             i++;
@@ -99,26 +99,30 @@ public class OpenSearchClient {
         }
     }
 
-    private String metadataToJsonEmbedded(Metadata metadata, String emitKey) throws IOException {
+    protected static String metadataToJsonEmbedded(Metadata metadata,
+                                                   OpenSearchEmitter.AttachmentStrategy attachmentStrategy,
+                                                   String emitKey) throws IOException {
         StringWriter writer = new StringWriter();
         try (JsonGenerator jsonGenerator = new JsonFactory().createGenerator(writer)) {
             jsonGenerator.writeStartObject();
 
             writeMetadata(metadata, jsonGenerator);
             if (attachmentStrategy == OpenSearchEmitter.AttachmentStrategy.PARENT_CHILD) {
-                jsonGenerator.writeStartObject("relation_type");
+                jsonGenerator.writeObjectFieldStart("relation_type");
                 jsonGenerator.writeStringField("name", "embedded");
                 jsonGenerator.writeStringField("parent", emitKey);
+                //end the relation type object
+                jsonGenerator.writeEndObject();
             }
-            //end the relation type object
-            jsonGenerator.writeEndObject();
             //end the metadata object
             jsonGenerator.writeEndObject();
         }
         return writer.toString();
     }
 
-    private String metadataToJsonContainer(Metadata metadata) throws IOException {
+    protected static String metadataToJsonContainer(Metadata metadata,
+                                                    OpenSearchEmitter.AttachmentStrategy attachmentStrategy)
+            throws IOException {
         StringWriter writer = new StringWriter();
         try (JsonGenerator jsonGenerator = new JsonFactory().createGenerator(writer)) {
             jsonGenerator.writeStartObject();
@@ -131,7 +135,7 @@ public class OpenSearchClient {
         return writer.toString();
     }
 
-    private void writeMetadata(Metadata metadata, JsonGenerator jsonGenerator) throws IOException {
+    private static void writeMetadata(Metadata metadata, JsonGenerator jsonGenerator) throws IOException {
         //writes the metadata without the start { or the end }
         //to allow for other fields to be added
         for (String n : metadata.names()) {
@@ -139,7 +143,7 @@ public class OpenSearchClient {
             if (vals.length == 1) {
                 jsonGenerator.writeStringField(n, vals[0]);
             } else {
-                jsonGenerator.writeStartArray(n);
+                jsonGenerator.writeArrayFieldStart(n);
                 for (String v : vals) {
                     jsonGenerator.writeString(v);
                 }
diff --git a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchEmitter.java b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchEmitter.java
index fe878fe..5f61814 100644
--- a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchEmitter.java
+++ b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchEmitter.java
@@ -19,6 +19,7 @@ package org.apache.tika.pipes.emitter.opensearch;
 import static org.apache.tika.config.TikaConfig.mustNotBeEmpty;
 
 import java.io.IOException;
+import java.util.Collections;
 import java.util.List;
 import java.util.Map;
 
@@ -68,11 +69,34 @@ public class OpenSearchEmitter extends AbstractEmitter implements Initializable
             return;
         }
         try {
+            if (attachmentStrategy == AttachmentStrategy.CONCATENATE_CONTENT) {
+                metadataList = concatenate(metadataList);
+            } else if (attachmentStrategy == AttachmentStrategy.SKIP) {
+                metadataList = Collections.singletonList(metadataList.get(0));
+            }
             openSearchClient.addDocument(emitKey, metadataList);
         } catch (TikaClientException e) {
             throw new TikaEmitterException("failed to add document", e);
         }
     }
+
+    private List<Metadata> concatenate(List<Metadata> metadataList) {
+        if (metadataList.size() == 1) {
+            return metadataList;
+        }
+
+        Metadata ret = metadataList.get(0);
+        StringBuilder content = new StringBuilder();
+        for (Metadata m : metadataList) {
+            String c = m.get(getContentField());
+            if (! StringUtils.isBlank(c)) {
+                content.append(c).append("\n");
+            }
+        }
+        ret.set(getContentField(), content.toString());
+        return Collections.singletonList(ret);
+
+    }
 /*
     private void addMetadataAsSolrInputDocuments(String emitKey, List<Metadata> metadataList,
                                                  List<SolrInputDocument> docsToUpdate)
diff --git a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/test/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClientTest.java b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/test/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClientTest.java
new file mode 100644
index 0000000..e92607f
--- /dev/null
+++ b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/test/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClientTest.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.emitter.opensearch;
+
+import org.junit.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+
+public class OpenSearchClientTest extends TikaTest {
+
+    @Test
+    public void testSerialization() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.add("authors", "author1");
+        metadata.add("authors", "author2");
+        metadata.add("title", "title1");
+        for (OpenSearchEmitter.AttachmentStrategy strategy :
+                OpenSearchEmitter.AttachmentStrategy.values()) {
+            String json = OpenSearchClient.metadataToJsonContainer(metadata,
+                    strategy);
+            assertContains("author1", json);
+            assertContains("author2", json);
+            assertContains("authors", json);
+            assertContains("title1", json);
+        }
+        for (OpenSearchEmitter.AttachmentStrategy strategy :
+                OpenSearchEmitter.AttachmentStrategy.values()) {
+            String json = OpenSearchClient.metadataToJsonEmbedded(metadata, strategy, "myEmitKey");
+            assertContains("author1", json);
+            assertContains("author2", json);
+            assertContains("authors", json);
+            assertContains("title1", json);
+        }
+
+    }
+}