You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/07/29 20:31:01 UTC

[tika] branch main updated: TIKA-3495 -- convert from anonymous to named parent-child relationship

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 97c9b94  TIKA-3495 -- convert from anonymous to named parent-child relationship
97c9b94 is described below

commit 97c9b945903229f53c2adda26e95b9f1f6bf3433
Author: tallison <ta...@apache.org>
AuthorDate: Thu Jul 29 16:27:04 2021 -0400

    TIKA-3495 -- convert from anonymous to named parent-child relationship
---
 .../opensearch/tests/OpenSearchTestClient.java     |  5 +++--
 .../opensearch/tests/TikaPipesOpenSearchTest.java  |  4 +++-
 .../tika/pipes/solr/tests/TikaPipesSolr6Test.java  |  6 ++++++
 .../pipes/solr/tests/TikaPipesSolrTestBase.java    | 23 +++++++++++++++-------
 .../pipes/emitter/opensearch/OpenSearchClient.java | 13 +++++++-----
 .../emitter/opensearch/OpenSearchEmitter.java      | 20 ++++++++++++++++++-
 .../emitter/opensearch/OpenSearchClientTest.java   |  3 ++-
 .../tika/pipes/emitter/solr/SolrEmitter.java       | 20 +++++++++++++++++--
 8 files changed, 75 insertions(+), 19 deletions(-)

diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTestClient.java b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTestClient.java
index ca3abe1..27c71a9 100644
--- a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTestClient.java
+++ b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTestClient.java
@@ -43,8 +43,9 @@ import org.apache.tika.pipes.emitter.opensearch.OpenSearchEmitter;
 public class OpenSearchTestClient extends OpenSearchClient {
 
     protected OpenSearchTestClient(String openSearchUrl, HttpClient httpClient,
-                                   OpenSearchEmitter.AttachmentStrategy attachmentStrategy) {
-        super(openSearchUrl, httpClient, attachmentStrategy);
+                                   OpenSearchEmitter.AttachmentStrategy attachmentStrategy,
+                                   String embeddedFileFieldName) {
+        super(openSearchUrl, httpClient, attachmentStrategy, embeddedFileFieldName);
     }
 
     protected JsonResponse putJson(String url, String json) throws IOException {
diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/TikaPipesOpenSearchTest.java b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/TikaPipesOpenSearchTest.java
index 986ce7a..683092b 100644
--- a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/TikaPipesOpenSearchTest.java
+++ b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/TikaPipesOpenSearchTest.java
@@ -313,7 +313,9 @@ public class TikaPipesOpenSearchTest {
         httpClientFactory.setPassword("admin");
         //attachment strategy is not used here...TODO clean this up
         client = new OpenSearchTestClient(openSearchEndpointBase,
-                httpClientFactory.build(), OpenSearchEmitter.AttachmentStrategy.SEPARATE_DOCUMENTS);
+                httpClientFactory.build(),
+                OpenSearchEmitter.AttachmentStrategy.SEPARATE_DOCUMENTS,
+                OpenSearchEmitter.DEFAULT_EMBEDDED_FILE_FIELD_NAME);
     }
 
     private void createTestHtmlFiles(String bodyContent, int numHtmlDocs) throws Exception {
diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr6Test.java b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr6Test.java
index 9ec9bee..b146188 100644
--- a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr6Test.java
+++ b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr6Test.java
@@ -28,4 +28,10 @@ public class TikaPipesSolr6Test extends TikaPipesSolrTestBase {
         return "solr:6";
     }
 
+    @Override
+    public boolean handlesParentChild() {
+        //Solr 6 apparently does not set _root_ automatically when indexing
+        //parent-child documents, so the _root_ assertion is skipped for it
+        return false;
+    }
 }
diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java
index 38fd306..76278e5 100644
--- a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java
+++ b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java
@@ -16,6 +16,8 @@
  */
 package org.apache.tika.pipes.solr.tests;
 
+import static org.junit.jupiter.api.Assumptions.assumeTrue;
+
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
@@ -35,11 +37,12 @@ import org.apache.tika.cli.TikaCLI;
 import org.apache.tika.pipes.HandlerConfig;
 import org.apache.tika.pipes.emitter.solr.SolrEmitter;
 import org.jetbrains.annotations.NotNull;
-import org.junit.After;
 import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Rule;
 import org.junit.Test;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
 import org.testcontainers.containers.GenericContainer;
 import org.testcontainers.shaded.org.apache.commons.io.FileUtils;
 import org.testcontainers.utility.DockerImageName;
@@ -59,6 +62,11 @@ public abstract class TikaPipesSolrTestBase {
 
     public abstract String getSolrImageName();
 
+
+    public boolean handlesParentChild() {
+        return true;
+    }
+
     @Rule
     public GenericContainer<?> solrContainer =
             new GenericContainer<>(DockerImageName.parse(getSolrImageName())).withExposedPorts(8983,
@@ -70,13 +78,13 @@ public abstract class TikaPipesSolrTestBase {
         setupSolr(solrContainer);
     }
 
-    @After
+    @AfterEach
     public void tearDown() throws Exception {
         FileUtils.deleteDirectory(testFileFolder);
     }
 
     @Test
-    public void testFetchIteratorWithSolrUrls() throws Exception {
+    public void testPipesIteratorWithSolrUrls() throws Exception {
         runTikaAsyncSolrPipeIteratorFileFetcherSolrEmitter();
     }
 
@@ -172,10 +180,11 @@ public abstract class TikaPipesSolrTestBase {
             Assert.assertEquals(numDocs,
                     solrClient.query(collection, new SolrQuery("content_s:*initial*")).getResults()
                             .getNumFound());
-            Assert.assertEquals(3,
-                    solrClient.query(collection, new SolrQuery("_root_:\"test-embedded.docx\"")).getResults()
-                            .getNumFound());
-
+            if(handlesParentChild()) {
+                Assert.assertEquals(3,
+                        solrClient.query(collection, new SolrQuery("_root_:\"test-embedded.docx\""))
+                                .getResults().getNumFound());
+            }
             //clean up test-embedded.docx so that the iterator won't try to update its children
             //in the next test
             solrClient.deleteById(collection, "_root_:\"test-embedded.docx\"");
diff --git a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java
index b30a648..208b431 100644
--- a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java
+++ b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java
@@ -52,12 +52,14 @@ public class OpenSearchClient {
     protected final String openSearchUrl;
     protected final HttpClient httpClient;
     private final OpenSearchEmitter.AttachmentStrategy attachmentStrategy;
-
+    private final String embeddedFileFieldName;
     protected OpenSearchClient(String openSearchUrl, HttpClient httpClient,
-                               OpenSearchEmitter.AttachmentStrategy attachmentStrategy) {
+                               OpenSearchEmitter.AttachmentStrategy attachmentStrategy,
+                               String embeddedFileFieldName) {
         this.openSearchUrl = openSearchUrl;
         this.httpClient = httpClient;
         this.attachmentStrategy = attachmentStrategy;
+        this.embeddedFileFieldName = embeddedFileFieldName;
     }
 
     public void addDocument(String emitKey, List<Metadata> metadataList) throws IOException,
@@ -77,7 +79,8 @@ public class OpenSearchClient {
             if (i == 0) {
                 sb.append(metadataToJsonContainer(metadata, attachmentStrategy));
             } else {
-                sb.append(metadataToJsonEmbedded(metadata, attachmentStrategy, emitKey));
+                sb.append(metadataToJsonEmbedded(metadata, attachmentStrategy,
+                        emitKey, embeddedFileFieldName));
             }
             sb.append("\n");
             i++;
@@ -102,7 +105,7 @@ public class OpenSearchClient {
 
     protected static String metadataToJsonEmbedded(Metadata metadata,
                                                    OpenSearchEmitter.AttachmentStrategy attachmentStrategy,
-                                                   String emitKey) throws IOException {
+                                                   String emitKey, String embeddedFileFieldName) throws IOException {
         StringWriter writer = new StringWriter();
         try (JsonGenerator jsonGenerator = new JsonFactory().createGenerator(writer)) {
             jsonGenerator.writeStartObject();
@@ -110,7 +113,7 @@ public class OpenSearchClient {
             writeMetadata(metadata, jsonGenerator);
             if (attachmentStrategy == OpenSearchEmitter.AttachmentStrategy.PARENT_CHILD) {
                 jsonGenerator.writeObjectFieldStart("relation_type");
-                jsonGenerator.writeStringField("name", "embedded");
+                jsonGenerator.writeStringField("name", embeddedFileFieldName);
                 jsonGenerator.writeStringField("parent", emitKey);
                 //end the relation type object
                 jsonGenerator.writeEndObject();
diff --git a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchEmitter.java b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchEmitter.java
index 208f138..8fe1849 100644
--- a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchEmitter.java
+++ b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchEmitter.java
@@ -46,6 +46,7 @@ public class OpenSearchEmitter extends AbstractEmitter implements Initializable
         //anything else?
     }
 
+    public static String DEFAULT_EMBEDDED_FILE_FIELD_NAME = "embedded";
     private static final Logger LOG = LoggerFactory.getLogger(OpenSearchEmitter.class);
     private AttachmentStrategy attachmentStrategy = AttachmentStrategy.PARENT_CHILD;
 
@@ -54,6 +55,7 @@ public class OpenSearchEmitter extends AbstractEmitter implements Initializable
     private int commitWithin = 1000;
     private OpenSearchClient openSearchClient;
     private final HttpClientFactory httpClientFactory;
+    private String embeddedFileFieldName = DEFAULT_EMBEDDED_FILE_FIELD_NAME;
 
     public OpenSearchEmitter() throws TikaConfigException {
         httpClientFactory = new HttpClientFactory();
@@ -157,13 +159,29 @@ public class OpenSearchEmitter extends AbstractEmitter implements Initializable
         httpClientFactory.setProxyPort(proxyPort);
     }
 
+    /**
+     * If using the {@link AttachmentStrategy#PARENT_CHILD}, this is the field name
+     * used to store the child documents.  Note that we artificially flatten all embedded
+     * documents, no matter how nested in the container document, into direct children
+     * of the root document.
+     *
+     * @param embeddedFileFieldName the field name used to store the child documents
+     */
+    @Field
+    public void setEmbeddedFileFieldName(String embeddedFileFieldName) {
+        this.embeddedFileFieldName = embeddedFileFieldName;
+    }
+
+
     @Override
     public void initialize(Map<String, Param> params) throws TikaConfigException {
         if (StringUtils.isBlank(openSearchUrl)) {
             throw new TikaConfigException("Must specify an open search url!");
         } else {
             openSearchClient =
-                    new OpenSearchClient(openSearchUrl, httpClientFactory.build(), attachmentStrategy);
+                    new OpenSearchClient(openSearchUrl,
+                            httpClientFactory.build(), attachmentStrategy,
+                            embeddedFileFieldName);
         }
     }
 
diff --git a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/test/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClientTest.java b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/test/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClientTest.java
index 34653dc..638533c 100644
--- a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/test/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClientTest.java
+++ b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/test/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClientTest.java
@@ -40,7 +40,8 @@ public class OpenSearchClientTest extends TikaTest {
         }
         for (OpenSearchEmitter.AttachmentStrategy strategy :
                 OpenSearchEmitter.AttachmentStrategy.values()) {
-            String json = OpenSearchClient.metadataToJsonEmbedded(metadata, strategy, "myEmitKey");
+            String json = OpenSearchClient.metadataToJsonEmbedded(metadata, strategy,
+                    "myEmitKey", OpenSearchEmitter.DEFAULT_EMBEDDED_FILE_FIELD_NAME);
             assertContains("author1", json);
             assertContains("author2", json);
             assertContains("authors", json);
diff --git a/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java b/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java
index 8e1615b..eb625ee 100644
--- a/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java
+++ b/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java
@@ -48,6 +48,7 @@ import org.apache.tika.pipes.emitter.TikaEmitterException;
 
 public class SolrEmitter extends AbstractEmitter implements Initializable {
 
+    public static String DEFAULT_EMBEDDED_FILE_FIELD_NAME = "embedded";
     private static final Logger LOG = LoggerFactory.getLogger(SolrEmitter.class);
     private final HttpClientFactory httpClientFactory;
     private AttachmentStrategy attachmentStrategy = AttachmentStrategy.PARENT_CHILD;
@@ -59,12 +60,12 @@ public class SolrEmitter extends AbstractEmitter implements Initializable {
     private List<String> solrUrls;
     private List<String> solrZkHosts;
     private String solrZkChroot;
-    private String contentField = "content";
     private String idField = "id";
     private int commitWithin = 1000;
     private int connectionTimeout = 10000;
     private int socketTimeout = 60000;
     private SolrClient solrClient;
+    private String embeddedFileFieldName = DEFAULT_EMBEDDED_FILE_FIELD_NAME;
 
     public SolrEmitter() throws TikaConfigException {
         httpClientFactory = new HttpClientFactory();
@@ -97,14 +98,16 @@ public class SolrEmitter extends AbstractEmitter implements Initializable {
             docsToUpdate.add(solrInputDocument);
         } else if (attachmentStrategy == AttachmentStrategy.PARENT_CHILD) {
             addMetadataToSolrInputDocument(metadataList.get(0), solrInputDocument, updateStrategy);
+            List<SolrInputDocument> children = new ArrayList<>();
             for (int i = 1; i < metadataList.size(); i++) {
                 SolrInputDocument childSolrInputDocument = new SolrInputDocument();
                 Metadata m = metadataList.get(i);
                 childSolrInputDocument
                         .setField(idField, emitKey + "-" + UUID.randomUUID().toString());
                 addMetadataToSolrInputDocument(m, childSolrInputDocument, updateStrategy);
-                solrInputDocument.addChildDocument(childSolrInputDocument);
+                children.add(childSolrInputDocument);
             }
+            solrInputDocument.setField(embeddedFileFieldName, children);
             docsToUpdate.add(solrInputDocument);
         } else if (attachmentStrategy == AttachmentStrategy.SEPARATE_DOCUMENTS) {
             addMetadataToSolrInputDocument(metadataList.get(0), solrInputDocument, updateStrategy);
@@ -282,6 +285,19 @@ public class SolrEmitter extends AbstractEmitter implements Initializable {
         httpClientFactory.setProxyPort(proxyPort);
     }
 
+    /**
+     * If using the {@link AttachmentStrategy#PARENT_CHILD}, this is the field name
+     * used to store the child documents.  Note that we artificially flatten all embedded
+     * documents, no matter how nested in the container document, into direct children
+     * of the root document.
+     *
+     * @param embeddedFileFieldName the field name used to store the child documents
+     */
+    @Field
+    public void setEmbeddedFileFieldName(String embeddedFileFieldName) {
+        this.embeddedFileFieldName = embeddedFileFieldName;
+    }
+
     @Override
     public void initialize(Map<String, Param> params) throws TikaConfigException {
         if (solrUrls == null || solrUrls.isEmpty()) {