You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/07/26 15:23:46 UTC

[tika] branch main updated: TIKA-3495 - verify embedded docs work (#449)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 6c30f2b  TIKA-3495 - verify embedded docs work (#449)
6c30f2b is described below

commit 6c30f2b273a7ad5b820cb236df129da71d1590b5
Author: Nicholas DiPiazza <ni...@lucidworks.com>
AuthorDate: Mon Jul 26 10:23:32 2021 -0500

    TIKA-3495 - verify embedded docs work (#449)
    
    * verify embedded docs work
    prefix each child doc's ID with the parent doc's emit key, so a child can be traced back to its parent.
    
    * fix import order
    
    * format codestyle
---
 .../pipes/solr/tests/TikaPipesSolrTestBase.java    |  58 ++++++++++++++++-----
 .../src/test/resources/embedded/embedded.docx      | Bin 0 -> 99389 bytes
 .../tika/pipes/emitter/solr/SolrEmitter.java       |  16 +++---
 3 files changed, 55 insertions(+), 19 deletions(-)

diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java
index 3810b3d..1729961 100644
--- a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java
+++ b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java
@@ -17,14 +17,23 @@
 package org.apache.tika.pipes.solr.tests;
 
 import java.io.File;
+import java.io.IOException;
 import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
 
 import org.apache.commons.io.IOUtils;
+import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpPost;
+import org.apache.http.entity.StringEntity;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClients;
 import org.apache.solr.client.solrj.SolrClient;
 import org.apache.solr.client.solrj.SolrQuery;
 import org.apache.solr.client.solrj.impl.LBHttpSolrClient;
 import org.apache.solr.common.SolrInputDocument;
+import org.apache.tika.cli.TikaCLI;
+import org.apache.tika.pipes.HandlerConfig;
+import org.apache.tika.pipes.emitter.solr.SolrEmitter;
 import org.jetbrains.annotations.NotNull;
 import org.junit.After;
 import org.junit.Assert;
@@ -35,10 +44,6 @@ import org.testcontainers.containers.GenericContainer;
 import org.testcontainers.shaded.org.apache.commons.io.FileUtils;
 import org.testcontainers.utility.DockerImageName;
 
-import org.apache.tika.cli.TikaCLI;
-import org.apache.tika.pipes.HandlerConfig;
-import org.apache.tika.pipes.emitter.solr.SolrEmitter;
-
 public abstract class TikaPipesSolrTestBase {
 
     private final String collection = "testcol";
@@ -51,6 +56,7 @@ public abstract class TikaPipesSolrTestBase {
     private String solrEndpoint;
 
     public abstract boolean useZk();
+
     public abstract String getSolrImageName();
 
     @Rule
@@ -74,16 +80,18 @@ public abstract class TikaPipesSolrTestBase {
         runTikaAsyncSolrPipeIteratorFileFetcherSolrEmitter();
     }
 
-    private void createTestHtmlFiles(String bodyContent) throws Exception {
+    private void createTestFiles(String bodyContent) throws Exception {
         testFileFolder.mkdirs();
         for (int i = 0; i < numDocs; ++i) {
             FileUtils.writeStringToFile(new File(testFileFolder, "test-" + i + ".html"),
                     "<html><body>" + bodyContent + "</body></html>", StandardCharsets.UTF_8);
         }
+        FileUtils.copyInputStreamToFile(this.getClass().getResourceAsStream("/embedded/embedded.docx"),
+                new File(testFileFolder, "test-embedded.doc"));
     }
 
     protected void setupSolr(GenericContainer<?> solr) throws Exception {
-        createTestHtmlFiles("initial");
+        createTestFiles("initial");
         this.solr = solr;
         solrHost = solr.getHost();
         solrPort = solr.getMappedPort(8983);
@@ -95,6 +103,7 @@ public abstract class TikaPipesSolrTestBase {
         try (SolrClient solrClient = new LBHttpSolrClient.Builder().withBaseSolrUrls(solrEndpoint)
                 .build()) {
 
+            addSchemaFieldsForNestedDocs(solrEndpoint + "/" + collection);
             for (int i = 0; i < numDocs; ++i) {
                 SolrInputDocument solrDoc = new SolrInputDocument();
                 String filename = "test-" + i + ".html";
@@ -102,13 +111,35 @@ public abstract class TikaPipesSolrTestBase {
                 solrDoc.setField("path", filename);
                 solrClient.add(collection, solrDoc);
             }
+            SolrInputDocument embeddedDoc = new SolrInputDocument();
+            String filename = "test-embedded.doc";
+            embeddedDoc.setField("id", filename);
+            embeddedDoc.setField("path", filename);
+            solrClient.add(collection, embeddedDoc);
             solrClient.commit(collection);
         }
     }
 
+    /** Posts a "replace-field" command for _root_ to {solrUrl}/schema; asserts HTTP 200. */
+    private void addSchemaFieldsForNestedDocs(String solrUrl) throws IOException {
+        try (CloseableHttpClient client = HttpClients.createMinimal()) {
+            HttpPost postAddRoot = new HttpPost(solrUrl + "/schema");
+            postAddRoot.setHeader("Content-Type", "application/json");
+            postAddRoot.setEntity(new StringEntity("{\n" + "  \"replace-field\":{\n" +
+                    "     \"name\":\"_root_\",\n" +
+                    "     \"type\":\"string\",\n" +
+                    "     \"indexed\":true,\n" +
+                    "     \"stored\":true, \n" +
+                    "     \"docValues\":false \n" +
+                    "  }\n" + "}"));
+            try (CloseableHttpResponse resp = client.execute(postAddRoot)) { // close response
+                Assert.assertEquals(200, resp.getStatusLine().getStatusCode());
+            }
+        }
+    }
+
     /**
      * Runs a test using Solr Pipe Iterator, File Fetcher and Solr Emitter.
-     *
      */
     protected void runTikaAsyncSolrPipeIteratorFileFetcherSolrEmitter()
             throws Exception {
@@ -128,10 +159,10 @@ public abstract class TikaPipesSolrTestBase {
                 createTikaConfigXml(useZk(), tikaConfigFile, log4jPropFile, tikaConfigTemplateXml,
                         SolrEmitter.UpdateStrategy.ADD,
                         SolrEmitter.AttachmentStrategy.PARENT_CHILD,
-                        HandlerConfig.PARSE_MODE.CONCATENATE);
+                        HandlerConfig.PARSE_MODE.RMETA);
         FileUtils.writeStringToFile(tikaConfigFile, tikaConfigXml, StandardCharsets.UTF_8);
 
-        TikaCLI.main(new String[]{"-a", "--config=" + tikaConfigFile.getAbsolutePath()});
+        TikaCLI.main(new String[] {"-a", "--config=" + tikaConfigFile.getAbsolutePath()});
 
         try (SolrClient solrClient = new LBHttpSolrClient.Builder().withBaseSolrUrls(solrEndpoint)
                 .build()) {
@@ -142,19 +173,22 @@ public abstract class TikaPipesSolrTestBase {
             Assert.assertEquals(numDocs,
                     solrClient.query(collection, new SolrQuery("content_s:*initial*")).getResults()
                             .getNumFound());
+            Assert.assertEquals(3,
+                    solrClient.query(collection, new SolrQuery("_root_:\"test-embedded.doc\"")).getResults()
+                            .getNumFound());
         }
 
         // update the documents with "update must exist" and run tika async again with "UPDATE_MUST_EXIST".
         // It should not fail, and docs should be updated.
-        createTestHtmlFiles("updated");
+        createTestFiles("updated");
         tikaConfigXml =
                 createTikaConfigXml(useZk(), tikaConfigFile, log4jPropFile, tikaConfigTemplateXml,
                         SolrEmitter.UpdateStrategy.UPDATE_MUST_EXIST,
                         SolrEmitter.AttachmentStrategy.PARENT_CHILD,
-                        HandlerConfig.PARSE_MODE.CONCATENATE);
+                        HandlerConfig.PARSE_MODE.RMETA);
         FileUtils.writeStringToFile(tikaConfigFile, tikaConfigXml, StandardCharsets.UTF_8);
 
-        TikaCLI.main(new String[]{"-a", "--config=" + tikaConfigFile.getAbsolutePath()});
+        TikaCLI.main(new String[] {"-a", "--config=" + tikaConfigFile.getAbsolutePath()});
 
         try (SolrClient solrClient = new LBHttpSolrClient.Builder().withBaseSolrUrls(solrEndpoint)
                 .build()) {
diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/embedded/embedded.docx b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/embedded/embedded.docx
new file mode 100644
index 0000000..255da0f
Binary files /dev/null and b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/embedded/embedded.docx differ
diff --git a/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java b/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java
index a1388e0..6e3f3c5 100644
--- a/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java
+++ b/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java
@@ -31,9 +31,6 @@ import org.apache.solr.client.solrj.impl.CloudSolrClient;
 import org.apache.solr.client.solrj.impl.LBHttpSolrClient;
 import org.apache.solr.client.solrj.request.UpdateRequest;
 import org.apache.solr.common.SolrInputDocument;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
 import org.apache.tika.client.HttpClientFactory;
 import org.apache.tika.config.Field;
 import org.apache.tika.config.Initializable;
@@ -44,6 +41,8 @@ import org.apache.tika.metadata.Metadata;
 import org.apache.tika.pipes.emitter.AbstractEmitter;
 import org.apache.tika.pipes.emitter.EmitData;
 import org.apache.tika.pipes.emitter.TikaEmitterException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 
 public class SolrEmitter extends AbstractEmitter implements Initializable {
@@ -65,6 +64,7 @@ public class SolrEmitter extends AbstractEmitter implements Initializable {
     private int connectionTimeout = 10000;
     private int socketTimeout = 60000;
     private SolrClient solrClient;
+
     public SolrEmitter() throws TikaConfigException {
         httpClientFactory = new HttpClientFactory();
     }
@@ -99,7 +99,7 @@ public class SolrEmitter extends AbstractEmitter implements Initializable {
             for (int i = 1; i < metadataList.size(); i++) {
                 SolrInputDocument childSolrInputDocument = new SolrInputDocument();
                 Metadata m = metadataList.get(i);
-                childSolrInputDocument.setField(idField, UUID.randomUUID().toString());
+                childSolrInputDocument.setField(idField, emitKey + "_" + UUID.randomUUID().toString());
                 addMetadataToSolrInputDocument(m, childSolrInputDocument, updateStrategy);
                 solrInputDocument.addChildDocument(childSolrInputDocument);
             }
@@ -164,7 +164,8 @@ public class SolrEmitter extends AbstractEmitter implements Initializable {
                 if (updateStrategy == UpdateStrategy.ADD) {
                     solrInputDocument.setField(n, vals[0]);
                 } else {
-                    solrInputDocument.setField(n, new HashMap<String, String>() {{
+                    solrInputDocument.setField(n, new HashMap<String, String>() {
+                        {
                             put("set", vals[0]);
                         }
                     });
@@ -173,7 +174,8 @@ public class SolrEmitter extends AbstractEmitter implements Initializable {
                 if (updateStrategy == UpdateStrategy.ADD) {
                     solrInputDocument.setField(n, vals);
                 } else {
-                    solrInputDocument.setField(n, new HashMap<String, String[]>() {{
+                    solrInputDocument.setField(n, new HashMap<String, String[]>() {
+                        {
                             put("set", vals);
                         }
                     });
@@ -287,7 +289,7 @@ public class SolrEmitter extends AbstractEmitter implements Initializable {
         } else {
             solrClient = new LBHttpSolrClient.Builder().withConnectionTimeout(connectionTimeout)
                     .withSocketTimeout(socketTimeout).withHttpClient(httpClientFactory.build())
-                    .withBaseSolrUrls(solrUrls.toArray(new String[]{})).build();
+                    .withBaseSolrUrls(solrUrls.toArray(new String[] {})).build();
         }
     }