You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/07/26 15:23:46 UTC
[tika] branch main updated: TIKA-3495 - verify embedded docs work (#449)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 6c30f2b TIKA-3495 - verify embedded docs work (#449)
6c30f2b is described below
commit 6c30f2b273a7ad5b820cb236df129da71d1590b5
Author: Nicholas DiPiazza <ni...@lucidworks.com>
AuthorDate: Mon Jul 26 10:23:32 2021 -0500
TIKA-3495 - verify embedded docs work (#449)
* verify embedded docs work
add the parent doc's emit key as a prefix to the ID of child docs - might be more usable.
* fix import order
* format codestyle
---
.../pipes/solr/tests/TikaPipesSolrTestBase.java | 58 ++++++++++++++++-----
.../src/test/resources/embedded/embedded.docx | Bin 0 -> 99389 bytes
.../tika/pipes/emitter/solr/SolrEmitter.java | 16 +++---
3 files changed, 55 insertions(+), 19 deletions(-)
diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java
index 3810b3d..1729961 100644
--- a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java
+++ b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java
@@ -17,14 +17,23 @@
package org.apache.tika.pipes.solr.tests;
import java.io.File;
+import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import org.apache.commons.io.IOUtils;
+import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpPost;
+import org.apache.http.entity.StringEntity;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClients;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.LBHttpSolrClient;
import org.apache.solr.common.SolrInputDocument;
+import org.apache.tika.cli.TikaCLI;
+import org.apache.tika.pipes.HandlerConfig;
+import org.apache.tika.pipes.emitter.solr.SolrEmitter;
import org.jetbrains.annotations.NotNull;
import org.junit.After;
import org.junit.Assert;
@@ -35,10 +44,6 @@ import org.testcontainers.containers.GenericContainer;
import org.testcontainers.shaded.org.apache.commons.io.FileUtils;
import org.testcontainers.utility.DockerImageName;
-import org.apache.tika.cli.TikaCLI;
-import org.apache.tika.pipes.HandlerConfig;
-import org.apache.tika.pipes.emitter.solr.SolrEmitter;
-
public abstract class TikaPipesSolrTestBase {
private final String collection = "testcol";
@@ -51,6 +56,7 @@ public abstract class TikaPipesSolrTestBase {
private String solrEndpoint;
public abstract boolean useZk();
+
public abstract String getSolrImageName();
@Rule
@@ -74,16 +80,18 @@ public abstract class TikaPipesSolrTestBase {
runTikaAsyncSolrPipeIteratorFileFetcherSolrEmitter();
}
- private void createTestHtmlFiles(String bodyContent) throws Exception {
+ private void createTestFiles(String bodyContent) throws Exception {
testFileFolder.mkdirs();
for (int i = 0; i < numDocs; ++i) {
FileUtils.writeStringToFile(new File(testFileFolder, "test-" + i + ".html"),
"<html><body>" + bodyContent + "</body></html>", StandardCharsets.UTF_8);
}
+ FileUtils.copyInputStreamToFile(this.getClass().getResourceAsStream("/embedded/embedded.docx"),
+ new File(testFileFolder, "test-embedded.doc"));
}
protected void setupSolr(GenericContainer<?> solr) throws Exception {
- createTestHtmlFiles("initial");
+ createTestFiles("initial");
this.solr = solr;
solrHost = solr.getHost();
solrPort = solr.getMappedPort(8983);
@@ -95,6 +103,7 @@ public abstract class TikaPipesSolrTestBase {
try (SolrClient solrClient = new LBHttpSolrClient.Builder().withBaseSolrUrls(solrEndpoint)
.build()) {
+ addSchemaFieldsForNestedDocs(solrEndpoint + "/" + collection);
for (int i = 0; i < numDocs; ++i) {
SolrInputDocument solrDoc = new SolrInputDocument();
String filename = "test-" + i + ".html";
@@ -102,13 +111,35 @@ public abstract class TikaPipesSolrTestBase {
solrDoc.setField("path", filename);
solrClient.add(collection, solrDoc);
}
+ SolrInputDocument embeddedDoc = new SolrInputDocument();
+ String filename = "test-embedded.doc";
+ embeddedDoc.setField("id", filename);
+ embeddedDoc.setField("path", filename);
+ solrClient.add(collection, embeddedDoc);
solrClient.commit(collection);
}
}
+ private void addSchemaFieldsForNestedDocs(String solrUrl) throws IOException {
+ try (CloseableHttpClient client = HttpClients.createMinimal()) {
+ HttpPost postAddRoot = new HttpPost(solrUrl + "/schema");
+ postAddRoot.setHeader("Content-Type", "application/json");
+ postAddRoot.setEntity(new StringEntity("{\n" +
+ " \"replace-field\":{\n" +
+ " \"name\":\"_root_\",\n" +
+ " \"type\":\"string\",\n" +
+ " \"indexed\":true,\n" +
+ " \"stored\":true, \n" +
+ " \"docValues\":false \n" +
+ " }\n" +
+ "}"));
+ CloseableHttpResponse resp = client.execute(postAddRoot);
+ Assert.assertEquals(200, resp.getStatusLine().getStatusCode());
+ }
+ }
+
/**
* Runs a test using Solr Pipe Iterator, File Fetcher and Solr Emitter.
- *
*/
protected void runTikaAsyncSolrPipeIteratorFileFetcherSolrEmitter()
throws Exception {
@@ -128,10 +159,10 @@ public abstract class TikaPipesSolrTestBase {
createTikaConfigXml(useZk(), tikaConfigFile, log4jPropFile, tikaConfigTemplateXml,
SolrEmitter.UpdateStrategy.ADD,
SolrEmitter.AttachmentStrategy.PARENT_CHILD,
- HandlerConfig.PARSE_MODE.CONCATENATE);
+ HandlerConfig.PARSE_MODE.RMETA);
FileUtils.writeStringToFile(tikaConfigFile, tikaConfigXml, StandardCharsets.UTF_8);
- TikaCLI.main(new String[]{"-a", "--config=" + tikaConfigFile.getAbsolutePath()});
+ TikaCLI.main(new String[] {"-a", "--config=" + tikaConfigFile.getAbsolutePath()});
try (SolrClient solrClient = new LBHttpSolrClient.Builder().withBaseSolrUrls(solrEndpoint)
.build()) {
@@ -142,19 +173,22 @@ public abstract class TikaPipesSolrTestBase {
Assert.assertEquals(numDocs,
solrClient.query(collection, new SolrQuery("content_s:*initial*")).getResults()
.getNumFound());
+ Assert.assertEquals(3,
+ solrClient.query(collection, new SolrQuery("_root_:\"test-embedded.doc\"")).getResults()
+ .getNumFound());
}
// update the documents with "update must exist" and run tika async again with "UPDATE_MUST_EXIST".
// It should not fail, and docs should be updated.
- createTestHtmlFiles("updated");
+ createTestFiles("updated");
tikaConfigXml =
createTikaConfigXml(useZk(), tikaConfigFile, log4jPropFile, tikaConfigTemplateXml,
SolrEmitter.UpdateStrategy.UPDATE_MUST_EXIST,
SolrEmitter.AttachmentStrategy.PARENT_CHILD,
- HandlerConfig.PARSE_MODE.CONCATENATE);
+ HandlerConfig.PARSE_MODE.RMETA);
FileUtils.writeStringToFile(tikaConfigFile, tikaConfigXml, StandardCharsets.UTF_8);
- TikaCLI.main(new String[]{"-a", "--config=" + tikaConfigFile.getAbsolutePath()});
+ TikaCLI.main(new String[] {"-a", "--config=" + tikaConfigFile.getAbsolutePath()});
try (SolrClient solrClient = new LBHttpSolrClient.Builder().withBaseSolrUrls(solrEndpoint)
.build()) {
diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/embedded/embedded.docx b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/embedded/embedded.docx
new file mode 100644
index 0000000..255da0f
Binary files /dev/null and b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/embedded/embedded.docx differ
diff --git a/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java b/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java
index a1388e0..6e3f3c5 100644
--- a/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java
+++ b/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java
@@ -31,9 +31,6 @@ import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.impl.LBHttpSolrClient;
import org.apache.solr.client.solrj.request.UpdateRequest;
import org.apache.solr.common.SolrInputDocument;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
import org.apache.tika.client.HttpClientFactory;
import org.apache.tika.config.Field;
import org.apache.tika.config.Initializable;
@@ -44,6 +41,8 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.pipes.emitter.AbstractEmitter;
import org.apache.tika.pipes.emitter.EmitData;
import org.apache.tika.pipes.emitter.TikaEmitterException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
public class SolrEmitter extends AbstractEmitter implements Initializable {
@@ -65,6 +64,7 @@ public class SolrEmitter extends AbstractEmitter implements Initializable {
private int connectionTimeout = 10000;
private int socketTimeout = 60000;
private SolrClient solrClient;
+
public SolrEmitter() throws TikaConfigException {
httpClientFactory = new HttpClientFactory();
}
@@ -99,7 +99,7 @@ public class SolrEmitter extends AbstractEmitter implements Initializable {
for (int i = 1; i < metadataList.size(); i++) {
SolrInputDocument childSolrInputDocument = new SolrInputDocument();
Metadata m = metadataList.get(i);
- childSolrInputDocument.setField(idField, UUID.randomUUID().toString());
+ childSolrInputDocument.setField(idField, emitKey + "_" + UUID.randomUUID().toString());
addMetadataToSolrInputDocument(m, childSolrInputDocument, updateStrategy);
solrInputDocument.addChildDocument(childSolrInputDocument);
}
@@ -164,7 +164,8 @@ public class SolrEmitter extends AbstractEmitter implements Initializable {
if (updateStrategy == UpdateStrategy.ADD) {
solrInputDocument.setField(n, vals[0]);
} else {
- solrInputDocument.setField(n, new HashMap<String, String>() {{
+ solrInputDocument.setField(n, new HashMap<String, String>() {
+ {
put("set", vals[0]);
}
});
@@ -173,7 +174,8 @@ public class SolrEmitter extends AbstractEmitter implements Initializable {
if (updateStrategy == UpdateStrategy.ADD) {
solrInputDocument.setField(n, vals);
} else {
- solrInputDocument.setField(n, new HashMap<String, String[]>() {{
+ solrInputDocument.setField(n, new HashMap<String, String[]>() {
+ {
put("set", vals);
}
});
@@ -287,7 +289,7 @@ public class SolrEmitter extends AbstractEmitter implements Initializable {
} else {
solrClient = new LBHttpSolrClient.Builder().withConnectionTimeout(connectionTimeout)
.withSocketTimeout(socketTimeout).withHttpClient(httpClientFactory.build())
- .withBaseSolrUrls(solrUrls.toArray(new String[]{})).build();
+ .withBaseSolrUrls(solrUrls.toArray(new String[] {})).build();
}
}