You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/07/29 20:31:01 UTC
[tika] branch main updated: TIKA-3495 -- convert from anonymous to named parent-child relationship
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 97c9b94 TIKA-3495 -- convert from anonymous to named parent-child relationship
97c9b94 is described below
commit 97c9b945903229f53c2adda26e95b9f1f6bf3433
Author: tallison <ta...@apache.org>
AuthorDate: Thu Jul 29 16:27:04 2021 -0400
TIKA-3495 -- convert from anonymous to named parent-child relationship
---
.../opensearch/tests/OpenSearchTestClient.java | 5 +++--
.../opensearch/tests/TikaPipesOpenSearchTest.java | 4 +++-
.../tika/pipes/solr/tests/TikaPipesSolr6Test.java | 6 ++++++
.../pipes/solr/tests/TikaPipesSolrTestBase.java | 23 +++++++++++++++-------
.../pipes/emitter/opensearch/OpenSearchClient.java | 13 +++++++-----
.../emitter/opensearch/OpenSearchEmitter.java | 20 ++++++++++++++++++-
.../emitter/opensearch/OpenSearchClientTest.java | 3 ++-
.../tika/pipes/emitter/solr/SolrEmitter.java | 20 +++++++++++++++++--
8 files changed, 75 insertions(+), 19 deletions(-)
diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTestClient.java b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTestClient.java
index ca3abe1..27c71a9 100644
--- a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTestClient.java
+++ b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTestClient.java
@@ -43,8 +43,9 @@ import org.apache.tika.pipes.emitter.opensearch.OpenSearchEmitter;
public class OpenSearchTestClient extends OpenSearchClient {
protected OpenSearchTestClient(String openSearchUrl, HttpClient httpClient,
- OpenSearchEmitter.AttachmentStrategy attachmentStrategy) {
- super(openSearchUrl, httpClient, attachmentStrategy);
+ OpenSearchEmitter.AttachmentStrategy attachmentStrategy,
+ String embeddedFileFieldName) {
+ super(openSearchUrl, httpClient, attachmentStrategy, embeddedFileFieldName);
}
protected JsonResponse putJson(String url, String json) throws IOException {
diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/TikaPipesOpenSearchTest.java b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/TikaPipesOpenSearchTest.java
index 986ce7a..683092b 100644
--- a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/TikaPipesOpenSearchTest.java
+++ b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/TikaPipesOpenSearchTest.java
@@ -313,7 +313,9 @@ public class TikaPipesOpenSearchTest {
httpClientFactory.setPassword("admin");
//attachment strategy is not used here...TODO clean this up
client = new OpenSearchTestClient(openSearchEndpointBase,
- httpClientFactory.build(), OpenSearchEmitter.AttachmentStrategy.SEPARATE_DOCUMENTS);
+ httpClientFactory.build(),
+ OpenSearchEmitter.AttachmentStrategy.SEPARATE_DOCUMENTS,
+ OpenSearchEmitter.DEFAULT_EMBEDDED_FILE_FIELD_NAME);
}
private void createTestHtmlFiles(String bodyContent, int numHtmlDocs) throws Exception {
diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr6Test.java b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr6Test.java
index 9ec9bee..b146188 100644
--- a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr6Test.java
+++ b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr6Test.java
@@ -28,4 +28,10 @@ public class TikaPipesSolr6Test extends TikaPipesSolrTestBase {
return "solr:6";
}
+ @Override
+ public boolean handlesParentChild() {
+ //Solr 6 didn't automatically set _root_ with the parent-child indexing,
+ //apparently
+ return false;
+ }
}
diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java
index 38fd306..76278e5 100644
--- a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java
+++ b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java
@@ -16,6 +16,8 @@
*/
package org.apache.tika.pipes.solr.tests;
+import static org.junit.jupiter.api.Assumptions.assumeTrue;
+
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
@@ -35,11 +37,12 @@ import org.apache.tika.cli.TikaCLI;
import org.apache.tika.pipes.HandlerConfig;
import org.apache.tika.pipes.emitter.solr.SolrEmitter;
import org.jetbrains.annotations.NotNull;
-import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
import org.testcontainers.containers.GenericContainer;
import org.testcontainers.shaded.org.apache.commons.io.FileUtils;
import org.testcontainers.utility.DockerImageName;
@@ -59,6 +62,11 @@ public abstract class TikaPipesSolrTestBase {
public abstract String getSolrImageName();
+
+ public boolean handlesParentChild() {
+ return true;
+ }
+
@Rule
public GenericContainer<?> solrContainer =
new GenericContainer<>(DockerImageName.parse(getSolrImageName())).withExposedPorts(8983,
@@ -70,13 +78,13 @@ public abstract class TikaPipesSolrTestBase {
setupSolr(solrContainer);
}
- @After
+ @AfterEach
public void tearDown() throws Exception {
FileUtils.deleteDirectory(testFileFolder);
}
@Test
- public void testFetchIteratorWithSolrUrls() throws Exception {
+ public void testPipesIteratorWithSolrUrls() throws Exception {
runTikaAsyncSolrPipeIteratorFileFetcherSolrEmitter();
}
@@ -172,10 +180,11 @@ public abstract class TikaPipesSolrTestBase {
Assert.assertEquals(numDocs,
solrClient.query(collection, new SolrQuery("content_s:*initial*")).getResults()
.getNumFound());
- Assert.assertEquals(3,
- solrClient.query(collection, new SolrQuery("_root_:\"test-embedded.docx\"")).getResults()
- .getNumFound());
-
+ if(handlesParentChild()) {
+ Assert.assertEquals(3,
+ solrClient.query(collection, new SolrQuery("_root_:\"test-embedded.docx\""))
+ .getResults().getNumFound());
+ }
//clean up test-embedded.docx so that the iterator won't try to update its children
//in the next test
solrClient.deleteById(collection, "_root_:\"test-embedded.docx\"");
diff --git a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java
index b30a648..208b431 100644
--- a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java
+++ b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java
@@ -52,12 +52,14 @@ public class OpenSearchClient {
protected final String openSearchUrl;
protected final HttpClient httpClient;
private final OpenSearchEmitter.AttachmentStrategy attachmentStrategy;
-
+ private final String embeddedFileFieldName;
protected OpenSearchClient(String openSearchUrl, HttpClient httpClient,
- OpenSearchEmitter.AttachmentStrategy attachmentStrategy) {
+ OpenSearchEmitter.AttachmentStrategy attachmentStrategy,
+ String embeddedFileFieldName) {
this.openSearchUrl = openSearchUrl;
this.httpClient = httpClient;
this.attachmentStrategy = attachmentStrategy;
+ this.embeddedFileFieldName = embeddedFileFieldName;
}
public void addDocument(String emitKey, List<Metadata> metadataList) throws IOException,
@@ -77,7 +79,8 @@ public class OpenSearchClient {
if (i == 0) {
sb.append(metadataToJsonContainer(metadata, attachmentStrategy));
} else {
- sb.append(metadataToJsonEmbedded(metadata, attachmentStrategy, emitKey));
+ sb.append(metadataToJsonEmbedded(metadata, attachmentStrategy,
+ emitKey, embeddedFileFieldName));
}
sb.append("\n");
i++;
@@ -102,7 +105,7 @@ public class OpenSearchClient {
protected static String metadataToJsonEmbedded(Metadata metadata,
OpenSearchEmitter.AttachmentStrategy attachmentStrategy,
- String emitKey) throws IOException {
+ String emitKey, String embeddedFileFieldName) throws IOException {
StringWriter writer = new StringWriter();
try (JsonGenerator jsonGenerator = new JsonFactory().createGenerator(writer)) {
jsonGenerator.writeStartObject();
@@ -110,7 +113,7 @@ public class OpenSearchClient {
writeMetadata(metadata, jsonGenerator);
if (attachmentStrategy == OpenSearchEmitter.AttachmentStrategy.PARENT_CHILD) {
jsonGenerator.writeObjectFieldStart("relation_type");
- jsonGenerator.writeStringField("name", "embedded");
+ jsonGenerator.writeStringField("name", embeddedFileFieldName);
jsonGenerator.writeStringField("parent", emitKey);
//end the relation type object
jsonGenerator.writeEndObject();
diff --git a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchEmitter.java b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchEmitter.java
index 208f138..8fe1849 100644
--- a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchEmitter.java
+++ b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchEmitter.java
@@ -46,6 +46,7 @@ public class OpenSearchEmitter extends AbstractEmitter implements Initializable
//anything else?
}
+ public static String DEFAULT_EMBEDDED_FILE_FIELD_NAME = "embedded";
private static final Logger LOG = LoggerFactory.getLogger(OpenSearchEmitter.class);
private AttachmentStrategy attachmentStrategy = AttachmentStrategy.PARENT_CHILD;
@@ -54,6 +55,7 @@ public class OpenSearchEmitter extends AbstractEmitter implements Initializable
private int commitWithin = 1000;
private OpenSearchClient openSearchClient;
private final HttpClientFactory httpClientFactory;
+ private String embeddedFileFieldName = DEFAULT_EMBEDDED_FILE_FIELD_NAME;
public OpenSearchEmitter() throws TikaConfigException {
httpClientFactory = new HttpClientFactory();
@@ -157,13 +159,29 @@ public class OpenSearchEmitter extends AbstractEmitter implements Initializable
httpClientFactory.setProxyPort(proxyPort);
}
+ /**
+ * If using the {@link AttachmentStrategy#PARENT_CHILD}, this is the field name
+ * used to store the child documents. Note that we artificially flatten all embedded
+ * documents, no matter how nested in the container document, into direct children
+ * of the root document.
+ *
+ * @param embeddedFileFieldName
+ */
+ @Field
+ public void setEmbeddedFileFieldName(String embeddedFileFieldName) {
+ this.embeddedFileFieldName = embeddedFileFieldName;
+ }
+
+
@Override
public void initialize(Map<String, Param> params) throws TikaConfigException {
if (StringUtils.isBlank(openSearchUrl)) {
throw new TikaConfigException("Must specify an open search url!");
} else {
openSearchClient =
- new OpenSearchClient(openSearchUrl, httpClientFactory.build(), attachmentStrategy);
+ new OpenSearchClient(openSearchUrl,
+ httpClientFactory.build(), attachmentStrategy,
+ embeddedFileFieldName);
}
}
diff --git a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/test/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClientTest.java b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/test/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClientTest.java
index 34653dc..638533c 100644
--- a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/test/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClientTest.java
+++ b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/test/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClientTest.java
@@ -40,7 +40,8 @@ public class OpenSearchClientTest extends TikaTest {
}
for (OpenSearchEmitter.AttachmentStrategy strategy :
OpenSearchEmitter.AttachmentStrategy.values()) {
- String json = OpenSearchClient.metadataToJsonEmbedded(metadata, strategy, "myEmitKey");
+ String json = OpenSearchClient.metadataToJsonEmbedded(metadata, strategy,
+ "myEmitKey", OpenSearchEmitter.DEFAULT_EMBEDDED_FILE_FIELD_NAME);
assertContains("author1", json);
assertContains("author2", json);
assertContains("authors", json);
diff --git a/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java b/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java
index 8e1615b..eb625ee 100644
--- a/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java
+++ b/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java
@@ -48,6 +48,7 @@ import org.apache.tika.pipes.emitter.TikaEmitterException;
public class SolrEmitter extends AbstractEmitter implements Initializable {
+ public static String DEFAULT_EMBEDDED_FILE_FIELD_NAME = "embedded";
private static final Logger LOG = LoggerFactory.getLogger(SolrEmitter.class);
private final HttpClientFactory httpClientFactory;
private AttachmentStrategy attachmentStrategy = AttachmentStrategy.PARENT_CHILD;
@@ -59,12 +60,12 @@ public class SolrEmitter extends AbstractEmitter implements Initializable {
private List<String> solrUrls;
private List<String> solrZkHosts;
private String solrZkChroot;
- private String contentField = "content";
private String idField = "id";
private int commitWithin = 1000;
private int connectionTimeout = 10000;
private int socketTimeout = 60000;
private SolrClient solrClient;
+ private String embeddedFileFieldName = DEFAULT_EMBEDDED_FILE_FIELD_NAME;
public SolrEmitter() throws TikaConfigException {
httpClientFactory = new HttpClientFactory();
@@ -97,14 +98,16 @@ public class SolrEmitter extends AbstractEmitter implements Initializable {
docsToUpdate.add(solrInputDocument);
} else if (attachmentStrategy == AttachmentStrategy.PARENT_CHILD) {
addMetadataToSolrInputDocument(metadataList.get(0), solrInputDocument, updateStrategy);
+ List<SolrInputDocument> children = new ArrayList<>();
for (int i = 1; i < metadataList.size(); i++) {
SolrInputDocument childSolrInputDocument = new SolrInputDocument();
Metadata m = metadataList.get(i);
childSolrInputDocument
.setField(idField, emitKey + "-" + UUID.randomUUID().toString());
addMetadataToSolrInputDocument(m, childSolrInputDocument, updateStrategy);
- solrInputDocument.addChildDocument(childSolrInputDocument);
+ children.add(childSolrInputDocument);
}
+ solrInputDocument.setField(embeddedFileFieldName, children);
docsToUpdate.add(solrInputDocument);
} else if (attachmentStrategy == AttachmentStrategy.SEPARATE_DOCUMENTS) {
addMetadataToSolrInputDocument(metadataList.get(0), solrInputDocument, updateStrategy);
@@ -282,6 +285,19 @@ public class SolrEmitter extends AbstractEmitter implements Initializable {
httpClientFactory.setProxyPort(proxyPort);
}
+ /**
+ * If using the {@link AttachmentStrategy#PARENT_CHILD}, this is the field name
+ * used to store the child documents. Note that we artificially flatten all embedded
+ * documents, no matter how nested in the container document, into direct children
+ * of the root document.
+ *
+ * @param embeddedFileFieldName
+ */
+ @Field
+ public void setEmbeddedFileFieldName(String embeddedFileFieldName) {
+ this.embeddedFileFieldName = embeddedFileFieldName;
+ }
+
@Override
public void initialize(Map<String, Param> params) throws TikaConfigException {
if (solrUrls == null || solrUrls.isEmpty()) {