You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/07/22 14:34:58 UTC
[tika] branch main updated: TIKA-3490 -- fix serialization of embedded docs in opensearch emitter
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new bf82c4d TIKA-3490 -- fix serialization of embedded docs in opensearch emitter
new 018a32f Merge remote-tracking branch 'origin/main' into main
bf82c4d is described below
commit bf82c4d8fb86efc1b9743c89584ed2f4104f3715
Author: tallison <ta...@apache.org>
AuthorDate: Thu Jul 22 10:34:25 2021 -0400
TIKA-3490 -- fix serialization of embedded docs in opensearch emitter
---
CHANGES.txt | 4 ++
.../opensearch/tests/TikaPipesOpenSearchTest.java | 45 ++++++++++++------
.../test-documents/test_recursive_embedded.docx | Bin 0 -> 27082 bytes
.../tika-emitters/tika-emitter-opensearch/pom.xml | 7 +++
.../pipes/emitter/opensearch/OpenSearchClient.java | 22 +++++----
.../emitter/opensearch/OpenSearchEmitter.java | 24 ++++++++++
.../emitter/opensearch/OpenSearchClientTest.java | 51 +++++++++++++++++++++
7 files changed, 131 insertions(+), 22 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 91e3335..2a82d28 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,7 @@
+Release 2.0.1 - ???
+
+ * Fix serialization of embedded docs in OpenSearch emitter (TIKA-3490).
+
Release 2.0.0 - 07/07/2021
* Cleanup of fetcher integration with tika-server.
diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/TikaPipesOpenSearchTest.java b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/TikaPipesOpenSearchTest.java
index 63d3254..f6dfb43 100644
--- a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/TikaPipesOpenSearchTest.java
+++ b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/TikaPipesOpenSearchTest.java
@@ -22,6 +22,9 @@ import static org.junit.Assert.assertTrue;
import java.io.File;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
import java.util.regex.Matcher;
import org.apache.commons.io.IOUtils;
@@ -41,9 +44,10 @@ import org.apache.tika.pipes.emitter.opensearch.OpenSearchEmitter;
public class TikaPipesOpenSearchTest {
- private static final String collection = "testcol";
- private static final File testFileFolder = new File("target", "test-files");
- private final int numDocs = 42;
+ private static final String COLLECTION = "testcol";
+ private static final File TEST_FILE_FOLDER = new File("target", "test-files");
+ private final int numHtmlDocs = 42;
+ private int numTestDocs = 0;
protected GenericContainer<?> openSearch;
private String openSearchHost;
private int openSearchPort;
@@ -68,7 +72,7 @@ public class TikaPipesOpenSearchTest {
@AfterClass
public static void tearDown() throws Exception {
- FileUtils.deleteDirectory(testFileFolder);
+ FileUtils.deleteDirectory(TEST_FILE_FOLDER);
}
@Test
@@ -94,17 +98,25 @@ public class TikaPipesOpenSearchTest {
assertTrue(response.getJson().get("acknowledged").asBoolean());
assertEquals("testcol", response.getJson().get("index").asText());
- runPipes(OpenSearchEmitter.AttachmentStrategy.CONCATENATE_CONTENT);
+ runPipes(OpenSearchEmitter.AttachmentStrategy.SKIP);
//refresh to make sure the content is searchable
JsonResponse refresh = client.getJson(openSearchEndpoint + "/_refresh");
String query = "{ \"track_total_hits\": true, \"query\": { \"match\": { \"content\": { " +
- "\"query\": \"initial\" } } } }";
+ "\"query\": \"happiness\" } } } }";
JsonResponse results = client.postJson(openSearchEndpoint + "/_search", query);
assertEquals(200, results.getStatus());
+ //assertEquals(numHtmlDocs + numTestDocs,
+ // results.getJson().get("hits").get("total").get("value").asInt());
+
+ //now try match all
+ query = "{ \"track_total_hits\": true, \"query\": { \"match_all\": {} } }";
+ results = client.postJson(openSearchEndpoint + "/_search", query);
+ assertEquals(200, results.getStatus());
+ assertEquals(numHtmlDocs + numTestDocs,
+ results.getJson().get("hits").get("total").get("value").asInt());
- assertEquals(numDocs, results.getJson().get("hits").get("total").get("value").asInt());
}
@@ -140,7 +152,7 @@ public class TikaPipesOpenSearchTest {
.replace("{ATTACHMENT_STRATEGY}", attachmentStrategy.toString())
.replace("{LOG4J_PROPERTIES_FILE}", log4jPropFile.getAbsolutePath())
.replaceAll("\\{PATH_TO_DOCS\\}",
- Matcher.quoteReplacement(testFileFolder.getAbsolutePath()));
+ Matcher.quoteReplacement(TEST_FILE_FOLDER.getAbsolutePath()));
res = res.replace("{OPENSEARCH_CONNECTION}", openSearchEndpoint);
@@ -149,11 +161,11 @@ public class TikaPipesOpenSearchTest {
}
private void setupOpenSearch(GenericContainer<?> openSearchContainer) throws Exception {
- createTestHtmlFiles("initial");
+ createTestHtmlFiles("Happiness");
this.openSearch = openSearchContainer;
openSearchHost = openSearch.getHost();
openSearchPort = openSearch.getMappedPort(9200);
- openSearchEndpoint = "https://" + openSearchHost + ":" + openSearchPort + "/" + collection;
+ openSearchEndpoint = "https://" + openSearchHost + ":" + openSearchPort + "/" + COLLECTION;
HttpClientFactory httpClientFactory = new HttpClientFactory();
httpClientFactory.setUserName("admin");
httpClientFactory.setPassword("admin");
@@ -163,11 +175,18 @@ public class TikaPipesOpenSearchTest {
}
private void createTestHtmlFiles(String bodyContent) throws Exception {
- testFileFolder.mkdirs();
- for (int i = 0; i < numDocs; ++i) {
- FileUtils.writeStringToFile(new File(testFileFolder, "test-" + i + ".html"),
+ TEST_FILE_FOLDER.mkdirs();
+ for (int i = 0; i < numHtmlDocs; ++i) {
+ FileUtils.writeStringToFile(new File(TEST_FILE_FOLDER, "test-" + i + ".html"),
"<html><body>" + bodyContent + "</body></html>", StandardCharsets.UTF_8);
}
+ File testDocuments =
+ Paths.get(TikaPipesOpenSearchTest.class.getResource("/test-documents").toURI()).toFile();
+ for (File f : testDocuments.listFiles()) {
+ Path targ = TEST_FILE_FOLDER.toPath().resolve(f.getName());
+ Files.copy(f.toPath(), targ);
+ numTestDocs++;
+ }
}
diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/test-documents/test_recursive_embedded.docx b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/test-documents/test_recursive_embedded.docx
new file mode 100644
index 0000000..cd562cb
Binary files /dev/null and b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/test-documents/test_recursive_embedded.docx differ
diff --git a/tika-pipes/tika-emitters/tika-emitter-opensearch/pom.xml b/tika-pipes/tika-emitters/tika-emitter-opensearch/pom.xml
index 365d2cd..e0d1f4c 100644
--- a/tika-pipes/tika-emitters/tika-emitter-opensearch/pom.xml
+++ b/tika-pipes/tika-emitters/tika-emitter-opensearch/pom.xml
@@ -71,6 +71,13 @@
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
</dependencies>
<build>
diff --git a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java
index 6f98474..ab348ad 100644
--- a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java
+++ b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java
@@ -74,9 +74,9 @@ public class OpenSearchClient {
String indexJson = getBulkIndexJson(id, routing);
sb.append(indexJson).append("\n");
if (i == 0) {
- sb.append(metadataToJsonContainer(metadata));
+ sb.append(metadataToJsonContainer(metadata, attachmentStrategy));
} else {
- sb.append(metadataToJsonEmbedded(metadata, emitKey));
+ sb.append(metadataToJsonEmbedded(metadata, attachmentStrategy, emitKey));
}
sb.append("\n");
i++;
@@ -99,26 +99,30 @@ public class OpenSearchClient {
}
}
- private String metadataToJsonEmbedded(Metadata metadata, String emitKey) throws IOException {
+ protected static String metadataToJsonEmbedded(Metadata metadata,
+ OpenSearchEmitter.AttachmentStrategy attachmentStrategy,
+ String emitKey) throws IOException {
StringWriter writer = new StringWriter();
try (JsonGenerator jsonGenerator = new JsonFactory().createGenerator(writer)) {
jsonGenerator.writeStartObject();
writeMetadata(metadata, jsonGenerator);
if (attachmentStrategy == OpenSearchEmitter.AttachmentStrategy.PARENT_CHILD) {
- jsonGenerator.writeStartObject("relation_type");
+ jsonGenerator.writeObjectFieldStart("relation_type");
jsonGenerator.writeStringField("name", "embedded");
jsonGenerator.writeStringField("parent", emitKey);
+ //end the relation type object
+ jsonGenerator.writeEndObject();
}
- //end the relation type object
- jsonGenerator.writeEndObject();
//end the metadata object
jsonGenerator.writeEndObject();
}
return writer.toString();
}
- private String metadataToJsonContainer(Metadata metadata) throws IOException {
+ protected static String metadataToJsonContainer(Metadata metadata,
+ OpenSearchEmitter.AttachmentStrategy attachmentStrategy)
+ throws IOException {
StringWriter writer = new StringWriter();
try (JsonGenerator jsonGenerator = new JsonFactory().createGenerator(writer)) {
jsonGenerator.writeStartObject();
@@ -131,7 +135,7 @@ public class OpenSearchClient {
return writer.toString();
}
- private void writeMetadata(Metadata metadata, JsonGenerator jsonGenerator) throws IOException {
+ private static void writeMetadata(Metadata metadata, JsonGenerator jsonGenerator) throws IOException {
//writes the metadata without the start { or the end }
//to allow for other fields to be added
for (String n : metadata.names()) {
@@ -139,7 +143,7 @@ public class OpenSearchClient {
if (vals.length == 1) {
jsonGenerator.writeStringField(n, vals[0]);
} else {
- jsonGenerator.writeStartArray(n);
+ jsonGenerator.writeArrayFieldStart(n);
for (String v : vals) {
jsonGenerator.writeString(v);
}
diff --git a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchEmitter.java b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchEmitter.java
index fe878fe..5f61814 100644
--- a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchEmitter.java
+++ b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchEmitter.java
@@ -19,6 +19,7 @@ package org.apache.tika.pipes.emitter.opensearch;
import static org.apache.tika.config.TikaConfig.mustNotBeEmpty;
import java.io.IOException;
+import java.util.Collections;
import java.util.List;
import java.util.Map;
@@ -68,11 +69,34 @@ public class OpenSearchEmitter extends AbstractEmitter implements Initializable
return;
}
try {
+ if (attachmentStrategy == AttachmentStrategy.CONCATENATE_CONTENT) {
+ metadataList = concatenate(metadataList);
+ } else if (attachmentStrategy == AttachmentStrategy.SKIP) {
+ metadataList = Collections.singletonList(metadataList.get(0));
+ }
openSearchClient.addDocument(emitKey, metadataList);
} catch (TikaClientException e) {
throw new TikaEmitterException("failed to add document", e);
}
}
+
+ private List<Metadata> concatenate(List<Metadata> metadataList) {
+ if (metadataList.size() == 1) {
+ return metadataList;
+ }
+
+ Metadata ret = metadataList.get(0);
+ StringBuilder content = new StringBuilder();
+ for (Metadata m : metadataList) {
+ String c = m.get(getContentField());
+ if (! StringUtils.isBlank(c)) {
+ content.append(c).append("\n");
+ }
+ }
+ ret.set(getContentField(), content.toString());
+ return Collections.singletonList(ret);
+
+ }
/*
private void addMetadataAsSolrInputDocuments(String emitKey, List<Metadata> metadataList,
List<SolrInputDocument> docsToUpdate)
diff --git a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/test/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClientTest.java b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/test/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClientTest.java
new file mode 100644
index 0000000..e92607f
--- /dev/null
+++ b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/test/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClientTest.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.emitter.opensearch;
+
+import org.junit.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+
+public class OpenSearchClientTest extends TikaTest {
+
+ @Test
+ public void testSerialization() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.add("authors", "author1");
+ metadata.add("authors", "author2");
+ metadata.add("title", "title1");
+ for (OpenSearchEmitter.AttachmentStrategy strategy :
+ OpenSearchEmitter.AttachmentStrategy.values()) {
+ String json = OpenSearchClient.metadataToJsonContainer(metadata,
+ strategy);
+ assertContains("author1", json);
+ assertContains("author2", json);
+ assertContains("authors", json);
+ assertContains("title1", json);
+ }
+ for (OpenSearchEmitter.AttachmentStrategy strategy :
+ OpenSearchEmitter.AttachmentStrategy.values()) {
+ String json = OpenSearchClient.metadataToJsonEmbedded(metadata, strategy, "myEmitKey");
+ assertContains("author1", json);
+ assertContains("author2", json);
+ assertContains("authors", json);
+ assertContains("title1", json);
+ }
+
+ }
+}