You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/07/02 20:43:08 UTC
[tika] branch TIKA-3440 updated (586a823 -> 52380e4)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a change to branch TIKA-3440
in repository https://gitbox.apache.org/repos/asf/tika.git.
from 586a823 TIKA-3440 -- WIP do not merge
add 51fb33f TIKA-3441 -- increase chances that TesseractOCRParser will not orphan tesseract process and ensure that tika-server never enters an infinite loop on a bind exception.
add 6d4cdd9 add note about log4j2 in breaking changes
add b284e7c TIKA-3444 upgrade to PDFBox 2.0.24
add dbebeb5 TIKA-3412 --
add f230a9a TIKA-3412 -- checkstyle and logic improvements
add 9a05b2e TIKA-3412 -- fix thread safety issue with decimalformat
add 287e1f2 TIKA-3449 -- remove sannies iso parser
add 68c29bc move server and parser info to the right place in CHANGES.txt
add b5e78ac TIKA-3456 -- LanguageDetector should try to respect "hasEnoughText" if the user adds a large string
add 5f9e4f1 TIKA-3457 -- general upgrades
add 27202a2 mp4parser clean up
add c7fd0cb improve diffs in handling EOF in Java 8 vs Java 11 in the AIFFReader.
add cb94a90 fix java 11 compatibility -- need to create ArrayList to allow for .add()
add acaa2ff fix dependencies in tika-eval-app
add 17ed7e8 fix counting in tika-eval FileResourceConsumer
add 1b3ff12 TIKA-307 #334 via jendabenda
new 59cd309 Merge remote-tracking branch 'origin/main' into TIKA-3440
new 52380e4 TIKA-3440 -- first real draft of OpenSearch integration
The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails. The revisions
listed as "add" were already present in the repository and have only
been added to this reference.
Summary of changes:
CHANGES.txt | 27 +-
tika-app/pom.xml | 1 +
tika-batch/pom.xml | 2 +-
.../apache/tika/batch/FileResourceConsumer.java | 4 +-
tika-bundles/tika-bundle-standard/pom.xml | 16 +-
tika-core/pom.xml | 2 +-
.../tika/language/detect/LanguageDetector.java | 20 +-
.../apache/tika/metadata/TikaCoreProperties.java | 5 +-
.../tika/parser/AbstractExternalProcessParser.java | 57 +++
.../java/org/apache/tika/pipes/PipesClient.java | 4 +-
.../org/apache/tika/pipes/fetcher/FetchKey.java | 2 +-
tika-eval/pom.xml | 2 +-
tika-eval/tika-eval-app/pom.xml | 64 +++-
.../java/org/apache/tika/eval/app/TikaEvalCLI.java | 2 +-
tika-eval/tika-eval-core/pom.xml | 26 +-
tika-example/pom.xml | 2 +-
tika-langdetect/pom.xml | 2 +-
tika-parent/pom.xml | 40 +-
.../tika-parser-scientific-module/pom.xml | 5 +
.../tika-parser-sqlite3-module/pom.xml | 2 +
.../tika-parsers-ml/tika-age-recogniser/pom.xml | 2 +-
.../tika-parser-audiovideo-module/pom.xml | 6 +-
.../org/apache/tika/parser/audio/AudioParser.java | 9 +
.../apache/tika/parser/mp4/ISO6709Extractor.java | 93 -----
.../java/org/apache/tika/parser/mp4/MP4Parser.java | 412 +++++++++------------
.../apache/tika/parser/mp4/TikaMp4BoxHandler.java | 79 ++++
.../tika/parser/mp4/boxes/TikaUserDataBox.java | 278 ++++++++++++++
.../org/apache/tika/parser/mp4/MP4ParserTest.java | 147 +++++++-
.../tika-parser-code-module/pom.xml | 2 +-
.../tika-parser-image-module/pom.xml | 2 +-
.../apache/tika/parser/ocr/TesseractOCRParser.java | 9 +-
.../org/apache/tika/parser/pkg/PackageParser.java | 3 +-
.../tika-parsers-standard-package/pom.xml | 11 +
.../src/main/resources}/META-INF/LICENSE | 0
tika-pipes/pom.xml | 2 +-
.../pipes/emitter/opensearch/OpenSearchClient.java | 38 +-
.../emitter/opensearch/OpenSearchEmitter.java | 47 +--
.../src/test/resources/log4j.properties | 28 --
.../test/resources/tika-config-simple-emitter.xml | 15 +-
tika-pipes/tika-pipes-integration-tests/pom.xml | 10 +
.../apache/tika/pipes/PipeIntegrationTests.java | 2 +-
.../tika/pipes/opensearch/OpenSearchTest.java | 26 --
.../pipes/opensearch/OpenSearchTestClient.java | 121 ++++++
.../pipes/opensearch/TikaPipesOpenSearchTest.java | 172 +++++++++
.../resources/opensearch/opensearch-mappings.json | 16 +
.../opensearch-parent-child-mappings.json | 26 ++
.../tika-config-opensearch.xml} | 45 +--
tika-serialization/pom.xml | 2 +-
tika-server/pom.xml | 2 +-
tika-server/tika-server-core/pom.xml | 1 +
.../apache/tika/server/core/TikaServerProcess.java | 45 ++-
.../tika/server/core/TikaServerWatchDog.java | 36 +-
tika-server/tika-server-standard/pom.xml | 1 +
53 files changed, 1416 insertions(+), 557 deletions(-)
create mode 100644 tika-core/src/main/java/org/apache/tika/parser/AbstractExternalProcessParser.java
delete mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/ISO6709Extractor.java
create mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/TikaMp4BoxHandler.java
create mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/boxes/TikaUserDataBox.java
copy {tika-app/src/main/appended-resources => tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/main/resources}/META-INF/LICENSE (100%)
delete mode 100644 tika-pipes/tika-emitters/tika-emitter-opensearch/src/test/resources/log4j.properties
delete mode 100644 tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/OpenSearchTest.java
create mode 100644 tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/OpenSearchTestClient.java
create mode 100644 tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/TikaPipesOpenSearchTest.java
create mode 100644 tika-pipes/tika-pipes-integration-tests/src/test/resources/opensearch/opensearch-mappings.json
create mode 100644 tika-pipes/tika-pipes-integration-tests/src/test/resources/opensearch/opensearch-parent-child-mappings.json
copy tika-pipes/tika-pipes-integration-tests/src/test/resources/{tika-config-solr-urls.xml => opensearch/tika-config-opensearch.xml} (76%)
[tika] 02/02: TIKA-3440 -- first real draft of OpenSearch integration
Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-3440
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 52380e4277095590a8b8db709fbaba3b5f8add3d
Author: tallison <ta...@apache.org>
AuthorDate: Fri Jul 2 16:42:42 2021 -0400
TIKA-3440 -- first real draft of OpenSearch integration
---
.../pipes/emitter/opensearch/OpenSearchClient.java | 38 +++--
.../emitter/opensearch/OpenSearchEmitter.java | 47 +++---
.../src/test/resources/log4j.properties | 28 ----
.../test/resources/tika-config-simple-emitter.xml | 15 +-
tika-pipes/tika-pipes-integration-tests/pom.xml | 6 +
.../apache/tika/pipes/PipeIntegrationTests.java | 2 +-
.../tika/pipes/opensearch/OpenSearchTest.java | 26 ----
.../pipes/opensearch/OpenSearchTestClient.java | 121 +++++++++++++++
.../pipes/opensearch/TikaPipesOpenSearchTest.java | 172 +++++++++++++++++++++
.../resources/opensearch/opensearch-mappings.json | 16 ++
.../opensearch-parent-child-mappings.json | 26 ++++
.../opensearch/tika-config-opensearch.xml | 113 ++++++++++++++
12 files changed, 502 insertions(+), 108 deletions(-)
diff --git a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java
index 3e60c38..6f98474 100644
--- a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java
+++ b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java
@@ -48,24 +48,30 @@ public class OpenSearchClient {
//this includes the full url and the index, should not end in /
//e.g. https://localhost:9200/my-index
- private final String openSearchUrl;
- private final HttpClient httpClient;
+ protected final String openSearchUrl;
+ protected final HttpClient httpClient;
+ private final OpenSearchEmitter.AttachmentStrategy attachmentStrategy;
- private OpenSearchClient(String openSearchUrl, HttpClient httpClient) {
+ protected OpenSearchClient(String openSearchUrl, HttpClient httpClient,
+ OpenSearchEmitter.AttachmentStrategy attachmentStrategy) {
this.openSearchUrl = openSearchUrl;
this.httpClient = httpClient;
+ this.attachmentStrategy = attachmentStrategy;
}
public void addDocument(String emitKey, List<Metadata> metadataList) throws IOException,
TikaClientException {
StringBuilder sb = new StringBuilder();
int i = 0;
+ String routing = (attachmentStrategy == OpenSearchEmitter.AttachmentStrategy.PARENT_CHILD) ?
+ emitKey : null;
+
for (Metadata metadata : metadataList) {
String id = emitKey;
if (i > 0) {
id += "-" + i;
}
- String indexJson = getBulkIndexJson(id, emitKey);
+ String indexJson = getBulkIndexJson(id, routing);
sb.append(indexJson).append("\n");
if (i == 0) {
sb.append(metadataToJsonContainer(metadata));
@@ -75,9 +81,11 @@ public class OpenSearchClient {
sb.append("\n");
i++;
}
- //System.out.println(sb.toString());
- String requestUrl = openSearchUrl + "/bulk?routing=" + URLEncoder
- .encode(emitKey, StandardCharsets.UTF_8.name());
+ String requestUrl = openSearchUrl + "/_bulk";
+ if (attachmentStrategy == OpenSearchEmitter.AttachmentStrategy.PARENT_CHILD) {
+ requestUrl += "?routing=" + URLEncoder.encode(emitKey, StandardCharsets.UTF_8.name());
+ }
+
JsonResponse response = postJson(requestUrl, sb.toString());
if (response.getStatus() != 200) {
throw new TikaClientException(response.getMsg());
@@ -97,9 +105,11 @@ public class OpenSearchClient {
jsonGenerator.writeStartObject();
writeMetadata(metadata, jsonGenerator);
- jsonGenerator.writeStartObject("relation_type");
- jsonGenerator.writeStringField("name", "embedded");
- jsonGenerator.writeStringField("parent", emitKey);
+ if (attachmentStrategy == OpenSearchEmitter.AttachmentStrategy.PARENT_CHILD) {
+ jsonGenerator.writeStartObject("relation_type");
+ jsonGenerator.writeStringField("name", "embedded");
+ jsonGenerator.writeStringField("parent", emitKey);
+ }
//end the relation type object
jsonGenerator.writeEndObject();
//end the metadata object
@@ -113,7 +123,9 @@ public class OpenSearchClient {
try (JsonGenerator jsonGenerator = new JsonFactory().createGenerator(writer)) {
jsonGenerator.writeStartObject();
writeMetadata(metadata, jsonGenerator);
- jsonGenerator.writeStringField("relation_type", "container");
+ if (attachmentStrategy == OpenSearchEmitter.AttachmentStrategy.PARENT_CHILD) {
+ jsonGenerator.writeStringField("relation_type", "container");
+ }
jsonGenerator.writeEndObject();
}
return writer.toString();
@@ -140,7 +152,7 @@ public class OpenSearchClient {
StringWriter writer = new StringWriter();
try (JsonGenerator jsonGenerator = new JsonFactory().createGenerator(writer)) {
jsonGenerator.writeStartObject();
- jsonGenerator.writeStartObject("index");
+ jsonGenerator.writeObjectFieldStart("index");
jsonGenerator.writeStringField("_id", id);
if (!StringUtils.isEmpty(routing)) {
jsonGenerator.writeStringField("routing", routing);
@@ -152,7 +164,7 @@ public class OpenSearchClient {
return writer.toString();
}
- protected JsonResponse postJson(String url, String json) throws IOException {
+ public JsonResponse postJson(String url, String json) throws IOException {
HttpPost httpRequest = new HttpPost(url);
ByteArrayEntity entity = new ByteArrayEntity(json.getBytes(StandardCharsets.UTF_8));
httpRequest.setEntity(entity);
diff --git a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchEmitter.java b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchEmitter.java
index b5b9820..fe878fe 100644
--- a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchEmitter.java
+++ b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchEmitter.java
@@ -19,17 +19,14 @@ package org.apache.tika.pipes.emitter.opensearch;
import static org.apache.tika.config.TikaConfig.mustNotBeEmpty;
import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
import java.util.List;
import java.util.Map;
-import java.util.Optional;
-import java.util.UUID;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.tika.client.HttpClientFactory;
+import org.apache.tika.client.TikaClientException;
import org.apache.tika.config.Field;
import org.apache.tika.config.Initializable;
import org.apache.tika.config.InitializableProblemHandler;
@@ -37,18 +34,22 @@ import org.apache.tika.config.Param;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.pipes.emitter.AbstractEmitter;
-import org.apache.tika.pipes.emitter.EmitData;
import org.apache.tika.pipes.emitter.TikaEmitterException;
import org.apache.tika.utils.StringUtils;
public class OpenSearchEmitter extends AbstractEmitter implements Initializable {
+
+ public enum AttachmentStrategy {
+ SKIP, CONCATENATE_CONTENT, PARENT_CHILD,
+ //anything else?
+ }
+
private static final Logger LOG = LoggerFactory.getLogger(OpenSearchEmitter.class);
private AttachmentStrategy attachmentStrategy = AttachmentStrategy.PARENT_CHILD;
- private String solrCollection;
- private String openSearchUrl;
+ private String openSearchUrl = null;
private String contentField = "content";
private String idField = "_id";
private int commitWithin = 1000;
@@ -66,9 +67,13 @@ public class OpenSearchEmitter extends AbstractEmitter implements Initializable
LOG.warn("metadataList is null or empty");
return;
}
- openSearchClient.addDocument(emitKey, metadataList);
+ try {
+ openSearchClient.addDocument(emitKey, metadataList);
+ } catch (TikaClientException e) {
+ throw new TikaEmitterException("failed to add document", e);
+ }
}
-
+/*
private void addMetadataAsSolrInputDocuments(String emitKey, List<Metadata> metadataList,
List<SolrInputDocument> docsToUpdate)
throws IOException, TikaEmitterException {
@@ -167,7 +172,7 @@ public class OpenSearchEmitter extends AbstractEmitter implements Initializable
}
}
}
- }
+ }*/
/**
* Options: SKIP, CONCATENATE_CONTENT, PARENT_CHILD. Default is "PARENT_CHILD".
@@ -233,6 +238,7 @@ public class OpenSearchEmitter extends AbstractEmitter implements Initializable
this.idField = idField;
}
+ //this is the full url, including the collection, e.g. https://localhost:9200/my-collection
@Field
public void setOpenSearchUrl(String openSearchUrl) {
this.openSearchUrl = openSearchUrl;
@@ -270,30 +276,15 @@ public class OpenSearchEmitter extends AbstractEmitter implements Initializable
throw new TikaConfigException("Must specify an open search url!");
} else {
openSearchClient =
- new OpenSearchClient(openSearchUrl, httpClientFactory.build());
+ new OpenSearchClient(openSearchUrl, httpClientFactory.build(), attachmentStrategy);
}
}
@Override
public void checkInitialization(InitializableProblemHandler problemHandler)
throws TikaConfigException {
- mustNotBeEmpty("solrCollection", this.solrCollection);
- mustNotBeEmpty("urlFieldName", this.idField);
- if ((this.solrUrls == null || this.solrUrls.isEmpty()) &&
- (this.solrZkHosts == null || this.solrZkHosts.isEmpty())) {
- throw new IllegalArgumentException(
- "expected either param solrUrls or param solrZkHosts, but neither was specified");
- }
- if (this.solrUrls != null && !this.solrUrls.isEmpty() && this.solrZkHosts != null &&
- !this.solrZkHosts.isEmpty()) {
- throw new IllegalArgumentException(
- "expected either param solrUrls or param solrZkHosts, but both were specified");
- }
- }
-
- public enum AttachmentStrategy {
- SKIP, CONCATENATE_CONTENT, PARENT_CHILD,
- //anything else?
+ mustNotBeEmpty("openSearchUrl", this.openSearchUrl);
+ mustNotBeEmpty("idField", this.idField);
}
}
diff --git a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/test/resources/log4j.properties b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/test/resources/log4j.properties
deleted file mode 100644
index d17a4a1..0000000
--- a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/test/resources/log4j.properties
+++ /dev/null
@@ -1,28 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-status=info
-name=PropertiesConfig
-filters=threshold
-filter.threshold.type=ThresholdFilter
-filter.threshold.level=info
-appenders=console
-appender.console.type=Console
-appender.console.name=STDERR
-appender.console.layout.type=PatternLayout
-appender.console.layout.pattern=%-5p [%t] %d{HH:mm:ss,SSS} %c %m%n
-rootLogger.level=info
-rootLogger.appenderRefs=stderr
-rootLogger.appenderRef.stderr.ref=STDERR
diff --git a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/test/resources/tika-config-simple-emitter.xml b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/test/resources/tika-config-simple-emitter.xml
index c52da5e..7959bf6 100644
--- a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/test/resources/tika-config-simple-emitter.xml
+++ b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/test/resources/tika-config-simple-emitter.xml
@@ -30,23 +30,14 @@
</metadataFilter>
</metadataFilters>
<emitters>
- <emitter class="org.apache.tika.pipes.emitter.solr.SolrEmitter">
+ <emitter class="org.apache.tika.pipes.emitter.opensearch.OpenSearchEmitter">
<params>
- <name>solr1</name>
- <url>http://localhost:8983/solr/tika-test</url>
+ <name>opensearch1</name>
+ <url>http://localhost:9200/tika-test</url>
<attachmentStrategy>concatenate-content</attachmentStrategy>
<contentField>content</contentField>
<commitWithin>10</commitWithin>
</params>
</emitter>
- <emitter class="org.apache.tika.pipes.emitter.solr.SolrEmitter">
- <params>
- <name>solr2</name>
- <url>http://localhost:8983/solr/tika-test</url>
- <attachmentStrategy>parent-child</attachmentStrategy>
- <contentField>content</contentField>
- <commitWithin>10</commitWithin>
- </params>
- </emitter>
</emitters>
</properties>
\ No newline at end of file
diff --git a/tika-pipes/tika-pipes-integration-tests/pom.xml b/tika-pipes/tika-pipes-integration-tests/pom.xml
index d4db22b..db5805d 100644
--- a/tika-pipes/tika-pipes-integration-tests/pom.xml
+++ b/tika-pipes/tika-pipes-integration-tests/pom.xml
@@ -141,6 +141,12 @@
</exclusions>
<scope>test</scope>
</dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-emitter-opensearch</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
</dependencies>
</project>
diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/PipeIntegrationTests.java b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/PipeIntegrationTests.java
index 7cad1da..8112142 100644
--- a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/PipeIntegrationTests.java
+++ b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/PipeIntegrationTests.java
@@ -50,7 +50,7 @@ import org.apache.tika.pipes.fetcher.Fetcher;
import org.apache.tika.pipes.fetcher.FetcherManager;
import org.apache.tika.pipes.pipesiterator.PipesIterator;
-@Ignore("turn these into actual tests")
+@Ignore("turn these into actual tests with mock s3")
public class PipeIntegrationTests {
private static final Path OUTDIR = Paths.get("");
diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/OpenSearchTest.java b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/OpenSearchTest.java
deleted file mode 100644
index 3d22250..0000000
--- a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/OpenSearchTest.java
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.pipes.opensearch;
-
-import org.junit.Test;
-
-public class OpenSearchTest {
- @Test
- public void testOne() throws Exception {
-
- }
-}
diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/OpenSearchTestClient.java b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/OpenSearchTestClient.java
new file mode 100644
index 0000000..297891e
--- /dev/null
+++ b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/OpenSearchTestClient.java
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.opensearch;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.apache.http.HttpResponse;
+import org.apache.http.client.HttpClient;
+import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.client.methods.HttpPut;
+import org.apache.http.entity.ByteArrayEntity;
+import org.apache.http.util.EntityUtils;
+
+import org.apache.tika.pipes.emitter.opensearch.JsonResponse;
+import org.apache.tika.pipes.emitter.opensearch.OpenSearchClient;
+import org.apache.tika.pipes.emitter.opensearch.OpenSearchEmitter;
+
+/**
+ * This expands on the OpenSearchClient for testing purposes.
+ * This has more functionality than is needed for sending docs to OpenSearch
+ */
+public class OpenSearchTestClient extends OpenSearchClient {
+
+ protected OpenSearchTestClient(String openSearchUrl, HttpClient httpClient,
+ OpenSearchEmitter.AttachmentStrategy attachmentStrategy) {
+ super(openSearchUrl, httpClient, attachmentStrategy);
+ }
+
+ protected JsonResponse putJson(String url, String json) throws IOException {
+ HttpPut httpRequest = new HttpPut(url);
+ ByteArrayEntity entity = new ByteArrayEntity(json.getBytes(StandardCharsets.UTF_8));
+ httpRequest.setEntity(entity);
+ httpRequest.setHeader("Accept", "application/json");
+ httpRequest.setHeader("Content-type", "application/json; charset=utf-8");
+ //At one point, this was required because of connection already
+ // bound exceptions on windows :(
+ //httpPost.setHeader("Connection", "close");
+
+ //try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
+
+ HttpResponse response = null;
+ try {
+ response = httpClient.execute(httpRequest);
+ int status = response.getStatusLine().getStatusCode();
+ if (status == 200) {
+ try (Reader reader = new BufferedReader(
+ new InputStreamReader(response.getEntity().getContent(),
+ StandardCharsets.UTF_8))) {
+ ObjectMapper mapper = new ObjectMapper();
+ JsonNode node = mapper.readTree(reader);
+ return new JsonResponse(200, node);
+ }
+ } else {
+ return new JsonResponse(status,
+ new String(EntityUtils.toByteArray(response.getEntity()),
+ StandardCharsets.UTF_8));
+ }
+ } finally {
+ if (response != null && response instanceof CloseableHttpResponse) {
+ ((CloseableHttpResponse)response).close();
+ }
+ httpRequest.releaseConnection();
+ }
+ }
+
+ protected JsonResponse getJson(String url) throws IOException {
+ HttpGet httpRequest = new HttpGet(url);
+ httpRequest.setHeader("Accept", "application/json");
+ httpRequest.setHeader("Content-type", "application/json; charset=utf-8");
+ //At one point, this was required because of connection already
+ // bound exceptions on windows :(
+ //httpPost.setHeader("Connection", "close");
+
+ //try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
+
+ HttpResponse response = null;
+ try {
+ response = httpClient.execute(httpRequest);
+ int status = response.getStatusLine().getStatusCode();
+ if (status == 200) {
+ try (Reader reader = new BufferedReader(
+ new InputStreamReader(response.getEntity().getContent(),
+ StandardCharsets.UTF_8))) {
+ ObjectMapper mapper = new ObjectMapper();
+ JsonNode node = mapper.readTree(reader);
+ return new JsonResponse(200, node);
+ }
+ } else {
+ return new JsonResponse(status,
+ new String(EntityUtils.toByteArray(response.getEntity()),
+ StandardCharsets.UTF_8));
+ }
+ } finally {
+ if (response != null && response instanceof CloseableHttpResponse) {
+ ((CloseableHttpResponse)response).close();
+ }
+ httpRequest.releaseConnection();
+ }
+ }
+}
diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/TikaPipesOpenSearchTest.java b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/TikaPipesOpenSearchTest.java
new file mode 100644
index 0000000..a6b6a5e
--- /dev/null
+++ b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/TikaPipesOpenSearchTest.java
@@ -0,0 +1,172 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.opensearch;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+
+import org.apache.commons.io.IOUtils;
+import org.jetbrains.annotations.NotNull;
+import org.junit.AfterClass;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.testcontainers.containers.GenericContainer;
+import org.testcontainers.shaded.org.apache.commons.io.FileUtils;
+import org.testcontainers.utility.DockerImageName;
+
+import org.apache.tika.cli.TikaCLI;
+import org.apache.tika.client.HttpClientFactory;
+import org.apache.tika.pipes.PipeIntegrationTests;
+import org.apache.tika.pipes.emitter.opensearch.JsonResponse;
+import org.apache.tika.pipes.emitter.opensearch.OpenSearchEmitter;
+
+public class TikaPipesOpenSearchTest {
+
+ private static final String collection = "testcol";
+ private static final File testFileFolder = new File("target", "test-files");
+ private final int numDocs = 42;
+ protected GenericContainer<?> openSearch;
+ private String openSearchHost;
+ private int openSearchPort;
+ //this includes the collection, e.g. https://localhost:49213/testcol
+ private String openSearchEndpoint;
+ private OpenSearchTestClient client;
+
+ @Rule
+ public GenericContainer<?> openSearchContainer =
+ new GenericContainer<>(DockerImageName.parse(getOpenSearchImageName()))
+ .withExposedPorts(9200)
+ .withEnv("discovery.type", "single-node");
+
+ private String getOpenSearchImageName() {
+ return "opensearchproject/opensearch:1.0.0-rc1";
+ }
+
+ @Before
+ public void setupTest() throws Exception {
+ setupOpenSearch(openSearchContainer);
+ }
+
+ @AfterClass
+ public static void tearDown() throws Exception {
+ FileUtils.deleteDirectory(testFileFolder);
+ }
+
+ @Test
+ public void testFSToOpenSearch() throws Exception {
+ //create the collection with mappings
+ String mappings = IOUtils.toString(TikaPipesOpenSearchTest.class.getResourceAsStream(
+ "/opensearch/opensearch-mappings.json"), StandardCharsets.UTF_8);
+ int status = -1;
+ int tries = 0;
+ JsonResponse response = null;
+ //need to wait a bit sometimes before OpenSearch is up
+ while (status != 200 && tries++ < 3) {
+ response = client.putJson(openSearchEndpoint, mappings);
+ if (status != 200) {
+ Thread.sleep(1000);
+ }
+ status = response.getStatus();
+ }
+ if (status != 200) {
+ throw new IllegalArgumentException("couldn't create index/add mappings");
+ }
+ assertTrue(response.getJson().get("acknowledged").asBoolean());
+ assertEquals("testcol", response.getJson().get("index").asText());
+
+ runPipes(OpenSearchEmitter.AttachmentStrategy.CONCATENATE_CONTENT);
+ //refresh to make sure the content is searchable
+ JsonResponse refresh = client.getJson(openSearchEndpoint + "/_refresh");
+
+ String query = "{ \"track_total_hits\": true, \"query\": { \"match\": { \"content\": { " +
+ "\"query\": \"initial\" } } } }";
+
+ JsonResponse results = client.postJson(openSearchEndpoint + "/_search", query);
+ assertEquals(200, results.getStatus());
+
+ assertEquals(numDocs, results.getJson().get("hits").get("total").get("value").asInt());
+
+ }
+
+ private void runPipes(OpenSearchEmitter.AttachmentStrategy attachmentStrategy) throws Exception {
+
+ File tikaConfigFile = new File("target", "ta-opensearch.xml");
+ File log4jPropFile = new File("target", "tmp-log4j2.xml");
+ try (InputStream is = PipeIntegrationTests.class
+ .getResourceAsStream("/pipes-fork-server-custom-log4j2.xml")) {
+ FileUtils.copyInputStreamToFile(is, log4jPropFile);
+ }
+ String tikaConfigTemplateXml;
+ try (InputStream is = PipeIntegrationTests.class
+ .getResourceAsStream("/opensearch/tika-config-opensearch.xml")) {
+ tikaConfigTemplateXml = IOUtils.toString(is, StandardCharsets.UTF_8);
+ }
+
+ String tikaConfigXml =
+ createTikaConfigXml(tikaConfigFile, log4jPropFile, tikaConfigTemplateXml,
+ attachmentStrategy);
+ FileUtils.writeStringToFile(tikaConfigFile, tikaConfigXml, StandardCharsets.UTF_8);
+
+ TikaCLI.main(new String[]{"-a", "--config=" + tikaConfigFile.getAbsolutePath()});
+
+
+ }
+
+ @NotNull
+ private String createTikaConfigXml(File tikaConfigFile, File log4jPropFile, String tikaConfigTemplateXml,
+ OpenSearchEmitter.AttachmentStrategy attachmentStrategy) {
+ String res =
+ tikaConfigTemplateXml.replace("{TIKA_CONFIG}", tikaConfigFile.getAbsolutePath())
+ .replace("{ATTACHMENT_STRATEGY}", attachmentStrategy.toString())
+ .replace("{LOG4J_PROPERTIES_FILE}", log4jPropFile.getAbsolutePath())
+ .replaceAll("\\{PATH_TO_DOCS\\}", testFileFolder.getAbsolutePath());
+
+ res = res.replace("{OPENSEARCH_CONNECTION}", openSearchEndpoint);
+
+ return res;
+
+ }
+
+ private void setupOpenSearch(GenericContainer<?> openSearchContainer) throws Exception {
+ createTestHtmlFiles("initial");
+ this.openSearch = openSearchContainer;
+ openSearchHost = openSearch.getHost();
+ openSearchPort = openSearch.getMappedPort(9200);
+ openSearchEndpoint = "https://" + openSearchHost + ":" + openSearchPort + "/" + collection;
+ HttpClientFactory httpClientFactory = new HttpClientFactory();
+ httpClientFactory.setUserName("admin");
+ httpClientFactory.setPassword("admin");
+ //attachment strategy is not used here...TODO clean this up
+ client = new OpenSearchTestClient(openSearchEndpoint,
+ httpClientFactory.build(), OpenSearchEmitter.AttachmentStrategy.SKIP);
+ }
+
+ private void createTestHtmlFiles(String bodyContent) throws Exception {
+ testFileFolder.mkdirs();
+ for (int i = 0; i < numDocs; ++i) {
+ FileUtils.writeStringToFile(new File(testFileFolder, "test-" + i + ".html"),
+ "<html><body>" + bodyContent + "</body></html>", StandardCharsets.UTF_8);
+ }
+ }
+
+
+}
diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/resources/opensearch/opensearch-mappings.json b/tika-pipes/tika-pipes-integration-tests/src/test/resources/opensearch/opensearch-mappings.json
new file mode 100644
index 0000000..4b87142
--- /dev/null
+++ b/tika-pipes/tika-pipes-integration-tests/src/test/resources/opensearch/opensearch-mappings.json
@@ -0,0 +1,16 @@
+{
+ "settings": {
+ "number_of_shards": 1
+ },
+ "mappings" : {
+ "dynamic": false,
+ "properties" : {
+ "content" : { "type" : "text"},
+ "length" : { "type" : "long"},
+ "creators" : { "type" : "text"},
+ "title" : { "type" : "text"},
+ "mime" : { "type" : "keyword"},
+ "tika_exception" : { "type" : "text"}
+ }
+ }
+}
\ No newline at end of file
diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/resources/opensearch/opensearch-parent-child-mappings.json b/tika-pipes/tika-pipes-integration-tests/src/test/resources/opensearch/opensearch-parent-child-mappings.json
new file mode 100644
index 0000000..def8438
--- /dev/null
+++ b/tika-pipes/tika-pipes-integration-tests/src/test/resources/opensearch/opensearch-parent-child-mappings.json
@@ -0,0 +1,26 @@
+{
+ "settings": {
+ "number_of_shards": 1
+ },
+ "mappings" : {
+ "dynamic": false,
+ "_routing": {
+ "required":true
+ },
+ "properties" : {
+ "content" : { "type" : "text"},
+ "length" : { "type" : "long"},
+ "creators" : { "type" : "text"},
+ "title" : { "type" : "text"},
+ "mime" : { "type" : "keyword"},
+ "tika_exception" : { "type" : "text"},
+ "relation_type":{
+ "type":"join",
+ "eager_global_ordinals":true,
+ "relations":{
+ "container":"embedded"
+ }
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/resources/opensearch/tika-config-opensearch.xml b/tika-pipes/tika-pipes-integration-tests/src/test/resources/opensearch/tika-config-opensearch.xml
new file mode 100644
index 0000000..ccb0f8f
--- /dev/null
+++ b/tika-pipes/tika-pipes-integration-tests/src/test/resources/opensearch/tika-config-opensearch.xml
@@ -0,0 +1,113 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser">
+ <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
+ <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+ <parser-exclude class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"/>
+ <parser-exclude class="org.apache.tika.parser.microsoft.OfficeParser"/>
+ </parser>
+ <parser class="org.apache.tika.parser.pdf.PDFParser">
+ <params>
+ <param name="extractActions" type="bool">true</param>
+ <param name="checkExtractAccessPermissions" type="bool">true</param>
+ </params>
+ </parser>
+ <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
+ <params>
+ <param name="includeDeletedContent" type="bool">true</param>
+ <param name="includeMoveFromContent" type="bool">true</param>
+ <param name="extractMacros" type="bool">true</param>
+ </params>
+ </parser>
+ <parser class="org.apache.tika.parser.microsoft.OfficeParser">
+ <params>
+ <param name="extractMacros" type="bool">true</param>
+ </params>
+ </parser>
+ </parsers>
+ <metadataFilters>
+ <metadataFilter class="org.apache.tika.metadata.filter.FieldNameMappingFilter">
+ <params>
+ <excludeUnmapped>true</excludeUnmapped>
+ <mappings>
+ <mapping from="X-TIKA:content" to="content"/>
+ <mapping from="Content-Length" to="length"/>
+ <mapping from="dc:creator" to="creators"/>
+ <mapping from="dc:title" to="title"/>
+ <mapping from="Content-Type" to="mime"/>
+ <mapping from="X-TIKA:EXCEPTION:container_exception" to="tika_exception"/>
+ </mappings>
+ </params>
+ </metadataFilter>
+ </metadataFilters>
+ <async>
+ <params>
+ <maxForEmitBatchBytes>10000</maxForEmitBatchBytes>
+ <emitMaxEstimatedBytes>100000</emitMaxEstimatedBytes>
+ <emitWithinMillis>60000</emitWithinMillis>
+ <numEmitters>1</numEmitters>
+ <numClients>1</numClients>
+ <tikaConfig>{TIKA_CONFIG}</tikaConfig>
+ <forkedJvmArgs>
+ <arg>-Xmx1g</arg>
+ <arg>-XX:ParallelGCThreads=2</arg>
+ <arg>-XX:+ExitOnOutOfMemoryError</arg>
+ <arg>-Dlog4j.configurationFile={LOG4J_PROPERTIES_FILE}</arg>
+ </forkedJvmArgs>
+ <timeoutMillis>60000</timeoutMillis>
+ </params>
+ </async>
+ <fetchers>
+ <fetcher class="org.apache.tika.pipes.fetcher.fs.FileSystemFetcher">
+ <params>
+ <name>fsf</name>
+ <basePath>{PATH_TO_DOCS}</basePath>
+ </params>
+ </fetcher>
+ </fetchers>
+ <emitters>
+ <emitter class="org.apache.tika.pipes.emitter.opensearch.OpenSearchEmitter">
+ <params>
+ <name>ose</name>
+ <openSearchUrl>{OPENSEARCH_CONNECTION}</openSearchUrl>
+<!-- TODO: implement this
+ <updateStrategy>{UPDATE_STRATEGY}</updateStrategy>
+ -->
+ <attachmentStrategy>{ATTACHMENT_STRATEGY}</attachmentStrategy>
+ <contentField>content</contentField>
+ <commitWithin>10</commitWithin>
+ <idField>_id</idField>
+ <connectionTimeout>10000</connectionTimeout>
+ <socketTimeout>60000</socketTimeout>
+ <userName>admin</userName>
+ <password>admin</password>
+ </params>
+ </emitter>
+ </emitters>
+ <pipesIterator class="org.apache.tika.pipes.pipesiterator.FileSystemPipesIterator">
+ <params>
+ <basePath>{PATH_TO_DOCS}</basePath>
+ <fetcherName>fsf</fetcherName>
+ <emitterName>ose</emitterName>
+ </params>
+ </pipesIterator>
+</properties>
[tika] 01/02: Merge remote-tracking branch 'origin/main' into
TIKA-3440
Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-3440
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 59cd309d5e1f18b97ff398605bca74e198ca24a2
Merge: 586a823 1b3ff12
Author: tallison <ta...@apache.org>
AuthorDate: Fri Jul 2 13:18:08 2021 -0400
Merge remote-tracking branch 'origin/main' into TIKA-3440
CHANGES.txt | 27 +-
tika-app/pom.xml | 1 +
tika-batch/pom.xml | 2 +-
.../apache/tika/batch/FileResourceConsumer.java | 4 +-
tika-bundles/tika-bundle-standard/pom.xml | 16 +-
tika-core/pom.xml | 2 +-
.../tika/language/detect/LanguageDetector.java | 20 +-
.../apache/tika/metadata/TikaCoreProperties.java | 5 +-
.../tika/parser/AbstractExternalProcessParser.java | 57 ++
.../java/org/apache/tika/pipes/PipesClient.java | 4 +-
.../org/apache/tika/pipes/fetcher/FetchKey.java | 2 +-
tika-eval/pom.xml | 2 +-
tika-eval/tika-eval-app/pom.xml | 64 +-
.../java/org/apache/tika/eval/app/TikaEvalCLI.java | 2 +-
tika-eval/tika-eval-core/pom.xml | 26 +-
tika-example/pom.xml | 2 +-
tika-langdetect/pom.xml | 2 +-
tika-parent/pom.xml | 40 +-
.../tika-parser-scientific-module/pom.xml | 5 +
.../tika-parser-sqlite3-module/pom.xml | 2 +
.../tika-parsers-ml/tika-age-recogniser/pom.xml | 2 +-
.../tika-parser-audiovideo-module/pom.xml | 6 +-
.../org/apache/tika/parser/audio/AudioParser.java | 9 +
.../apache/tika/parser/mp4/ISO6709Extractor.java | 93 ---
.../java/org/apache/tika/parser/mp4/MP4Parser.java | 412 ++++------
.../apache/tika/parser/mp4/TikaMp4BoxHandler.java | 79 ++
.../tika/parser/mp4/boxes/TikaUserDataBox.java | 278 +++++++
.../org/apache/tika/parser/mp4/MP4ParserTest.java | 147 +++-
.../tika-parser-code-module/pom.xml | 2 +-
.../tika-parser-image-module/pom.xml | 2 +-
.../apache/tika/parser/ocr/TesseractOCRParser.java | 9 +-
.../org/apache/tika/parser/pkg/PackageParser.java | 3 +-
.../tika-parsers-standard-package/pom.xml | 11 +
.../src/main/resources/META-INF/LICENSE | 902 +++++++++++++++++++++
tika-pipes/pom.xml | 2 +-
tika-pipes/tika-pipes-integration-tests/pom.xml | 4 +
tika-serialization/pom.xml | 2 +-
tika-server/pom.xml | 2 +-
tika-server/tika-server-core/pom.xml | 1 +
.../apache/tika/server/core/TikaServerProcess.java | 45 +-
.../tika/server/core/TikaServerWatchDog.java | 36 +-
tika-server/tika-server-standard/pom.xml | 1 +
42 files changed, 1911 insertions(+), 422 deletions(-)