You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/07/29 18:48:06 UTC

[tika] branch main updated (51b538b -> 4e151a7)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from 51b538b  TIKA-3507 -- add PipesReporter
     new 4bb614a  Make reason for forked process failure more evident.
     new 4e151a7  TIKA-3508 and update CHANGES.txt

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 CHANGES.txt                                                      | 9 +++++++--
 tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java   | 1 +
 .../tika/pipes/opensearch/tests/TikaPipesOpenSearchTest.java     | 6 +++---
 .../apache/tika/pipes/emitter/opensearch/OpenSearchClient.java   | 2 +-
 tika-pipes/tika-emitters/tika-emitter-s3/pom.xml                 | 1 -
 tika-pipes/tika-emitters/tika-emitter-solr/pom.xml               | 1 +
 .../java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java     | 4 ++--
 tika-pipes/tika-pipes-iterators/tika-pipes-iterator-s3/pom.xml   | 1 +
 8 files changed, 16 insertions(+), 9 deletions(-)

[tika] 02/02: TIKA-3508 and update CHANGES.txt

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 4e151a73ec7ce1f5a8e08f4cf8d011117615d02c
Author: tallison <ta...@apache.org>
AuthorDate: Thu Jul 29 14:47:45 2021 -0400

    TIKA-3508 and update CHANGES.txt
---
 CHANGES.txt                                                      | 9 +++++++--
 .../tika/pipes/opensearch/tests/TikaPipesOpenSearchTest.java     | 6 +++---
 .../apache/tika/pipes/emitter/opensearch/OpenSearchClient.java   | 2 +-
 tika-pipes/tika-emitters/tika-emitter-s3/pom.xml                 | 1 -
 tika-pipes/tika-emitters/tika-emitter-solr/pom.xml               | 1 +
 .../java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java     | 4 ++--
 tika-pipes/tika-pipes-iterators/tika-pipes-iterator-s3/pom.xml   | 1 +
 7 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index fe0ea2a..6898dee 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,12 @@
 Release 2.0.1 - ???
 
+   * Fix serialization of embedded docs in OpenSearch emitter
+     and fix embedded documents not being indexed in some use
+     cases in the Solr emitter (TIKA-3490).
+
+   * Add pipesClientId system property to PipesServer so that each
+     forked process can log to its own logger (TIKA-3480).
+
    * Add DateNormalizingMetadataFilter to let users ensure that all dates
      emitted to Solr/OpenSearch are in UTC. Users can configure which
      timezone they'd like to use in cases where the file format does
@@ -9,8 +16,6 @@ Release 2.0.1 - ???
      the SKIP or CONCATENATE attachment strategy, modify the
      parseMode in the pipesiterators or in the FetchEmitTuple (TIKA-3494).
 
-   * Fix serialization of embedded docs in OpenSearch emitter (TIKA-3490).
-
 Release 2.0.0 - 07/07/2021
 
    * Cleanup of fetcher integration with tika-server.
diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/TikaPipesOpenSearchTest.java b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/TikaPipesOpenSearchTest.java
index 7037f5d..986ce7a 100644
--- a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/TikaPipesOpenSearchTest.java
+++ b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/TikaPipesOpenSearchTest.java
@@ -148,7 +148,7 @@ public class TikaPipesOpenSearchTest {
         JsonNode source = results.getJson().get("hits").get("hits").get(0).get("_source");
 
         Matcher m = Pattern.compile("\\Atest_recursive_embedded" +
-                ".docx_[0-9a-f]{8}-[0-9a-f]{4}-" +
+                ".docx-[0-9a-f]{8}-[0-9a-f]{4}-" +
                 "[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\\Z").matcher(
                 results.getJson().get("hits").get("hits").get(0).get("_id").asText()
         );
@@ -211,11 +211,11 @@ public class TikaPipesOpenSearchTest {
         JsonNode source = results.getJson().get("hits").get("hits").get(0).get("_source");
 
         Matcher m = Pattern.compile("\\Atest_recursive_embedded" +
-                ".docx_[0-9a-f]{8}-[0-9a-f]{4}-" +
+                ".docx-[0-9a-f]{8}-[0-9a-f]{4}-" +
                 "[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\\Z").matcher(
                 results.getJson().get("hits").get("hits").get(0).get("_id").asText()
         );
-        assertTrue("test_recursive_embedded.docx_$guid", m.find());
+        assertTrue("test_recursive_embedded.docx-$guid", m.find());
 
         assertNull("test_recursive_embedded.docx",
                 results.getJson().get("hits").get("hits").get(0).get("_routing"));
diff --git a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java
index c3ec807..b30a648 100644
--- a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java
+++ b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java
@@ -70,7 +70,7 @@ public class OpenSearchClient {
         for (Metadata metadata : metadataList) {
             StringBuilder id = new StringBuilder(emitKey);
             if (i > 0) {
-                id.append("_").append(UUID.randomUUID());
+                id.append("-").append(UUID.randomUUID());
             }
             String indexJson = getBulkIndexJson(id.toString(), routing);
             sb.append(indexJson).append("\n");
diff --git a/tika-pipes/tika-emitters/tika-emitter-s3/pom.xml b/tika-pipes/tika-emitters/tika-emitter-s3/pom.xml
index d34112e..e51d37d 100644
--- a/tika-pipes/tika-emitters/tika-emitter-s3/pom.xml
+++ b/tika-pipes/tika-emitters/tika-emitter-s3/pom.xml
@@ -85,7 +85,6 @@
       <groupId>org.apache.logging.log4j</groupId>
       <artifactId>log4j-slf4j-impl</artifactId>
     </dependency>
-
   </dependencies>
 
   <build>
diff --git a/tika-pipes/tika-emitters/tika-emitter-solr/pom.xml b/tika-pipes/tika-emitters/tika-emitter-solr/pom.xml
index d613e57..142cd94 100644
--- a/tika-pipes/tika-emitters/tika-emitter-solr/pom.xml
+++ b/tika-pipes/tika-emitters/tika-emitter-solr/pom.xml
@@ -48,6 +48,7 @@
     <dependency>
       <groupId>org.apache.logging.log4j</groupId>
       <artifactId>log4j-slf4j-impl</artifactId>
+      <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>org.eclipse.jetty</groupId>
diff --git a/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java b/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java
index 12ec5a5..8e1615b 100644
--- a/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java
+++ b/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java
@@ -101,7 +101,7 @@ public class SolrEmitter extends AbstractEmitter implements Initializable {
                 SolrInputDocument childSolrInputDocument = new SolrInputDocument();
                 Metadata m = metadataList.get(i);
                 childSolrInputDocument
-                        .setField(idField, emitKey + "_" + UUID.randomUUID().toString());
+                        .setField(idField, emitKey + "-" + UUID.randomUUID().toString());
                 addMetadataToSolrInputDocument(m, childSolrInputDocument, updateStrategy);
                 solrInputDocument.addChildDocument(childSolrInputDocument);
             }
@@ -113,7 +113,7 @@ public class SolrEmitter extends AbstractEmitter implements Initializable {
                 SolrInputDocument childSolrInputDocument = new SolrInputDocument();
                 Metadata m = metadataList.get(i);
                 childSolrInputDocument.setField(idField,
-                        solrInputDocument.get(idField) + "-" + UUID.randomUUID().toString());
+                        solrInputDocument.get(idField).getValue() + "-" + UUID.randomUUID().toString());
                 addMetadataToSolrInputDocument(m, childSolrInputDocument, updateStrategy);
                 docsToUpdate.add(childSolrInputDocument);
             }
diff --git a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-s3/pom.xml b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-s3/pom.xml
index 8c40259..65917fb 100644
--- a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-s3/pom.xml
+++ b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-s3/pom.xml
@@ -80,6 +80,7 @@
       <groupId>org.apache.logging.log4j</groupId>
       <artifactId>log4j-slf4j-impl</artifactId>
       <version>${log4j2.version}</version>
+      <scope>provided</scope>
     </dependency>
   </dependencies>
   <build>

[tika] 01/02: Make reason for forked process failure more evident.

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 4bb614aa96ddbe55eacbc73289b66d2775746d4a
Author: tallison <ta...@apache.org>
AuthorDate: Thu Jul 29 14:33:59 2021 -0400

    Make reason for forked process failure more evident.
---
 tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
index 7b14fe8..74ae1a3 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
@@ -191,6 +191,7 @@ public class PipesServer implements Runnable {
         try {
             initializeParser();
         } catch (Throwable t) {
+            t.printStackTrace();
             LOG.error("couldn't initialize parser", t);
             try {
                 output.writeByte(STATUS.FAILED_TO_START.getByte());