You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nifi.apache.org by pv...@apache.org on 2022/02/09 10:12:01 UTC

[nifi] branch main updated: NIFI-9660 Upgraded Apache Tika to 2.3.0

This is an automated email from the ASF dual-hosted git repository.

pvillard pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/nifi.git


The following commit(s) were added to refs/heads/main by this push:
     new e584d3c  NIFI-9660 Upgraded Apache Tika to 2.3.0
e584d3c is described below

commit e584d3cf0466a8cef565a8f973ca5074dd48e6ad
Author: exceptionfactory <ex...@apache.org>
AuthorDate: Tue Feb 8 16:31:48 2022 -0600

    NIFI-9660 Upgraded Apache Tika to 2.3.0
    
    - Upgraded tika-core from 1.26 to 2.3.0 in the framework and standard bundles
    - Replaced tika-parsers 1.27 with tika-core and tika-parsers-standard-package 2.3.0 in nifi-media-processors
    - Updated Tika metadata property references
    
    Signed-off-by: Pierre Villard <pi...@gmail.com>
    
    This closes #5754.
---
 .../apache/nifi/web/ContentViewerController.java   |  3 +-
 nifi-nar-bundles/nifi-framework-bundle/pom.xml     |  2 +-
 .../nifi-media-processors/pom.xml                  |  9 +++--
 .../processors/media/TestExtractMediaMetadata.java | 41 +++++-----------------
 .../nifi/processors/standard/IdentifyMimeType.java |  4 +--
 nifi-nar-bundles/nifi-standard-bundle/pom.xml      |  2 +-
 6 files changed, 22 insertions(+), 39 deletions(-)

diff --git a/nifi-nar-bundles/nifi-framework-bundle/nifi-framework/nifi-web/nifi-web-content-viewer/src/main/java/org/apache/nifi/web/ContentViewerController.java b/nifi-nar-bundles/nifi-framework-bundle/nifi-framework/nifi-web/nifi-web-content-viewer/src/main/java/org/apache/nifi/web/ContentViewerController.java
index b4638c9..a66ea4a 100644
--- a/nifi-nar-bundles/nifi-framework-bundle/nifi-framework/nifi-web/nifi-web-content-viewer/src/main/java/org/apache/nifi/web/ContentViewerController.java
+++ b/nifi-nar-bundles/nifi-framework-bundle/nifi-framework/nifi-web/nifi-web-content-viewer/src/main/java/org/apache/nifi/web/ContentViewerController.java
@@ -37,6 +37,7 @@ import org.apache.nifi.web.ViewableContent.DisplayMode;
 import org.apache.tika.detect.DefaultDetector;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -167,7 +168,7 @@ public class ContentViewerController extends HttpServlet {
 
                 // provide a hint based on the filename
                 final Metadata metadata = new Metadata();
-                metadata.set(Metadata.RESOURCE_NAME_KEY, downloadableContent.getFilename());
+                metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, downloadableContent.getFilename());
 
                 // Get mime type
                 final MediaType mediatype = detector.detect(tikaStream, metadata);
diff --git a/nifi-nar-bundles/nifi-framework-bundle/pom.xml b/nifi-nar-bundles/nifi-framework-bundle/pom.xml
index c2a7735..228c9da 100644
--- a/nifi-nar-bundles/nifi-framework-bundle/pom.xml
+++ b/nifi-nar-bundles/nifi-framework-bundle/pom.xml
@@ -223,7 +223,7 @@
             <dependency>
                 <groupId>org.apache.tika</groupId>
                 <artifactId>tika-core</artifactId>
-                <version>1.26</version>
+                <version>2.3.0</version>
             </dependency>
             <dependency>
                 <groupId>commons-codec</groupId>
diff --git a/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/pom.xml b/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/pom.xml
index fbb713b..9234309 100644
--- a/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/pom.xml
+++ b/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/pom.xml
@@ -48,8 +48,13 @@
         </dependency>
         <dependency>
             <groupId>org.apache.tika</groupId>
-            <artifactId>tika-parsers</artifactId>
-            <version>1.27</version>
+            <artifactId>tika-core</artifactId>
+            <version>2.3.0</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.tika</groupId>
+            <artifactId>tika-parsers-standard-package</artifactId>
+            <version>2.3.0</version>
             <exclusions>
                 <exclusion>
                     <groupId>com.fasterxml.jackson.core</groupId>
diff --git a/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/test/java/org/apache/nifi/processors/media/TestExtractMediaMetadata.java b/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/test/java/org/apache/nifi/processors/media/TestExtractMediaMetadata.java
index 54030f7..f50cfeb 100644
--- a/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/test/java/org/apache/nifi/processors/media/TestExtractMediaMetadata.java
+++ b/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/test/java/org/apache/nifi/processors/media/TestExtractMediaMetadata.java
@@ -28,6 +28,7 @@ import org.junit.Test;
 
 import java.io.File;
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -77,15 +78,13 @@ public class TestExtractMediaMetadata {
         flowFile0.assertAttributeEquals("filename", "test1.txt");
         flowFile0.assertAttributeExists("txt.Content-Type");
         assertTrue(flowFile0.getAttribute("txt.Content-Type").startsWith("text/plain"));
-        flowFile0.assertAttributeExists("txt.X-Parsed-By");
-        assertTrue(flowFile0.getAttribute("txt.X-Parsed-By").contains("org.apache.tika.parser.DefaultParser"));
         flowFile0.assertAttributeExists("txt.Content-Encoding");
         flowFile0.assertAttributeEquals("txt.Content-Encoding", "ISO-8859-1");
-        flowFile0.assertContentEquals("test1".getBytes("UTF-8"));
+        flowFile0.assertContentEquals("test1".getBytes(StandardCharsets.UTF_8));
     }
 
     @Test
-    public void testProvenance() throws IOException {
+    public void testProvenance() {
         final TestRunner runner = TestRunners.newTestRunner(new ExtractMediaMetadata());
         runner.setProperty(ExtractMediaMetadata.METADATA_KEY_FILTER, "");
         runner.setProperty(ExtractMediaMetadata.METADATA_KEY_PREFIX, "txt.");
@@ -109,7 +108,7 @@ public class TestExtractMediaMetadata {
     }
 
     @Test
-    public void testNoFlowFile() throws IOException {
+    public void testNoFlowFile() {
         final TestRunner runner = TestRunners.newTestRunner(new ExtractMediaMetadata());
         runner.setProperty(ExtractMediaMetadata.METADATA_KEY_FILTER, "");
         runner.setProperty(ExtractMediaMetadata.METADATA_KEY_PREFIX, "txt.");
@@ -140,10 +139,8 @@ public class TestExtractMediaMetadata {
         flowFile0.assertAttributeEquals("filename", "textFile.txt");
         flowFile0.assertAttributeExists("txt.Content-Type");
         assertTrue(flowFile0.getAttribute("txt.Content-Type").startsWith("text/plain"));
-        flowFile0.assertAttributeExists("txt.X-Parsed-By");
-        assertTrue(flowFile0.getAttribute("txt.X-Parsed-By").contains("org.apache.tika.parser.DefaultParser"));
         flowFile0.assertAttributeExists("txt.Content-Encoding");
-        flowFile0.assertContentEquals("This file is not an image and is used for testing the image metadata extractor.".getBytes("UTF-8"));
+        flowFile0.assertContentEquals("This file is not an image and is used for testing the image metadata extractor.".getBytes(StandardCharsets.UTF_8));
     }
 
     @Test
@@ -166,8 +163,6 @@ public class TestExtractMediaMetadata {
         flowFile0.assertAttributeEquals("filename", "textFileBig.txt");
         flowFile0.assertAttributeExists("txt.Content-Type");
         assertTrue(flowFile0.getAttribute("txt.Content-Type").startsWith("text/plain"));
-        flowFile0.assertAttributeExists("txt.X-Parsed-By");
-        assertTrue(flowFile0.getAttribute("txt.X-Parsed-By").contains("org.apache.tika.parser.DefaultParser"));
         flowFile0.assertAttributeExists("txt.Content-Encoding");
         assertEquals(flowFile0.getSize(), textFile.length());
     }
@@ -189,8 +184,6 @@ public class TestExtractMediaMetadata {
         MockFlowFile flowFile0 = successFiles.get(0);
         flowFile0.assertAttributeExists("filename");
         flowFile0.assertAttributeEquals("filename", "textFile.txt");
-        flowFile0.assertAttributeExists("txt.X-Parsed-By");
-        assertTrue(flowFile0.getAttribute("txt.X-Parsed-By").contains("org.apache.tika.parser.DefaultParser"));
         flowFile0.assertAttributeNotExists("txt.Content-Encoding");
     }
 
@@ -208,7 +201,6 @@ public class TestExtractMediaMetadata {
         List<MockFlowFile> successFiles = runner.getFlowFilesForRelationship(ExtractMediaMetadata.SUCCESS);
         MockFlowFile flowFile0 = successFiles.get(0);
         flowFile0.assertAttributeExists("filename");
-        flowFile0.assertAttributeExists("X-Parsed-By");
 
         runner = TestRunners.newTestRunner(new ExtractMediaMetadata());
         runner.setProperty(ExtractMediaMetadata.METADATA_KEY_PREFIX, "txt.");
@@ -223,7 +215,6 @@ public class TestExtractMediaMetadata {
         successFiles = runner.getFlowFilesForRelationship(ExtractMediaMetadata.SUCCESS);
         flowFile0 = successFiles.get(0);
         flowFile0.assertAttributeExists("filename");
-        flowFile0.assertAttributeExists("txt.X-Parsed-By");
     }
 
     @Test
@@ -241,8 +232,8 @@ public class TestExtractMediaMetadata {
         List<MockFlowFile> successFiles0 = runner.getFlowFilesForRelationship(ExtractMediaMetadata.SUCCESS);
         MockFlowFile flowFile0 = successFiles0.get(0);
         int fileAttrCount0 = 0;
-        for (Map.Entry attr : flowFile0.getAttributes().entrySet()) {
-            if (attr.getKey().toString().startsWith("txt.")) {
+        for (Map.Entry<String, String> attr : flowFile0.getAttributes().entrySet()) {
+            if (attr.getKey().startsWith("txt.")) {
                 fileAttrCount0++;
             }
         }
@@ -262,8 +253,8 @@ public class TestExtractMediaMetadata {
         List<MockFlowFile> successFiles = runner.getFlowFilesForRelationship(ExtractMediaMetadata.SUCCESS);
         MockFlowFile flowFile1 = successFiles.get(0);
         int fileAttrCount1 = 0;
-        for (Map.Entry attr : flowFile1.getAttributes().entrySet()) {
-            if (attr.getKey().toString().startsWith("txt.")) {
+        for (Map.Entry<String, String> attr : flowFile1.getAttributes().entrySet()) {
+            if (attr.getKey().startsWith("txt.")) {
                 fileAttrCount1++;
             }
         }
@@ -288,9 +279,6 @@ public class TestExtractMediaMetadata {
         flowFile0.assertAttributeEquals("filename", "16color-10x10.bmp");
         flowFile0.assertAttributeExists("bmp.Content-Type");
         flowFile0.assertAttributeEquals("bmp.Content-Type", "image/bmp");
-        flowFile0.assertAttributeExists("bmp.X-Parsed-By");
-        assertTrue(flowFile0.getAttribute("bmp.X-Parsed-By").contains("org.apache.tika.parser.DefaultParser"));
-        // assertTrue(flowFile0.getAttribute("bmp.X-Parsed-By").contains("org.apache.tika.parser.image.ImageParser"));
         flowFile0.assertAttributeExists("bmp.height");
         flowFile0.assertAttributeEquals("bmp.height", "10");
         flowFile0.assertAttributeExists("bmp.width");
@@ -336,9 +324,6 @@ public class TestExtractMediaMetadata {
         flowFile0.assertAttributeEquals("filename", "testWAV.wav");
         flowFile0.assertAttributeExists("wav.Content-Type");
         assertTrue(flowFile0.getAttribute("wav.Content-Type").startsWith("audio/vnd.wave"));
-        flowFile0.assertAttributeExists("wav.X-Parsed-By");
-        assertTrue(flowFile0.getAttribute("wav.X-Parsed-By").contains("org.apache.tika.parser.DefaultParser"));
-        assertTrue(flowFile0.getAttribute("wav.X-Parsed-By").contains("org.apache.tika.parser.audio.AudioParser"));
         flowFile0.assertAttributeExists("wav.encoding");
         flowFile0.assertAttributeEquals("wav.encoding", "PCM_SIGNED");
     }
@@ -362,9 +347,6 @@ public class TestExtractMediaMetadata {
         flowFile0.assertAttributeEquals("filename", "testVORBIS.ogg");
         flowFile0.assertAttributeExists("ogg.Content-Type");
         assertTrue(flowFile0.getAttribute("ogg.Content-Type").startsWith("audio/vorbis"));
-        flowFile0.assertAttributeExists("ogg.X-Parsed-By");
-        assertTrue(flowFile0.getAttribute("ogg.X-Parsed-By").contains("org.apache.tika.parser.DefaultParser"));
-        assertTrue(flowFile0.getAttribute("ogg.X-Parsed-By").contains("org.gagravarr.tika.VorbisParser"));
     }
 
     @Test
@@ -407,11 +389,6 @@ public class TestExtractMediaMetadata {
         flowFile0.assertAttributeEquals("filename", "testMP3id3v1.mp3");
         flowFile0.assertAttributeExists("mp3.Content-Type");
         assertTrue(flowFile0.getAttribute("mp3.Content-Type").startsWith("audio/mpeg"));
-        flowFile0.assertAttributeExists("mp3.X-Parsed-By");
-        assertTrue(flowFile0.getAttribute("mp3.X-Parsed-By").contains("org.apache.tika.parser.DefaultParser"));
-        assertTrue(flowFile0.getAttribute("mp3.X-Parsed-By").contains("org.apache.tika.parser.mp3.Mp3Parser"));
-        flowFile0.assertAttributeExists("mp3.title");
-        flowFile0.assertAttributeEquals("mp3.title", "Test Title");
     }
 
 }
\ No newline at end of file
diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/IdentifyMimeType.java b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/IdentifyMimeType.java
index 90b4c41..5e68ce7 100644
--- a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/IdentifyMimeType.java
+++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/IdentifyMimeType.java
@@ -59,7 +59,7 @@ import org.apache.tika.config.TikaConfig;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.mime.MimeType;
 import org.apache.tika.mime.MimeTypes;
@@ -218,7 +218,7 @@ public class IdentifyMimeType extends AbstractProcessor {
                     Metadata metadata = new Metadata();
 
                     if (filename != null && context.getProperty(USE_FILENAME_IN_DETECTION).asBoolean()) {
-                        metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, filename);
+                        metadata.add(TikaCoreProperties.RESOURCE_NAME_KEY, filename);
                     }
                     // Get mime type
                     MediaType mediatype = detector.detect(tikaStream, metadata);
diff --git a/nifi-nar-bundles/nifi-standard-bundle/pom.xml b/nifi-nar-bundles/nifi-standard-bundle/pom.xml
index 5cb7e10..4ae4d7e 100644
--- a/nifi-nar-bundles/nifi-standard-bundle/pom.xml
+++ b/nifi-nar-bundles/nifi-standard-bundle/pom.xml
@@ -258,7 +258,7 @@
             <dependency>
                 <groupId>org.apache.tika</groupId>
                 <artifactId>tika-core</artifactId>
-                <version>1.26</version>
+                <version>2.3.0</version>
             </dependency>
             <dependency>
                 <groupId>io.github.rburgst</groupId>