You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nifi.apache.org by pv...@apache.org on 2022/02/09 10:12:01 UTC
[nifi] branch main updated: NIFI-9660 Upgraded Apache Tika to 2.3.0
This is an automated email from the ASF dual-hosted git repository.
pvillard pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/nifi.git
The following commit(s) were added to refs/heads/main by this push:
new e584d3c NIFI-9660 Upgraded Apache Tika to 2.3.0
e584d3c is described below
commit e584d3cf0466a8cef565a8f973ca5074dd48e6ad
Author: exceptionfactory <ex...@apache.org>
AuthorDate: Tue Feb 8 16:31:48 2022 -0600
NIFI-9660 Upgraded Apache Tika to 2.3.0
- Upgraded tika-core from 1.27 to 2.3.0
- Upgraded tika-parsers to tika-parsers-standard-package in nifi-media-processors
- Updated Tika metadata property references
Signed-off-by: Pierre Villard <pi...@gmail.com>
This closes #5754.
---
.../apache/nifi/web/ContentViewerController.java | 3 +-
nifi-nar-bundles/nifi-framework-bundle/pom.xml | 2 +-
.../nifi-media-processors/pom.xml | 9 +++--
.../processors/media/TestExtractMediaMetadata.java | 41 +++++-----------------
.../nifi/processors/standard/IdentifyMimeType.java | 4 +--
nifi-nar-bundles/nifi-standard-bundle/pom.xml | 2 +-
6 files changed, 22 insertions(+), 39 deletions(-)
diff --git a/nifi-nar-bundles/nifi-framework-bundle/nifi-framework/nifi-web/nifi-web-content-viewer/src/main/java/org/apache/nifi/web/ContentViewerController.java b/nifi-nar-bundles/nifi-framework-bundle/nifi-framework/nifi-web/nifi-web-content-viewer/src/main/java/org/apache/nifi/web/ContentViewerController.java
index b4638c9..a66ea4a 100644
--- a/nifi-nar-bundles/nifi-framework-bundle/nifi-framework/nifi-web/nifi-web-content-viewer/src/main/java/org/apache/nifi/web/ContentViewerController.java
+++ b/nifi-nar-bundles/nifi-framework-bundle/nifi-framework/nifi-web/nifi-web-content-viewer/src/main/java/org/apache/nifi/web/ContentViewerController.java
@@ -37,6 +37,7 @@ import org.apache.nifi.web.ViewableContent.DisplayMode;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -167,7 +168,7 @@ public class ContentViewerController extends HttpServlet {
// provide a hint based on the filename
final Metadata metadata = new Metadata();
- metadata.set(Metadata.RESOURCE_NAME_KEY, downloadableContent.getFilename());
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, downloadableContent.getFilename());
// Get mime type
final MediaType mediatype = detector.detect(tikaStream, metadata);
diff --git a/nifi-nar-bundles/nifi-framework-bundle/pom.xml b/nifi-nar-bundles/nifi-framework-bundle/pom.xml
index c2a7735..228c9da 100644
--- a/nifi-nar-bundles/nifi-framework-bundle/pom.xml
+++ b/nifi-nar-bundles/nifi-framework-bundle/pom.xml
@@ -223,7 +223,7 @@
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
- <version>1.26</version>
+ <version>2.3.0</version>
</dependency>
<dependency>
<groupId>commons-codec</groupId>
diff --git a/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/pom.xml b/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/pom.xml
index fbb713b..9234309 100644
--- a/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/pom.xml
+++ b/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/pom.xml
@@ -48,8 +48,13 @@
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
- <artifactId>tika-parsers</artifactId>
- <version>1.27</version>
+ <artifactId>tika-core</artifactId>
+ <version>2.3.0</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parsers-standard-package</artifactId>
+ <version>2.3.0</version>
<exclusions>
<exclusion>
<groupId>com.fasterxml.jackson.core</groupId>
diff --git a/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/test/java/org/apache/nifi/processors/media/TestExtractMediaMetadata.java b/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/test/java/org/apache/nifi/processors/media/TestExtractMediaMetadata.java
index 54030f7..f50cfeb 100644
--- a/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/test/java/org/apache/nifi/processors/media/TestExtractMediaMetadata.java
+++ b/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/test/java/org/apache/nifi/processors/media/TestExtractMediaMetadata.java
@@ -28,6 +28,7 @@ import org.junit.Test;
import java.io.File;
import java.io.IOException;
+import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -77,15 +78,13 @@ public class TestExtractMediaMetadata {
flowFile0.assertAttributeEquals("filename", "test1.txt");
flowFile0.assertAttributeExists("txt.Content-Type");
assertTrue(flowFile0.getAttribute("txt.Content-Type").startsWith("text/plain"));
- flowFile0.assertAttributeExists("txt.X-Parsed-By");
- assertTrue(flowFile0.getAttribute("txt.X-Parsed-By").contains("org.apache.tika.parser.DefaultParser"));
flowFile0.assertAttributeExists("txt.Content-Encoding");
flowFile0.assertAttributeEquals("txt.Content-Encoding", "ISO-8859-1");
- flowFile0.assertContentEquals("test1".getBytes("UTF-8"));
+ flowFile0.assertContentEquals("test1".getBytes(StandardCharsets.UTF_8));
}
@Test
- public void testProvenance() throws IOException {
+ public void testProvenance() {
final TestRunner runner = TestRunners.newTestRunner(new ExtractMediaMetadata());
runner.setProperty(ExtractMediaMetadata.METADATA_KEY_FILTER, "");
runner.setProperty(ExtractMediaMetadata.METADATA_KEY_PREFIX, "txt.");
@@ -109,7 +108,7 @@ public class TestExtractMediaMetadata {
}
@Test
- public void testNoFlowFile() throws IOException {
+ public void testNoFlowFile() {
final TestRunner runner = TestRunners.newTestRunner(new ExtractMediaMetadata());
runner.setProperty(ExtractMediaMetadata.METADATA_KEY_FILTER, "");
runner.setProperty(ExtractMediaMetadata.METADATA_KEY_PREFIX, "txt.");
@@ -140,10 +139,8 @@ public class TestExtractMediaMetadata {
flowFile0.assertAttributeEquals("filename", "textFile.txt");
flowFile0.assertAttributeExists("txt.Content-Type");
assertTrue(flowFile0.getAttribute("txt.Content-Type").startsWith("text/plain"));
- flowFile0.assertAttributeExists("txt.X-Parsed-By");
- assertTrue(flowFile0.getAttribute("txt.X-Parsed-By").contains("org.apache.tika.parser.DefaultParser"));
flowFile0.assertAttributeExists("txt.Content-Encoding");
- flowFile0.assertContentEquals("This file is not an image and is used for testing the image metadata extractor.".getBytes("UTF-8"));
+ flowFile0.assertContentEquals("This file is not an image and is used for testing the image metadata extractor.".getBytes(StandardCharsets.UTF_8));
}
@Test
@@ -166,8 +163,6 @@ public class TestExtractMediaMetadata {
flowFile0.assertAttributeEquals("filename", "textFileBig.txt");
flowFile0.assertAttributeExists("txt.Content-Type");
assertTrue(flowFile0.getAttribute("txt.Content-Type").startsWith("text/plain"));
- flowFile0.assertAttributeExists("txt.X-Parsed-By");
- assertTrue(flowFile0.getAttribute("txt.X-Parsed-By").contains("org.apache.tika.parser.DefaultParser"));
flowFile0.assertAttributeExists("txt.Content-Encoding");
assertEquals(flowFile0.getSize(), textFile.length());
}
@@ -189,8 +184,6 @@ public class TestExtractMediaMetadata {
MockFlowFile flowFile0 = successFiles.get(0);
flowFile0.assertAttributeExists("filename");
flowFile0.assertAttributeEquals("filename", "textFile.txt");
- flowFile0.assertAttributeExists("txt.X-Parsed-By");
- assertTrue(flowFile0.getAttribute("txt.X-Parsed-By").contains("org.apache.tika.parser.DefaultParser"));
flowFile0.assertAttributeNotExists("txt.Content-Encoding");
}
@@ -208,7 +201,6 @@ public class TestExtractMediaMetadata {
List<MockFlowFile> successFiles = runner.getFlowFilesForRelationship(ExtractMediaMetadata.SUCCESS);
MockFlowFile flowFile0 = successFiles.get(0);
flowFile0.assertAttributeExists("filename");
- flowFile0.assertAttributeExists("X-Parsed-By");
runner = TestRunners.newTestRunner(new ExtractMediaMetadata());
runner.setProperty(ExtractMediaMetadata.METADATA_KEY_PREFIX, "txt.");
@@ -223,7 +215,6 @@ public class TestExtractMediaMetadata {
successFiles = runner.getFlowFilesForRelationship(ExtractMediaMetadata.SUCCESS);
flowFile0 = successFiles.get(0);
flowFile0.assertAttributeExists("filename");
- flowFile0.assertAttributeExists("txt.X-Parsed-By");
}
@Test
@@ -241,8 +232,8 @@ public class TestExtractMediaMetadata {
List<MockFlowFile> successFiles0 = runner.getFlowFilesForRelationship(ExtractMediaMetadata.SUCCESS);
MockFlowFile flowFile0 = successFiles0.get(0);
int fileAttrCount0 = 0;
- for (Map.Entry attr : flowFile0.getAttributes().entrySet()) {
- if (attr.getKey().toString().startsWith("txt.")) {
+ for (Map.Entry<String, String> attr : flowFile0.getAttributes().entrySet()) {
+ if (attr.getKey().startsWith("txt.")) {
fileAttrCount0++;
}
}
@@ -262,8 +253,8 @@ public class TestExtractMediaMetadata {
List<MockFlowFile> successFiles = runner.getFlowFilesForRelationship(ExtractMediaMetadata.SUCCESS);
MockFlowFile flowFile1 = successFiles.get(0);
int fileAttrCount1 = 0;
- for (Map.Entry attr : flowFile1.getAttributes().entrySet()) {
- if (attr.getKey().toString().startsWith("txt.")) {
+ for (Map.Entry<String, String> attr : flowFile1.getAttributes().entrySet()) {
+ if (attr.getKey().startsWith("txt.")) {
fileAttrCount1++;
}
}
@@ -288,9 +279,6 @@ public class TestExtractMediaMetadata {
flowFile0.assertAttributeEquals("filename", "16color-10x10.bmp");
flowFile0.assertAttributeExists("bmp.Content-Type");
flowFile0.assertAttributeEquals("bmp.Content-Type", "image/bmp");
- flowFile0.assertAttributeExists("bmp.X-Parsed-By");
- assertTrue(flowFile0.getAttribute("bmp.X-Parsed-By").contains("org.apache.tika.parser.DefaultParser"));
- // assertTrue(flowFile0.getAttribute("bmp.X-Parsed-By").contains("org.apache.tika.parser.image.ImageParser"));
flowFile0.assertAttributeExists("bmp.height");
flowFile0.assertAttributeEquals("bmp.height", "10");
flowFile0.assertAttributeExists("bmp.width");
@@ -336,9 +324,6 @@ public class TestExtractMediaMetadata {
flowFile0.assertAttributeEquals("filename", "testWAV.wav");
flowFile0.assertAttributeExists("wav.Content-Type");
assertTrue(flowFile0.getAttribute("wav.Content-Type").startsWith("audio/vnd.wave"));
- flowFile0.assertAttributeExists("wav.X-Parsed-By");
- assertTrue(flowFile0.getAttribute("wav.X-Parsed-By").contains("org.apache.tika.parser.DefaultParser"));
- assertTrue(flowFile0.getAttribute("wav.X-Parsed-By").contains("org.apache.tika.parser.audio.AudioParser"));
flowFile0.assertAttributeExists("wav.encoding");
flowFile0.assertAttributeEquals("wav.encoding", "PCM_SIGNED");
}
@@ -362,9 +347,6 @@ public class TestExtractMediaMetadata {
flowFile0.assertAttributeEquals("filename", "testVORBIS.ogg");
flowFile0.assertAttributeExists("ogg.Content-Type");
assertTrue(flowFile0.getAttribute("ogg.Content-Type").startsWith("audio/vorbis"));
- flowFile0.assertAttributeExists("ogg.X-Parsed-By");
- assertTrue(flowFile0.getAttribute("ogg.X-Parsed-By").contains("org.apache.tika.parser.DefaultParser"));
- assertTrue(flowFile0.getAttribute("ogg.X-Parsed-By").contains("org.gagravarr.tika.VorbisParser"));
}
@Test
@@ -407,11 +389,6 @@ public class TestExtractMediaMetadata {
flowFile0.assertAttributeEquals("filename", "testMP3id3v1.mp3");
flowFile0.assertAttributeExists("mp3.Content-Type");
assertTrue(flowFile0.getAttribute("mp3.Content-Type").startsWith("audio/mpeg"));
- flowFile0.assertAttributeExists("mp3.X-Parsed-By");
- assertTrue(flowFile0.getAttribute("mp3.X-Parsed-By").contains("org.apache.tika.parser.DefaultParser"));
- assertTrue(flowFile0.getAttribute("mp3.X-Parsed-By").contains("org.apache.tika.parser.mp3.Mp3Parser"));
- flowFile0.assertAttributeExists("mp3.title");
- flowFile0.assertAttributeEquals("mp3.title", "Test Title");
}
}
\ No newline at end of file
diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/IdentifyMimeType.java b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/IdentifyMimeType.java
index 90b4c41..5e68ce7 100644
--- a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/IdentifyMimeType.java
+++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/IdentifyMimeType.java
@@ -59,7 +59,7 @@ import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypes;
@@ -218,7 +218,7 @@ public class IdentifyMimeType extends AbstractProcessor {
Metadata metadata = new Metadata();
if (filename != null && context.getProperty(USE_FILENAME_IN_DETECTION).asBoolean()) {
- metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, filename);
+ metadata.add(TikaCoreProperties.RESOURCE_NAME_KEY, filename);
}
// Get mime type
MediaType mediatype = detector.detect(tikaStream, metadata);
diff --git a/nifi-nar-bundles/nifi-standard-bundle/pom.xml b/nifi-nar-bundles/nifi-standard-bundle/pom.xml
index 5cb7e10..4ae4d7e 100644
--- a/nifi-nar-bundles/nifi-standard-bundle/pom.xml
+++ b/nifi-nar-bundles/nifi-standard-bundle/pom.xml
@@ -258,7 +258,7 @@
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
- <version>1.26</version>
+ <version>2.3.0</version>
</dependency>
<dependency>
<groupId>io.github.rburgst</groupId>