You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nifi.apache.org by ex...@apache.org on 2022/02/22 03:47:32 UTC
[nifi] branch main updated: NIFI-9647 Added ExtractDocumentText Processor
This is an automated email from the ASF dual-hosted git repository.
exceptionfactory pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/nifi.git
The following commit(s) were added to refs/heads/main by this push:
new 4141ed2 NIFI-9647 Added ExtractDocumentText Processor
4141ed2 is described below
commit 4141ed29ecfcd6329e3af0b7f7290a9d6aa7d8c7
Author: Mike Thomsen <mt...@apache.org>
AuthorDate: Tue Feb 1 12:34:31 2022 -0500
NIFI-9647 Added ExtractDocumentText Processor
- Based on https://github.com/tspannhw/nifi-extracttext-processor
This closes #5732
Signed-off-by: David Handermann <ex...@apache.org>
---
nifi-assembly/pom.xml | 16 ++
.../processors/document/ExtractDocumentText.java | 92 ++++++++++
.../services/org.apache.nifi.processor.Processor | 1 +
.../document/ExtractDocumentTextTest.java | 88 +++++++++
.../src/test/resources/simple.doc | Bin 0 -> 24064 bytes
.../src/test/resources/simple.docx | Bin 0 -> 14871 bytes
.../src/test/resources/simple.pdf | 198 +++++++++++++++++++++
7 files changed, 395 insertions(+)
diff --git a/nifi-assembly/pom.xml b/nifi-assembly/pom.xml
index 2a60a44..86fd60d 100644
--- a/nifi-assembly/pom.xml
+++ b/nifi-assembly/pom.xml
@@ -955,6 +955,22 @@ language governing permissions and limitations under the License. -->
</dependencies>
</profile>
<profile>
+ <id>include-media</id>
+ <!-- This profile includes the NiFi Media Bundle which is a large package that exposes Apache Tika functionality
+ through multiple processors. It is not included with the convenience binary due to its size. -->
+ <activation>
+ <activeByDefault>false</activeByDefault>
+ </activation>
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.nifi</groupId>
+ <artifactId>nifi-media-nar</artifactId>
+ <version>1.16.0-SNAPSHOT</version>
+ <type>nar</type>
+ </dependency>
+ </dependencies>
+ </profile>
+ <profile>
<id>include-rules</id>
<!-- This profile handles includes of rules related artifacts. -->
<activation>
diff --git a/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/main/java/org/apache/nifi/processors/document/ExtractDocumentText.java b/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/main/java/org/apache/nifi/processors/document/ExtractDocumentText.java
new file mode 100644
index 0000000..8312b6f
--- /dev/null
+++ b/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/main/java/org/apache/nifi/processors/document/ExtractDocumentText.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nifi.processors.document;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.nifi.annotation.documentation.CapabilityDescription;
+import org.apache.nifi.annotation.documentation.Tags;
+import org.apache.nifi.flowfile.FlowFile;
+import org.apache.nifi.flowfile.attributes.CoreAttributes;
+import org.apache.nifi.processor.AbstractProcessor;
+import org.apache.nifi.processor.ProcessContext;
+import org.apache.nifi.processor.ProcessSession;
+import org.apache.nifi.processor.Relationship;
+import org.apache.nifi.processor.exception.ProcessException;
+import org.apache.tika.Tika;
+
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Processor that extracts the text content of a binary document with Apache Tika.
+ *
+ * <p>For every incoming FlowFile a child FlowFile is created and filled with the characters that
+ * {@link Tika#parse(InputStream)} produces from the parent's content, encoded as UTF-8. On success
+ * the child is routed to {@link #REL_EXTRACTED} with the {@link CoreAttributes#MIME_TYPE} attribute
+ * set to {@code text/plain}, and the parent is routed to {@link #REL_ORIGINAL}. On failure the child
+ * is removed and the parent is routed to {@link #REL_FAILURE}.</p>
+ */
+@Tags({"extract", "document", "text"})
+@CapabilityDescription("Extract text contents from supported binary document formats using Apache Tika")
+public class ExtractDocumentText extends AbstractProcessor {
+    private static final String TEXT_PLAIN = "text/plain";
+
+    /** Receives the unmodified input FlowFile after a successful extraction. */
+    public static final Relationship REL_ORIGINAL = new Relationship.Builder().name("original")
+            .description("Success for original input FlowFiles").build();
+
+    /** Receives the newly created FlowFile that holds the extracted text. */
+    public static final Relationship REL_EXTRACTED = new Relationship.Builder().name("extracted")
+            .description("Success for extracted text FlowFiles").build();
+
+    /** Receives the input FlowFile when reading, parsing or writing fails. */
+    public static final Relationship REL_FAILURE = new Relationship.Builder().name("failure")
+            .description("Content extraction failed").build();
+
+    private static final Set<Relationship> RELATIONSHIPS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(REL_ORIGINAL, REL_EXTRACTED, REL_FAILURE)));
+
+    /**
+     * @return the unmodifiable set of relationships declared by this processor
+     */
+    @Override
+    public Set<Relationship> getRelationships() {
+        return RELATIONSHIPS;
+    }
+
+    /**
+     * Takes one FlowFile from the session, extracts its text into a child FlowFile and routes both.
+     *
+     * @param context the process context (not used by this processor)
+     * @param session the session used to read the input, create the child and transfer both FlowFiles
+     * @throws ProcessException declared by the overridden method
+     */
+    @Override
+    public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
+        final FlowFile flowFile = session.get();
+        if (flowFile == null) {
+            return;
+        }
+
+        FlowFile extracted = session.create(flowFile);
+        try (InputStream is = session.read(flowFile);
+             Reader tikaReader = new Tika().parse(is);
+             OutputStream os = session.write(extracted);
+             // Encode explicitly so the output does not depend on the JVM default charset.
+             OutputStreamWriter writer = new OutputStreamWriter(os, StandardCharsets.UTF_8)) {
+            IOUtils.copy(tikaReader, writer);
+        } catch (final Throwable t) {
+            // Throwable is caught so that Errors raised while parsing are routed to failure as well.
+            // All resources above are already closed when this handler runs.
+            getLogger().error("Extraction Failed {}", flowFile, t);
+            session.remove(extracted);
+            session.transfer(flowFile, REL_FAILURE);
+            return;
+        }
+
+        final Map<String, String> attributes = new HashMap<>();
+        attributes.put(CoreAttributes.MIME_TYPE.key(), TEXT_PLAIN);
+        extracted = session.putAllAttributes(extracted, attributes);
+        session.transfer(extracted, REL_EXTRACTED);
+        session.transfer(flowFile, REL_ORIGINAL);
+    }
+}
diff --git a/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/main/resources/META-INF/services/org.apache.nifi.processor.Processor b/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/main/resources/META-INF/services/org.apache.nifi.processor.Processor
index d5d0075..a287968 100644
--- a/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/main/resources/META-INF/services/org.apache.nifi.processor.Processor
+++ b/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/main/resources/META-INF/services/org.apache.nifi.processor.Processor
@@ -12,6 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+org.apache.nifi.processors.document.ExtractDocumentText
org.apache.nifi.processors.image.ExtractImageMetadata
org.apache.nifi.processors.image.ResizeImage
org.apache.nifi.processors.media.ExtractMediaMetadata
diff --git a/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/test/java/org/apache/nifi/processors/document/ExtractDocumentTextTest.java b/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/test/java/org/apache/nifi/processors/document/ExtractDocumentTextTest.java
new file mode 100644
index 0000000..2c677dc
--- /dev/null
+++ b/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/test/java/org/apache/nifi/processors/document/ExtractDocumentTextTest.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nifi.processors.document;
+
+import org.apache.nifi.util.MockFlowFile;
+import org.apache.nifi.util.TestRunner;
+import org.apache.nifi.util.TestRunners;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Runs {@link ExtractDocumentText} against the sample PDF, DOC and DOCX files under
+ * {@code src/test/resources} and checks the routing and the start of the extracted text.
+ */
+public class ExtractDocumentTextTest {
+    private TestRunner testRunner;
+
+    @BeforeEach
+    public void setTestRunner() {
+        testRunner = TestRunners.newTestRunner(ExtractDocumentText.class);
+    }
+
+    @Test
+    public void testRunPdf() throws Exception {
+        assertExtractedTextStartsWith("simple.pdf", "A Simple PDF File");
+    }
+
+    @Test
+    public void testRunDoc() throws Exception {
+        assertExtractedTextStartsWith("simple.doc", "A Simple WORD DOC File");
+    }
+
+    @Test
+    public void testRunDocx() throws Exception {
+        assertExtractedTextStartsWith("simple.docx", "A Simple WORD DOCX File");
+    }
+
+    /**
+     * Enqueues the named resource, runs the processor once and verifies the result.
+     *
+     * @param filename name of the file under {@code src/test/resources}
+     * @param expectedPrefix text the trimmed extracted content must start with
+     * @throws Exception if the resource cannot be opened or closed
+     */
+    private void assertExtractedTextStartsWith(final String filename, final String expectedPrefix) throws Exception {
+        // Close the file once its content has been handed to the runner.
+        try (FileInputStream inputStream = getFileInputStream(filename)) {
+            testRunner.enqueue(inputStream);
+        }
+        testRunner.run();
+
+        // Assert exact counts so a run that produces no extracted FlowFile cannot pass vacuously.
+        testRunner.assertTransferCount(ExtractDocumentText.REL_FAILURE, 0);
+        testRunner.assertTransferCount(ExtractDocumentText.REL_ORIGINAL, 1);
+        testRunner.assertTransferCount(ExtractDocumentText.REL_EXTRACTED, 1);
+
+        final List<MockFlowFile> extractedFiles = testRunner.getFlowFilesForRelationship(ExtractDocumentText.REL_EXTRACTED);
+        final MockFlowFile extracted = extractedFiles.get(0);
+        extracted.assertAttributeEquals("mime.type", "text/plain");
+
+        final String trimmedResult = new String(extracted.toByteArray(), StandardCharsets.UTF_8).trim();
+        assertTrue(trimmedResult.startsWith(expectedPrefix),
+                "Extracted text of " + filename + " does not start with: " + expectedPrefix);
+    }
+
+    private FileInputStream getFileInputStream(final String filename) throws FileNotFoundException {
+        return new FileInputStream("src/test/resources/" + filename);
+    }
+}
\ No newline at end of file
diff --git a/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/test/resources/simple.doc b/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/test/resources/simple.doc
new file mode 100644
index 0000000..826b5a9
Binary files /dev/null and b/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/test/resources/simple.doc differ
diff --git a/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/test/resources/simple.docx b/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/test/resources/simple.docx
new file mode 100644
index 0000000..5fbcbbd
Binary files /dev/null and b/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/test/resources/simple.docx differ
diff --git a/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/test/resources/simple.pdf b/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/test/resources/simple.pdf
new file mode 100644
index 0000000..dbf091d
--- /dev/null
+++ b/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/test/resources/simple.pdf
@@ -0,0 +1,198 @@
+%PDF-1.3
+%����
+
+1 0 obj
+<<
+/Type /Catalog
+/Outlines 2 0 R
+/Pages 3 0 R
+>>
+endobj
+
+2 0 obj
+<<
+/Type /Outlines
+/Count 0
+>>
+endobj
+
+3 0 obj
+<<
+/Type /Pages
+/Count 2
+/Kids [ 4 0 R 6 0 R ]
+>>
+endobj
+
+4 0 obj
+<<
+/Type /Page
+/Parent 3 0 R
+/Resources <<
+/Font <<
+/F1 9 0 R
+>>
+/ProcSet 8 0 R
+>>
+/MediaBox [0 0 612.0000 792.0000]
+/Contents 5 0 R
+>>
+endobj
+
+5 0 obj
+<< /Length 1074 >>
+stream
+2 J
+BT
+0 0 0 rg
+/F1 0027 Tf
+57.3750 722.2800 Td
+( A Simple PDF File ) Tj
+ET
+BT
+/F1 0010 Tf
+69.2500 688.6080 Td
+( This is a small demonstration .pdf file - ) Tj
+ET
+BT
+/F1 0010 Tf
+69.2500 664.7040 Td
+( just for use in the Virtual Mechanics tutorials. More text. And more ) Tj
+ET
+BT
+/F1 0010 Tf
+69.2500 652.7520 Td
+( text. And more text. And more text. And more text. ) Tj
+ET
+BT
+/F1 0010 Tf
+69.2500 628.8480 Td
+( And more text. And more text. And more text. And more text. And more ) Tj
+ET
+BT
+/F1 0010 Tf
+69.2500 616.8960 Td
+( text. And more text. Boring, zzzzz. And more text. And more text. And ) Tj
+ET
+BT
+/F1 0010 Tf
+69.2500 604.9440 Td
+( more text. And more text. And more text. And more text. And more text. ) Tj
+ET
+BT
+/F1 0010 Tf
+69.2500 592.9920 Td
+( And more text. And more text. ) Tj
+ET
+BT
+/F1 0010 Tf
+69.2500 569.0880 Td
+( And more text. And more text. And more text. And more text. And more ) Tj
+ET
+BT
+/F1 0010 Tf
+69.2500 557.1360 Td
+( text. And more text. And more text. Even more. Continued on page 2 ...) Tj
+ET
+endstream
+endobj
+
+6 0 obj
+<<
+/Type /Page
+/Parent 3 0 R
+/Resources <<
+/Font <<
+/F1 9 0 R
+>>
+/ProcSet 8 0 R
+>>
+/MediaBox [0 0 612.0000 792.0000]
+/Contents 7 0 R
+>>
+endobj
+
+7 0 obj
+<< /Length 676 >>
+stream
+2 J
+BT
+0 0 0 rg
+/F1 0027 Tf
+57.3750 722.2800 Td
+( Simple PDF File 2 ) Tj
+ET
+BT
+/F1 0010 Tf
+69.2500 688.6080 Td
+( ...continued from page 1. Yet more text. And more text. And more text. ) Tj
+ET
+BT
+/F1 0010 Tf
+69.2500 676.6560 Td
+( And more text. And more text. And more text. And more text. And more ) Tj
+ET
+BT
+/F1 0010 Tf
+69.2500 664.7040 Td
+( text. Oh, how boring typing this stuff. But not as boring as watching ) Tj
+ET
+BT
+/F1 0010 Tf
+69.2500 652.7520 Td
+( paint dry. And more text. And more text. And more text. And more text. ) Tj
+ET
+BT
+/F1 0010 Tf
+69.2500 640.8000 Td
+( Boring. More, a little more text. The end, and just as well. ) Tj
+ET
+endstream
+endobj
+
+8 0 obj
+[/PDF /Text]
+endobj
+
+9 0 obj
+<<
+/Type /Font
+/Subtype /Type1
+/Name /F1
+/BaseFont /Helvetica
+/Encoding /WinAnsiEncoding
+>>
+endobj
+
+10 0 obj
+<<
+/Creator (Rave \(http://www.nevrona.com/rave\))
+/Producer (Nevrona Designs)
+/CreationDate (D:20060301072826)
+>>
+endobj
+
+xref
+0 11
+0000000000 65535 f
+0000000019 00000 n
+0000000093 00000 n
+0000000147 00000 n
+0000000222 00000 n
+0000000390 00000 n
+0000001522 00000 n
+0000001690 00000 n
+0000002423 00000 n
+0000002456 00000 n
+0000002574 00000 n
+
+trailer
+<<
+/Size 11
+/Root 1 0 R
+/Info 10 0 R
+>>
+
+startxref
+2714
+%%EOF