You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nifi.apache.org by ex...@apache.org on 2022/02/22 03:47:32 UTC

[nifi] branch main updated: NIFI-9647 Added ExtractDocumentText Processor

This is an automated email from the ASF dual-hosted git repository.

exceptionfactory pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/nifi.git


The following commit(s) were added to refs/heads/main by this push:
     new 4141ed2  NIFI-9647 Added ExtractDocumentText Processor
4141ed2 is described below

commit 4141ed29ecfcd6329e3af0b7f7290a9d6aa7d8c7
Author: Mike Thomsen <mt...@apache.org>
AuthorDate: Tue Feb 1 12:34:31 2022 -0500

    NIFI-9647 Added ExtractDocumentText Processor
    
    - Based on https://github.com/tspannhw/nifi-extracttext-processor
    
    This closes #5732
    
    Signed-off-by: David Handermann <ex...@apache.org>
---
 nifi-assembly/pom.xml                              |  16 ++
 .../processors/document/ExtractDocumentText.java   |  92 ++++++++++
 .../services/org.apache.nifi.processor.Processor   |   1 +
 .../document/ExtractDocumentTextTest.java          |  88 +++++++++
 .../src/test/resources/simple.doc                  | Bin 0 -> 24064 bytes
 .../src/test/resources/simple.docx                 | Bin 0 -> 14871 bytes
 .../src/test/resources/simple.pdf                  | 198 +++++++++++++++++++++
 7 files changed, 395 insertions(+)

diff --git a/nifi-assembly/pom.xml b/nifi-assembly/pom.xml
index 2a60a44..86fd60d 100644
--- a/nifi-assembly/pom.xml
+++ b/nifi-assembly/pom.xml
@@ -955,6 +955,22 @@ language governing permissions and limitations under the License. -->
             </dependencies>
         </profile>
         <profile>
+            <id>include-media</id>
+            <!-- This profile includes the NiFi Media Bundle which is a large package that exposes Apache Tika functionality
+                through multiple processors. It is not included with the convenience binary due to its size. -->
+            <activation>
+                <activeByDefault>false</activeByDefault>
+            </activation>
+            <dependencies>
+                <dependency>
+                    <groupId>org.apache.nifi</groupId>
+                    <artifactId>nifi-media-nar</artifactId>
+                    <version>1.16.0-SNAPSHOT</version>
+                    <type>nar</type>
+                </dependency>
+            </dependencies>
+        </profile>
+        <profile>
             <id>include-rules</id>
             <!-- This profile handles includes of rules related artifacts. -->
             <activation>
diff --git a/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/main/java/org/apache/nifi/processors/document/ExtractDocumentText.java b/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/main/java/org/apache/nifi/processors/document/ExtractDocumentText.java
new file mode 100644
index 0000000..8312b6f
--- /dev/null
+++ b/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/main/java/org/apache/nifi/processors/document/ExtractDocumentText.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nifi.processors.document;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.nifi.annotation.documentation.CapabilityDescription;
+import org.apache.nifi.annotation.documentation.Tags;
+import org.apache.nifi.flowfile.FlowFile;
+import org.apache.nifi.flowfile.attributes.CoreAttributes;
+import org.apache.nifi.processor.AbstractProcessor;
+import org.apache.nifi.processor.ProcessContext;
+import org.apache.nifi.processor.ProcessSession;
+import org.apache.nifi.processor.Relationship;
+import org.apache.nifi.processor.exception.ProcessException;
+import org.apache.tika.Tika;
+
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Extracts the text content of an incoming FlowFile using the Apache Tika facade.
+ *
+ * <p>Each trigger takes one FlowFile and creates a child FlowFile that receives the extracted text,
+ * written as UTF-8. On success the child is routed to {@link #REL_EXTRACTED} with its MIME type
+ * attribute set to {@code text/plain} and the input is routed to {@link #REL_ORIGINAL}. On failure
+ * the child is removed and the input is routed to {@link #REL_FAILURE}.</p>
+ */
+@Tags({"extract", "document", "text"})
+@CapabilityDescription("Extract text contents from supported binary document formats using Apache Tika")
+public class ExtractDocumentText extends AbstractProcessor {
+    private static final String TEXT_PLAIN = "text/plain";
+
+    public static final Relationship REL_ORIGINAL = new Relationship.Builder().name("original")
+            .description("Success for original input FlowFiles").build();
+
+    public static final Relationship REL_EXTRACTED = new Relationship.Builder().name("extracted")
+            .description("Success for extracted text FlowFiles").build();
+
+    public static final Relationship REL_FAILURE = new Relationship.Builder().name("failure")
+            .description("Content extraction failed").build();
+
+    private static final Set<Relationship> RELATIONSHIPS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(REL_ORIGINAL, REL_EXTRACTED, REL_FAILURE)));
+
+    /**
+     * @return the unmodifiable set of relationships: original, extracted and failure
+     */
+    @Override
+    public Set<Relationship> getRelationships() {
+        return RELATIONSHIPS;
+    }
+
+    /**
+     * Reads one FlowFile, extracts its text with Tika into a new child FlowFile and routes both.
+     *
+     * @param context the process context (not used by this processor)
+     * @param session the session used to read, create, write and transfer FlowFiles
+     * @throws ProcessException declared by the overridden method
+     */
+    @Override
+    public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
+        final FlowFile flowFile = session.get();
+        if (flowFile == null) {
+            return;
+        }
+
+        FlowFile extracted = session.create(flowFile);
+        // All four resources are closed before the catch block runs, so the child FlowFile
+        // no longer has an open output stream when it is removed on failure.
+        try (InputStream is = session.read(flowFile);
+             Reader tikaReader = new Tika().parse(is);
+             OutputStream os = session.write(extracted);
+             // Explicit charset: the no-arg constructor would use the platform default encoding.
+             OutputStreamWriter writer = new OutputStreamWriter(os, StandardCharsets.UTF_8)) {
+            IOUtils.copy(tikaReader, writer);
+        } catch (final Exception e) {
+            // JVM Errors are intentionally not caught here and propagate to the caller.
+            getLogger().error("Extraction Failed {}", flowFile, e);
+            session.remove(extracted);
+            session.transfer(flowFile, REL_FAILURE);
+            return;
+        }
+
+        final Map<String, String> attributes = new HashMap<>();
+        attributes.put(CoreAttributes.MIME_TYPE.key(), TEXT_PLAIN);
+        extracted = session.putAllAttributes(extracted, attributes);
+        session.transfer(extracted, REL_EXTRACTED);
+        session.transfer(flowFile, REL_ORIGINAL);
+    }
+}
diff --git a/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/main/resources/META-INF/services/org.apache.nifi.processor.Processor b/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/main/resources/META-INF/services/org.apache.nifi.processor.Processor
index d5d0075..a287968 100644
--- a/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/main/resources/META-INF/services/org.apache.nifi.processor.Processor
+++ b/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/main/resources/META-INF/services/org.apache.nifi.processor.Processor
@@ -12,6 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+org.apache.nifi.processors.document.ExtractDocumentText
 org.apache.nifi.processors.image.ExtractImageMetadata
 org.apache.nifi.processors.image.ResizeImage
 org.apache.nifi.processors.media.ExtractMediaMetadata
diff --git a/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/test/java/org/apache/nifi/processors/document/ExtractDocumentTextTest.java b/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/test/java/org/apache/nifi/processors/document/ExtractDocumentTextTest.java
new file mode 100644
index 0000000..2c677dc
--- /dev/null
+++ b/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/test/java/org/apache/nifi/processors/document/ExtractDocumentTextTest.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nifi.processors.document;
+
+import org.apache.nifi.util.MockFlowFile;
+import org.apache.nifi.util.TestRunner;
+import org.apache.nifi.util.TestRunners;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Runs {@link ExtractDocumentText} against the sample documents under {@code src/test/resources}
+ * and checks routing and the beginning of the extracted text.
+ */
+public class ExtractDocumentTextTest {
+    private TestRunner testRunner;
+
+    @BeforeEach
+    public void setTestRunner() {
+        testRunner = TestRunners.newTestRunner(ExtractDocumentText.class);
+    }
+
+    @Test
+    public void testRunPdf() throws Exception {
+        assertExtractedTextStartsWith("simple.pdf", "A Simple PDF File");
+    }
+
+    @Test
+    public void testRunDoc() throws Exception {
+        assertExtractedTextStartsWith("simple.doc", "A Simple WORD DOC File");
+    }
+
+    @Test
+    public void testRunDocx() throws Exception {
+        assertExtractedTextStartsWith("simple.docx", "A Simple WORD DOCX File");
+    }
+
+    /**
+     * Enqueues the named resource, runs the processor once and verifies that nothing failed, that
+     * exactly one FlowFile reached each success relationship, and that the trimmed extracted text
+     * starts with the expected prefix.
+     *
+     * @param filename       name of the file under {@code src/test/resources}
+     * @param expectedPrefix text the trimmed extraction result must start with
+     * @throws Exception if the resource cannot be opened or closed
+     */
+    private void assertExtractedTextStartsWith(final String filename, final String expectedPrefix) throws Exception {
+        // NOTE(review): relies on enqueue(InputStream) copying the content before it returns,
+        // so the stream can be closed before run() - confirm against the TestRunner API.
+        try (FileInputStream inputStream = getFileInputStream(filename)) {
+            testRunner.enqueue(inputStream);
+        }
+        testRunner.run();
+
+        testRunner.assertTransferCount(ExtractDocumentText.REL_FAILURE, 0);
+        // Without these counts the loop below would pass vacuously when nothing was extracted.
+        testRunner.assertTransferCount(ExtractDocumentText.REL_ORIGINAL, 1);
+        testRunner.assertTransferCount(ExtractDocumentText.REL_EXTRACTED, 1);
+
+        final List<MockFlowFile> successFiles = testRunner.getFlowFilesForRelationship(ExtractDocumentText.REL_EXTRACTED);
+        for (final MockFlowFile mockFile : successFiles) {
+            final String result = new String(mockFile.toByteArray(), StandardCharsets.UTF_8);
+            final String trimmedResult = result.trim();
+            assertTrue(trimmedResult.startsWith(expectedPrefix));
+        }
+    }
+
+    private FileInputStream getFileInputStream(final String filename) throws FileNotFoundException {
+        return new FileInputStream("src/test/resources/" + filename);
+    }
+}
\ No newline at end of file
diff --git a/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/test/resources/simple.doc b/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/test/resources/simple.doc
new file mode 100644
index 0000000..826b5a9
Binary files /dev/null and b/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/test/resources/simple.doc differ
diff --git a/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/test/resources/simple.docx b/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/test/resources/simple.docx
new file mode 100644
index 0000000..5fbcbbd
Binary files /dev/null and b/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/test/resources/simple.docx differ
diff --git a/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/test/resources/simple.pdf b/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/test/resources/simple.pdf
new file mode 100644
index 0000000..dbf091d
--- /dev/null
+++ b/nifi-nar-bundles/nifi-media-bundle/nifi-media-processors/src/test/resources/simple.pdf
@@ -0,0 +1,198 @@
+%PDF-1.3
+%����
+
+1 0 obj
+<<
+/Type /Catalog
+/Outlines 2 0 R
+/Pages 3 0 R
+>>
+endobj
+
+2 0 obj
+<<
+/Type /Outlines
+/Count 0
+>>
+endobj
+
+3 0 obj
+<<
+/Type /Pages
+/Count 2
+/Kids [ 4 0 R 6 0 R ] 
+>>
+endobj
+
+4 0 obj
+<<
+/Type /Page
+/Parent 3 0 R
+/Resources <<
+/Font <<
+/F1 9 0 R 
+>>
+/ProcSet 8 0 R
+>>
+/MediaBox [0 0 612.0000 792.0000]
+/Contents 5 0 R
+>>
+endobj
+
+5 0 obj
+<< /Length 1074 >>
+stream
+2 J
+BT
+0 0 0 rg
+/F1 0027 Tf
+57.3750 722.2800 Td
+( A Simple PDF File ) Tj
+ET
+BT
+/F1 0010 Tf
+69.2500 688.6080 Td
+( This is a small demonstration .pdf file - ) Tj
+ET
+BT
+/F1 0010 Tf
+69.2500 664.7040 Td
+( just for use in the Virtual Mechanics tutorials. More text. And more ) Tj
+ET
+BT
+/F1 0010 Tf
+69.2500 652.7520 Td
+( text. And more text. And more text. And more text. ) Tj
+ET
+BT
+/F1 0010 Tf
+69.2500 628.8480 Td
+( And more text. And more text. And more text. And more text. And more ) Tj
+ET
+BT
+/F1 0010 Tf
+69.2500 616.8960 Td
+( text. And more text. Boring, zzzzz. And more text. And more text. And ) Tj
+ET
+BT
+/F1 0010 Tf
+69.2500 604.9440 Td
+( more text. And more text. And more text. And more text. And more text. ) Tj
+ET
+BT
+/F1 0010 Tf
+69.2500 592.9920 Td
+( And more text. And more text. ) Tj
+ET
+BT
+/F1 0010 Tf
+69.2500 569.0880 Td
+( And more text. And more text. And more text. And more text. And more ) Tj
+ET
+BT
+/F1 0010 Tf
+69.2500 557.1360 Td
+( text. And more text. And more text. Even more. Continued on page 2 ...) Tj
+ET
+endstream
+endobj
+
+6 0 obj
+<<
+/Type /Page
+/Parent 3 0 R
+/Resources <<
+/Font <<
+/F1 9 0 R 
+>>
+/ProcSet 8 0 R
+>>
+/MediaBox [0 0 612.0000 792.0000]
+/Contents 7 0 R
+>>
+endobj
+
+7 0 obj
+<< /Length 676 >>
+stream
+2 J
+BT
+0 0 0 rg
+/F1 0027 Tf
+57.3750 722.2800 Td
+( Simple PDF File 2 ) Tj
+ET
+BT
+/F1 0010 Tf
+69.2500 688.6080 Td
+( ...continued from page 1. Yet more text. And more text. And more text. ) Tj
+ET
+BT
+/F1 0010 Tf
+69.2500 676.6560 Td
+( And more text. And more text. And more text. And more text. And more ) Tj
+ET
+BT
+/F1 0010 Tf
+69.2500 664.7040 Td
+( text. Oh, how boring typing this stuff. But not as boring as watching ) Tj
+ET
+BT
+/F1 0010 Tf
+69.2500 652.7520 Td
+( paint dry. And more text. And more text. And more text. And more text. ) Tj
+ET
+BT
+/F1 0010 Tf
+69.2500 640.8000 Td
+( Boring.  More, a little more text. The end, and just as well. ) Tj
+ET
+endstream
+endobj
+
+8 0 obj
+[/PDF /Text]
+endobj
+
+9 0 obj
+<<
+/Type /Font
+/Subtype /Type1
+/Name /F1
+/BaseFont /Helvetica
+/Encoding /WinAnsiEncoding
+>>
+endobj
+
+10 0 obj
+<<
+/Creator (Rave \(http://www.nevrona.com/rave\))
+/Producer (Nevrona Designs)
+/CreationDate (D:20060301072826)
+>>
+endobj
+
+xref
+0 11
+0000000000 65535 f
+0000000019 00000 n
+0000000093 00000 n
+0000000147 00000 n
+0000000222 00000 n
+0000000390 00000 n
+0000001522 00000 n
+0000001690 00000 n
+0000002423 00000 n
+0000002456 00000 n
+0000002574 00000 n
+
+trailer
+<<
+/Size 11
+/Root 1 0 R
+/Info 10 0 R
+>>
+
+startxref
+2714
+%%EOF