You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nifi.apache.org by ma...@apache.org on 2017/07/13 14:49:40 UTC

nifi git commit: NIFI-4112: Fix ConvertExcelToCSV to handle empty cells.

Repository: nifi
Updated Branches:
  refs/heads/master 8acee0239 -> 5c48655e6


NIFI-4112: Fix ConvertExcelToCSV to handle empty cells.

Signed-off-by: Matt Burgess <ma...@apache.org>

This closes #1973


Project: http://git-wip-us.apache.org/repos/asf/nifi/repo
Commit: http://git-wip-us.apache.org/repos/asf/nifi/commit/5c48655e
Tree: http://git-wip-us.apache.org/repos/asf/nifi/tree/5c48655e
Diff: http://git-wip-us.apache.org/repos/asf/nifi/diff/5c48655e

Branch: refs/heads/master
Commit: 5c48655e69d1e1d9745a02099357f6b8b3d9d874
Parents: 8acee02
Author: Koji Kawamura <ij...@apache.org>
Authored: Tue Jul 4 16:30:19 2017 +0900
Committer: Matt Burgess <ma...@apache.org>
Committed: Thu Jul 13 10:40:01 2017 -0400

----------------------------------------------------------------------
 .../nifi-poi-bundle/nifi-poi-processors/pom.xml |  13 +++
 .../poi/ConvertExcelToCSVProcessor.java         |  89 ++++++++++++++---
 .../additionalDetails.html                      |  97 +++++++++++++++++++
 .../poi/ConvertExcelToCSVProcessorTest.java     |  36 +++++++
 .../src/test/resources/with-blank-cells.csv     |   8 ++
 .../src/test/resources/with-blank-cells.xlsx    | Bin 0 -> 8489 bytes
 6 files changed, 227 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nifi/blob/5c48655e/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/pom.xml
----------------------------------------------------------------------
diff --git a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/pom.xml b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/pom.xml
index 85b38ac..d6db9c7 100644
--- a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/pom.xml
+++ b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/pom.xml
@@ -29,6 +29,19 @@
     <artifactId>nifi-poi-processors</artifactId>
     <packaging>jar</packaging>
 
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.rat</groupId>
+                <artifactId>apache-rat-plugin</artifactId>
+                <configuration>
+                    <excludes combine.children="append">
+                        <exclude>src/test/resources/with-blank-cells.csv</exclude>
+                    </excludes>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
     <dependencies>
         <!-- https://mvnrepository.com/artifact/xerces/xerces -->
         <dependency>

http://git-wip-us.apache.org/repos/asf/nifi/blob/5c48655e/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessor.java
----------------------------------------------------------------------
diff --git a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessor.java b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessor.java
index b881c69..6d8274b 100644
--- a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessor.java
+++ b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessor.java
@@ -24,6 +24,8 @@ import java.util.Collections;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.apache.commons.io.FilenameUtils;
 import org.apache.commons.lang3.StringUtils;
@@ -78,6 +80,7 @@ public class ConvertExcelToCSVProcessor
     public static final String SOURCE_FILE_NAME = "sourcefilename";
     private static final String SAX_CELL_REF = "c";
     private static final String SAX_CELL_TYPE = "t";
+    private static final String SAX_CELL_ADDRESS = "r";
     private static final String SAX_CELL_STRING = "s";
     private static final String SAX_CELL_CONTENT_REF = "v";
     private static final String SAX_ROW_REF = "row";
@@ -85,6 +88,7 @@ public class ConvertExcelToCSVProcessor
     private static final String DESIRED_SHEETS_DELIMITER = ",";
     private static final String UNKNOWN_SHEET_NAME = "UNKNOWN";
     private static final String SAX_PARSER = "org.apache.xerces.parsers.SAXParser";
+    private static final Pattern CELL_ADDRESS_REGEX = Pattern.compile("^([a-zA-Z]+)([\\d]+)$");
 
     public static final PropertyDescriptor DESIRED_SHEETS = new PropertyDescriptor
             .Builder().name("extract-sheets")
@@ -279,6 +283,27 @@ public class ConvertExcelToCSVProcessor
         }
     }
 
+    static Integer columnToIndex(String col) {
+        int length = col.length();
+        int accumulator = 0;
+        for (int i = length; i > 0; i--) {
+            char c = col.charAt(i - 1);
+            int x = ((int) c) - 64;
+            accumulator += x * Math.pow(26, length - i);
+        }
+        // Make it to start with 0.
+        return accumulator - 1;
+    }
+
+    private static class CellAddress {
+        final int row;
+        final int col;
+
+        private CellAddress(int row, int col) {
+            this.row = row;
+            this.col = col;
+        }
+    }
 
     /**
      * Extracts every row from an Excel Sheet and generates a corresponding JSONObject whose key is the Excel CellAddress and value
@@ -290,6 +315,10 @@ public class ConvertExcelToCSVProcessor
         private SharedStringsTable sst;
         private String currentContent;
         private boolean nextIsString;
+        private CellAddress firstCellAddress;
+        private CellAddress firstRowLastCellAddress;
+        private CellAddress previousCellAddress;
+        private CellAddress nextCellAddress;
         private OutputStream outputStream;
         private boolean firstColInRow;
         long rowCount;
@@ -306,18 +335,35 @@ public class ConvertExcelToCSVProcessor
             this.outputStream = outputStream;
         }
 
+
         public void startElement(String uri, String localName, String name,
                 Attributes attributes) throws SAXException {
 
             if (name.equals(SAX_CELL_REF)) {
                 String cellType = attributes.getValue(SAX_CELL_TYPE);
-                if(cellType != null && cellType.equals(SAX_CELL_STRING)) {
+                // Analyze cell address.
+                Matcher cellAddressMatcher = CELL_ADDRESS_REGEX.matcher(attributes.getValue(SAX_CELL_ADDRESS));
+                if (cellAddressMatcher.matches()) {
+                    String col = cellAddressMatcher.group(1);
+                    String row = cellAddressMatcher.group(2);
+                    nextCellAddress = new CellAddress(Integer.parseInt(row), columnToIndex(col));
+
+                    if (firstCellAddress == null) {
+                        firstCellAddress = nextCellAddress;
+                    }
+                }
+                if (cellType != null && cellType.equals(SAX_CELL_STRING)) {
                     nextIsString = true;
                 } else {
                     nextIsString = false;
                 }
             } else if (name.equals(SAX_ROW_REF)) {
+                if (firstRowLastCellAddress == null) {
+                    firstRowLastCellAddress = previousCellAddress;
+                }
                 firstColInRow = true;
+                previousCellAddress = null;
+                nextCellAddress = null;
             } else if (name.equals(SAX_SHEET_NAME_REF)) {
                 sheetName = attributes.getValue(0);
             }
@@ -325,6 +371,16 @@ public class ConvertExcelToCSVProcessor
             currentContent = "";
         }
 
+        private void fillEmptyColumns(int nextColumn) throws IOException {
+            final CellAddress previousCell = previousCellAddress != null ? previousCellAddress : firstCellAddress;
+            if (previousCell != null) {
+                for (int i = 0; i < (nextColumn - previousCell.col); i++) {
+                    // Fill columns.
+                    outputStream.write(",".getBytes());
+                }
+            }
+        }
+
         public void endElement(String uri, String localName, String name)
                 throws SAXException {
 
@@ -334,22 +390,20 @@ public class ConvertExcelToCSVProcessor
                 nextIsString = false;
             }
 
-            if (name.equals(SAX_CELL_CONTENT_REF)) {
-                if (firstColInRow) {
+            if (name.equals(SAX_CELL_CONTENT_REF)
+                    // Limit scanning from the first column, and up to the last column.
+                    && (firstCellAddress == null || firstCellAddress.col <= nextCellAddress.col)
+                    && (firstRowLastCellAddress == null || nextCellAddress.col <= firstRowLastCellAddress.col)) {
+                try {
+                    // A cell is found.
+                    fillEmptyColumns(nextCellAddress.col);
                     firstColInRow = false;
-                    try {
-                        outputStream.write(currentContent.getBytes());
-                    } catch (IOException e) {
-                        getLogger().error("IO error encountered while writing content of parsed cell " +
-                                "value from sheet {}", new Object[]{getSheetName()}, e);
-                    }
-                } else {
-                    try {
-                        outputStream.write(("," + currentContent).getBytes());
-                    } catch (IOException e) {
-                        getLogger().error("IO error encountered while writing content of parsed cell " +
-                                "value from sheet {}", new Object[]{getSheetName()}, e);
-                    }
+                    outputStream.write(currentContent.getBytes());
+                    // Keep previously found cell address.
+                    previousCellAddress = nextCellAddress;
+                } catch (IOException e) {
+                    getLogger().error("IO error encountered while writing content of parsed cell " +
+                            "value from sheet {}", new Object[]{getSheetName()}, e);
                 }
             }
 
@@ -357,6 +411,9 @@ public class ConvertExcelToCSVProcessor
                 //If this is the first row and the end of the row element has been encountered then that means no columns were present.
                 if (!firstColInRow) {
                     try {
+                        if (firstRowLastCellAddress != null) {
+                            fillEmptyColumns(firstRowLastCellAddress.col);
+                        }
                         rowCount++;
                         outputStream.write("\n".getBytes());
                     } catch (IOException e) {

http://git-wip-us.apache.org/repos/asf/nifi/blob/5c48655e/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/resources/docs/org.apache.nifi.processors.poi.ConvertExcelToCSVProcessor/additionalDetails.html
----------------------------------------------------------------------
diff --git a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/resources/docs/org.apache.nifi.processors.poi.ConvertExcelToCSVProcessor/additionalDetails.html b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/resources/docs/org.apache.nifi.processors.poi.ConvertExcelToCSVProcessor/additionalDetails.html
new file mode 100644
index 0000000..fbefa08
--- /dev/null
+++ b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/resources/docs/org.apache.nifi.processors.poi.ConvertExcelToCSVProcessor/additionalDetails.html
@@ -0,0 +1,97 @@
+<!DOCTYPE html>
+<html lang="en">
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+      http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<head>
+    <meta charset="utf-8" />
+    <title>ConvertExcelToCSVProcessor</title>
+    <style>
+table {
+    border-collapse: collapse;
+}
+
+table, th, td {
+    border: 1px solid #ccc;
+}
+
+td.r {
+    text-align: right;
+}
+
+td {
+    width: 50px;
+    padding: 5px;
+}
+    </style>
+    <link rel="stylesheet" href="../../../../../css/component-usage.css" type="text/css" />
+</head>
+
+<body>
+<h2>How it extracts CSV data from a sheet</h2>
+<p>
+    ConvertExcelToCSVProcessor extracts CSV data with following rules:
+</p>
+<ul>
+    <li>Find the fist cell which has a value in it (the FirstCell).</li>
+    <li>Scan cells in the first row, starting from the FirstCell,
+        until it reaches to a cell after which no cell with a value can not be found in the row (the FirstRowLastCell).</li>
+    <li>Process the 2nd row and later, from the column of FirstCell to the column of FirstRowLastCell.</li>
+    <li>If a row does not have any cell that has a value, then the row is ignored.</li>
+</ul>
+
+<p>
+    As an example, the sheet shown below will be:
+</p>
+
+<table>
+    <tbody>
+    <tr><th>row          </th><th>A</th><th>B</th><th>C</th><th>D</th><th>E</th><th>F</th><th>G</th></tr>
+    <tr><td class="r">  1</td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr>
+    <tr><td class="r">  2</td><td> </td><td> </td><td>x</td><td>y</td><td>z</td><td> </td><td> </td></tr>
+    <tr><td class="r">  3</td><td> </td><td> </td><td>1</td><td> </td><td> </td><td> </td><td> </td></tr>
+    <tr><td class="r">  4</td><td>2</td><td> </td><td> </td><td>3</td><td> </td><td> </td><td> </td></tr>
+    <tr><td class="r">  5</td><td> </td><td> </td><td> </td><td> </td><td>4</td><td> </td><td> </td></tr>
+    <tr><td class="r">  6</td><td> </td><td> </td><td>5</td><td>6</td><td>7</td><td> </td><td> </td></tr>
+    <tr><td class="r">  7</td><td> </td><td> </td><td> </td><td> </td><td> </td><td>8</td><td> </td></tr>
+    <tr><td class="r">  8</td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr>
+    <tr><td class="r">  9</td><td> </td><td> </td><td> </td><td> </td><td>9</td><td> </td><td> </td></tr>
+    <tr><td class="r"> 10</td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr>
+    <tr><td class="r"> 11</td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr>
+    </tbody>
+</table>
+
+<p>
+    converted to following CSV:
+</p>
+
+<pre>
+x,y,z
+1,,
+,3,
+,,4
+5,6,7
+,,9
+</pre>
+
+<ul>
+    <li>C2(x) is the FirstCell, and E2(z) is the FirstRowLastCell.</li>
+    <li>A4(2) is ignored because it is out of range. So is F7(8).</li>
+    <li>Row 7 and 8 are ignored because those do not have a valid cell.</li>
+    <li>It is important to have a header row as shown in the example to define data area,
+        especially when a sheet includes empty cells.</li>
+</ul>
+
+</body>
+</html>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nifi/blob/5c48655e/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessorTest.java
----------------------------------------------------------------------
diff --git a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessorTest.java b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessorTest.java
index 1972fbb..1df2568 100644
--- a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessorTest.java
+++ b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessorTest.java
@@ -16,9 +16,11 @@
  */
 package org.apache.nifi.processors.poi;
 
+import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 
 import java.io.File;
+import java.nio.charset.StandardCharsets;
 import java.util.List;
 
 import org.apache.nifi.flowfile.attributes.CoreAttributes;
@@ -40,6 +42,16 @@ public class ConvertExcelToCSVProcessorTest {
     }
 
     @Test
+    public void testColToIndex() {
+        assertEquals(Integer.valueOf(0), ConvertExcelToCSVProcessor.columnToIndex("A"));
+        assertEquals(Integer.valueOf(1), ConvertExcelToCSVProcessor.columnToIndex("B"));
+        assertEquals(Integer.valueOf(25), ConvertExcelToCSVProcessor.columnToIndex("Z"));
+        assertEquals(Integer.valueOf(29), ConvertExcelToCSVProcessor.columnToIndex("AD"));
+        assertEquals(Integer.valueOf(239), ConvertExcelToCSVProcessor.columnToIndex("IF"));
+        assertEquals(Integer.valueOf(16383), ConvertExcelToCSVProcessor.columnToIndex("XFD"));
+    }
+
+    @Test
     public void testMultipleSheetsGeneratesMultipleFlowFiles() throws Exception {
 
         testRunner.enqueue(new File("src/test/resources/TwoSheets.xlsx").toPath());
@@ -150,6 +162,30 @@ public class ConvertExcelToCSVProcessorTest {
     }
 
     /**
+     * Validates that a sheet contains blank cells can be converted to a CSV without missing columns.
+     *
+     * @throws Exception
+     *  Any exception thrown during execution.
+     */
+    @Test
+    public void testProcessASheetWithBlankCells() throws Exception {
+
+        testRunner.setProperty(ConvertExcelToCSVProcessor.DESIRED_SHEETS, "Sheet1");
+        testRunner.enqueue(new File("src/test/resources/with-blank-cells.xlsx").toPath());
+        testRunner.run();
+
+        testRunner.assertTransferCount(ConvertExcelToCSVProcessor.SUCCESS, 1);
+        testRunner.assertTransferCount(ConvertExcelToCSVProcessor.ORIGINAL, 1);
+        testRunner.assertTransferCount(ConvertExcelToCSVProcessor.FAILURE, 0);
+
+        MockFlowFile ff = testRunner.getFlowFilesForRelationship(ConvertExcelToCSVProcessor.SUCCESS).get(0);
+        Long l = new Long(ff.getAttribute(ConvertExcelToCSVProcessor.ROW_NUM));
+        assertTrue(l == 8l);
+        ff.isContentEqual("test", StandardCharsets.UTF_8);
+        ff.assertContentEquals(new File("src/test/resources/with-blank-cells.csv"));
+    }
+
+    /**
      * Tests for graceful handling and error messaging of unsupported .XLS files.
      */
     @Test

http://git-wip-us.apache.org/repos/asf/nifi/blob/5c48655e/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/resources/with-blank-cells.csv
----------------------------------------------------------------------
diff --git a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/resources/with-blank-cells.csv b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/resources/with-blank-cells.csv
new file mode 100644
index 0000000..ff3f706
--- /dev/null
+++ b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/resources/with-blank-cells.csv
@@ -0,0 +1,8 @@
+A,B,C,D
+A1,,,
+,B2,C2,
+,,C3,
+,,C4,D4
+A5,,C5,D5
+A6,B6,,D6
+A7,B7,C7,D7

http://git-wip-us.apache.org/repos/asf/nifi/blob/5c48655e/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/resources/with-blank-cells.xlsx
----------------------------------------------------------------------
diff --git a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/resources/with-blank-cells.xlsx b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/resources/with-blank-cells.xlsx
new file mode 100644
index 0000000..a948246
Binary files /dev/null and b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/resources/with-blank-cells.xlsx differ