You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nifi.apache.org by ma...@apache.org on 2017/07/13 14:49:40 UTC
nifi git commit: NIFI-4112: Fix ConvertExcelToCSV to handle empty
cells.
Repository: nifi
Updated Branches:
refs/heads/master 8acee0239 -> 5c48655e6
NIFI-4112: Fix ConvertExcelToCSV to handle empty cells.
Signed-off-by: Matt Burgess <ma...@apache.org>
This closes #1973
Project: http://git-wip-us.apache.org/repos/asf/nifi/repo
Commit: http://git-wip-us.apache.org/repos/asf/nifi/commit/5c48655e
Tree: http://git-wip-us.apache.org/repos/asf/nifi/tree/5c48655e
Diff: http://git-wip-us.apache.org/repos/asf/nifi/diff/5c48655e
Branch: refs/heads/master
Commit: 5c48655e69d1e1d9745a02099357f6b8b3d9d874
Parents: 8acee02
Author: Koji Kawamura <ij...@apache.org>
Authored: Tue Jul 4 16:30:19 2017 +0900
Committer: Matt Burgess <ma...@apache.org>
Committed: Thu Jul 13 10:40:01 2017 -0400
----------------------------------------------------------------------
.../nifi-poi-bundle/nifi-poi-processors/pom.xml | 13 +++
.../poi/ConvertExcelToCSVProcessor.java | 89 ++++++++++++++---
.../additionalDetails.html | 97 +++++++++++++++++++
.../poi/ConvertExcelToCSVProcessorTest.java | 36 +++++++
.../src/test/resources/with-blank-cells.csv | 8 ++
.../src/test/resources/with-blank-cells.xlsx | Bin 0 -> 8489 bytes
6 files changed, 227 insertions(+), 16 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/nifi/blob/5c48655e/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/pom.xml
----------------------------------------------------------------------
diff --git a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/pom.xml b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/pom.xml
index 85b38ac..d6db9c7 100644
--- a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/pom.xml
+++ b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/pom.xml
@@ -29,6 +29,19 @@
<artifactId>nifi-poi-processors</artifactId>
<packaging>jar</packaging>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.rat</groupId>
+ <artifactId>apache-rat-plugin</artifactId>
+ <configuration>
+ <excludes combine.children="append">
+ <exclude>src/test/resources/with-blank-cells.csv</exclude>
+ </excludes>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
<dependencies>
<!-- https://mvnrepository.com/artifact/xerces/xerces -->
<dependency>
http://git-wip-us.apache.org/repos/asf/nifi/blob/5c48655e/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessor.java
----------------------------------------------------------------------
diff --git a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessor.java b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessor.java
index b881c69..6d8274b 100644
--- a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessor.java
+++ b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessor.java
@@ -24,6 +24,8 @@ import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.StringUtils;
@@ -78,6 +80,7 @@ public class ConvertExcelToCSVProcessor
public static final String SOURCE_FILE_NAME = "sourcefilename";
private static final String SAX_CELL_REF = "c";
private static final String SAX_CELL_TYPE = "t";
+ private static final String SAX_CELL_ADDRESS = "r";
private static final String SAX_CELL_STRING = "s";
private static final String SAX_CELL_CONTENT_REF = "v";
private static final String SAX_ROW_REF = "row";
@@ -85,6 +88,7 @@ public class ConvertExcelToCSVProcessor
private static final String DESIRED_SHEETS_DELIMITER = ",";
private static final String UNKNOWN_SHEET_NAME = "UNKNOWN";
private static final String SAX_PARSER = "org.apache.xerces.parsers.SAXParser";
+ private static final Pattern CELL_ADDRESS_REGEX = Pattern.compile("^([a-zA-Z]+)([\\d]+)$");
public static final PropertyDescriptor DESIRED_SHEETS = new PropertyDescriptor
.Builder().name("extract-sheets")
@@ -279,6 +283,27 @@ public class ConvertExcelToCSVProcessor
}
}
+ static Integer columnToIndex(String col) {
+ int length = col.length();
+ int accumulator = 0;
+ for (int i = length; i > 0; i--) {
+ char c = col.charAt(i - 1);
+ int x = ((int) c) - 64;
+ accumulator += x * Math.pow(26, length - i);
+ }
+ // Make it to start with 0.
+ return accumulator - 1;
+ }
+
+ private static class CellAddress {
+ final int row;
+ final int col;
+
+ private CellAddress(int row, int col) {
+ this.row = row;
+ this.col = col;
+ }
+ }
/**
* Extracts every row from an Excel Sheet and generates a corresponding JSONObject whose key is the Excel CellAddress and value
@@ -290,6 +315,10 @@ public class ConvertExcelToCSVProcessor
private SharedStringsTable sst;
private String currentContent;
private boolean nextIsString;
+ private CellAddress firstCellAddress;
+ private CellAddress firstRowLastCellAddress;
+ private CellAddress previousCellAddress;
+ private CellAddress nextCellAddress;
private OutputStream outputStream;
private boolean firstColInRow;
long rowCount;
@@ -306,18 +335,35 @@ public class ConvertExcelToCSVProcessor
this.outputStream = outputStream;
}
+
public void startElement(String uri, String localName, String name,
Attributes attributes) throws SAXException {
if (name.equals(SAX_CELL_REF)) {
String cellType = attributes.getValue(SAX_CELL_TYPE);
- if(cellType != null && cellType.equals(SAX_CELL_STRING)) {
+ // Analyze cell address.
+ Matcher cellAddressMatcher = CELL_ADDRESS_REGEX.matcher(attributes.getValue(SAX_CELL_ADDRESS));
+ if (cellAddressMatcher.matches()) {
+ String col = cellAddressMatcher.group(1);
+ String row = cellAddressMatcher.group(2);
+ nextCellAddress = new CellAddress(Integer.parseInt(row), columnToIndex(col));
+
+ if (firstCellAddress == null) {
+ firstCellAddress = nextCellAddress;
+ }
+ }
+ if (cellType != null && cellType.equals(SAX_CELL_STRING)) {
nextIsString = true;
} else {
nextIsString = false;
}
} else if (name.equals(SAX_ROW_REF)) {
+ if (firstRowLastCellAddress == null) {
+ firstRowLastCellAddress = previousCellAddress;
+ }
firstColInRow = true;
+ previousCellAddress = null;
+ nextCellAddress = null;
} else if (name.equals(SAX_SHEET_NAME_REF)) {
sheetName = attributes.getValue(0);
}
@@ -325,6 +371,16 @@ public class ConvertExcelToCSVProcessor
currentContent = "";
}
+ private void fillEmptyColumns(int nextColumn) throws IOException {
+ final CellAddress previousCell = previousCellAddress != null ? previousCellAddress : firstCellAddress;
+ if (previousCell != null) {
+ for (int i = 0; i < (nextColumn - previousCell.col); i++) {
+ // Fill columns.
+ outputStream.write(",".getBytes());
+ }
+ }
+ }
+
public void endElement(String uri, String localName, String name)
throws SAXException {
@@ -334,22 +390,20 @@ public class ConvertExcelToCSVProcessor
nextIsString = false;
}
- if (name.equals(SAX_CELL_CONTENT_REF)) {
- if (firstColInRow) {
+ if (name.equals(SAX_CELL_CONTENT_REF)
+ // Limit scanning from the first column, and up to the last column.
+ && (firstCellAddress == null || firstCellAddress.col <= nextCellAddress.col)
+ && (firstRowLastCellAddress == null || nextCellAddress.col <= firstRowLastCellAddress.col)) {
+ try {
+ // A cell is found.
+ fillEmptyColumns(nextCellAddress.col);
firstColInRow = false;
- try {
- outputStream.write(currentContent.getBytes());
- } catch (IOException e) {
- getLogger().error("IO error encountered while writing content of parsed cell " +
- "value from sheet {}", new Object[]{getSheetName()}, e);
- }
- } else {
- try {
- outputStream.write(("," + currentContent).getBytes());
- } catch (IOException e) {
- getLogger().error("IO error encountered while writing content of parsed cell " +
- "value from sheet {}", new Object[]{getSheetName()}, e);
- }
+ outputStream.write(currentContent.getBytes());
+ // Keep previously found cell address.
+ previousCellAddress = nextCellAddress;
+ } catch (IOException e) {
+ getLogger().error("IO error encountered while writing content of parsed cell " +
+ "value from sheet {}", new Object[]{getSheetName()}, e);
}
}
@@ -357,6 +411,9 @@ public class ConvertExcelToCSVProcessor
//If this is the first row and the end of the row element has been encountered then that means no columns were present.
if (!firstColInRow) {
try {
+ if (firstRowLastCellAddress != null) {
+ fillEmptyColumns(firstRowLastCellAddress.col);
+ }
rowCount++;
outputStream.write("\n".getBytes());
} catch (IOException e) {
http://git-wip-us.apache.org/repos/asf/nifi/blob/5c48655e/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/resources/docs/org.apache.nifi.processors.poi.ConvertExcelToCSVProcessor/additionalDetails.html
----------------------------------------------------------------------
diff --git a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/resources/docs/org.apache.nifi.processors.poi.ConvertExcelToCSVProcessor/additionalDetails.html b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/resources/docs/org.apache.nifi.processors.poi.ConvertExcelToCSVProcessor/additionalDetails.html
new file mode 100644
index 0000000..fbefa08
--- /dev/null
+++ b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/resources/docs/org.apache.nifi.processors.poi.ConvertExcelToCSVProcessor/additionalDetails.html
@@ -0,0 +1,97 @@
+<!DOCTYPE html>
+<html lang="en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<head>
+ <meta charset="utf-8" />
+ <title>ConvertExcelToCSVProcessor</title>
+ <style>
+table {
+ border-collapse: collapse;
+}
+
+table, th, td {
+ border: 1px solid #ccc;
+}
+
+td.r {
+ text-align: right;
+}
+
+td {
+ width: 50px;
+ padding: 5px;
+}
+ </style>
+ <link rel="stylesheet" href="../../../../../css/component-usage.css" type="text/css" />
+</head>
+
+<body>
+<h2>How it extracts CSV data from a sheet</h2>
+<p>
+ ConvertExcelToCSVProcessor extracts CSV data with following rules:
+</p>
+<ul>
+ <li>Find the fist cell which has a value in it (the FirstCell).</li>
+ <li>Scan cells in the first row, starting from the FirstCell,
+ until it reaches to a cell after which no cell with a value can not be found in the row (the FirstRowLastCell).</li>
+ <li>Process the 2nd row and later, from the column of FirstCell to the column of FirstRowLastCell.</li>
+ <li>If a row does not have any cell that has a value, then the row is ignored.</li>
+</ul>
+
+<p>
+ As an example, the sheet shown below will be:
+</p>
+
+<table>
+ <tbody>
+ <tr><th>row </th><th>A</th><th>B</th><th>C</th><th>D</th><th>E</th><th>F</th><th>G</th></tr>
+ <tr><td class="r"> 1</td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr>
+ <tr><td class="r"> 2</td><td> </td><td> </td><td>x</td><td>y</td><td>z</td><td> </td><td> </td></tr>
+ <tr><td class="r"> 3</td><td> </td><td> </td><td>1</td><td> </td><td> </td><td> </td><td> </td></tr>
+ <tr><td class="r"> 4</td><td>2</td><td> </td><td> </td><td>3</td><td> </td><td> </td><td> </td></tr>
+ <tr><td class="r"> 5</td><td> </td><td> </td><td> </td><td> </td><td>4</td><td> </td><td> </td></tr>
+ <tr><td class="r"> 6</td><td> </td><td> </td><td>5</td><td>6</td><td>7</td><td> </td><td> </td></tr>
+ <tr><td class="r"> 7</td><td> </td><td> </td><td> </td><td> </td><td> </td><td>8</td><td> </td></tr>
+ <tr><td class="r"> 8</td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr>
+ <tr><td class="r"> 9</td><td> </td><td> </td><td> </td><td> </td><td>9</td><td> </td><td> </td></tr>
+ <tr><td class="r"> 10</td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr>
+ <tr><td class="r"> 11</td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr>
+ </tbody>
+</table>
+
+<p>
+ converted to following CSV:
+</p>
+
+<pre>
+x,y,z
+1,,
+,3,
+,,4
+5,6,7
+,,9
+</pre>
+
+<ul>
+ <li>C2(x) is the FirstCell, and E2(z) is the FirstRowLastCell.</li>
+ <li>A4(2) is ignored because it is out of range. So is F7(8).</li>
+ <li>Row 7 and 8 are ignored because those do not have a valid cell.</li>
+ <li>It is important to have a header row as shown in the example to define data area,
+ especially when a sheet includes empty cells.</li>
+</ul>
+
+</body>
+</html>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nifi/blob/5c48655e/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessorTest.java
----------------------------------------------------------------------
diff --git a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessorTest.java b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessorTest.java
index 1972fbb..1df2568 100644
--- a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessorTest.java
+++ b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessorTest.java
@@ -16,9 +16,11 @@
*/
package org.apache.nifi.processors.poi;
+import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.File;
+import java.nio.charset.StandardCharsets;
import java.util.List;
import org.apache.nifi.flowfile.attributes.CoreAttributes;
@@ -40,6 +42,16 @@ public class ConvertExcelToCSVProcessorTest {
}
@Test
+ public void testColToIndex() {
+ assertEquals(Integer.valueOf(0), ConvertExcelToCSVProcessor.columnToIndex("A"));
+ assertEquals(Integer.valueOf(1), ConvertExcelToCSVProcessor.columnToIndex("B"));
+ assertEquals(Integer.valueOf(25), ConvertExcelToCSVProcessor.columnToIndex("Z"));
+ assertEquals(Integer.valueOf(29), ConvertExcelToCSVProcessor.columnToIndex("AD"));
+ assertEquals(Integer.valueOf(239), ConvertExcelToCSVProcessor.columnToIndex("IF"));
+ assertEquals(Integer.valueOf(16383), ConvertExcelToCSVProcessor.columnToIndex("XFD"));
+ }
+
+ @Test
public void testMultipleSheetsGeneratesMultipleFlowFiles() throws Exception {
testRunner.enqueue(new File("src/test/resources/TwoSheets.xlsx").toPath());
@@ -150,6 +162,30 @@ public class ConvertExcelToCSVProcessorTest {
}
/**
+ * Validates that a sheet contains blank cells can be converted to a CSV without missing columns.
+ *
+ * @throws Exception
+ * Any exception thrown during execution.
+ */
+ @Test
+ public void testProcessASheetWithBlankCells() throws Exception {
+
+ testRunner.setProperty(ConvertExcelToCSVProcessor.DESIRED_SHEETS, "Sheet1");
+ testRunner.enqueue(new File("src/test/resources/with-blank-cells.xlsx").toPath());
+ testRunner.run();
+
+ testRunner.assertTransferCount(ConvertExcelToCSVProcessor.SUCCESS, 1);
+ testRunner.assertTransferCount(ConvertExcelToCSVProcessor.ORIGINAL, 1);
+ testRunner.assertTransferCount(ConvertExcelToCSVProcessor.FAILURE, 0);
+
+ MockFlowFile ff = testRunner.getFlowFilesForRelationship(ConvertExcelToCSVProcessor.SUCCESS).get(0);
+ Long l = new Long(ff.getAttribute(ConvertExcelToCSVProcessor.ROW_NUM));
+ assertTrue(l == 8l);
+ ff.isContentEqual("test", StandardCharsets.UTF_8);
+ ff.assertContentEquals(new File("src/test/resources/with-blank-cells.csv"));
+ }
+
+ /**
* Tests for graceful handling and error messaging of unsupported .XLS files.
*/
@Test
http://git-wip-us.apache.org/repos/asf/nifi/blob/5c48655e/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/resources/with-blank-cells.csv
----------------------------------------------------------------------
diff --git a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/resources/with-blank-cells.csv b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/resources/with-blank-cells.csv
new file mode 100644
index 0000000..ff3f706
--- /dev/null
+++ b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/resources/with-blank-cells.csv
@@ -0,0 +1,8 @@
+A,B,C,D
+A1,,,
+,B2,C2,
+,,C3,
+,,C4,D4
+A5,,C5,D5
+A6,B6,,D6
+A7,B7,C7,D7
http://git-wip-us.apache.org/repos/asf/nifi/blob/5c48655e/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/resources/with-blank-cells.xlsx
----------------------------------------------------------------------
diff --git a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/resources/with-blank-cells.xlsx b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/resources/with-blank-cells.xlsx
new file mode 100644
index 0000000..a948246
Binary files /dev/null and b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/resources/with-blank-cells.xlsx differ