You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@nifi.apache.org by jvwing <gi...@git.apache.org> on 2017/03/16 23:21:34 UTC
[GitHub] nifi pull request #929: NIFI-2613 - Apache POI processor to convert Excel do...

Github user jvwing commented on a diff in the pull request:

    https://github.com/apache/nifi/pull/929#discussion_r106554040
  
    --- Diff: nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessor.java ---
    @@ -0,0 +1,409 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +package org.apache.nifi.processors.poi;
    +
    +import java.io.IOException;
    +import java.io.InputStream;
    +import java.io.OutputStream;
    +import java.util.ArrayList;
    +import java.util.Collections;
    +import java.util.HashSet;
    +import java.util.List;
    +import java.util.Set;
    +
    +import org.apache.commons.io.FilenameUtils;
    +import org.apache.commons.lang3.StringUtils;
    +import org.apache.nifi.annotation.behavior.WritesAttribute;
    +import org.apache.nifi.annotation.behavior.WritesAttributes;
    +import org.apache.nifi.annotation.documentation.CapabilityDescription;
    +import org.apache.nifi.annotation.documentation.Tags;
    +import org.apache.nifi.components.PropertyDescriptor;
    +import org.apache.nifi.flowfile.FlowFile;
    +import org.apache.nifi.flowfile.attributes.CoreAttributes;
    +import org.apache.nifi.processor.AbstractProcessor;
    +import org.apache.nifi.processor.ProcessContext;
    +import org.apache.nifi.processor.ProcessSession;
    +import org.apache.nifi.processor.ProcessorInitializationContext;
    +import org.apache.nifi.processor.Relationship;
    +import org.apache.nifi.processor.exception.ProcessException;
    +import org.apache.nifi.processor.io.InputStreamCallback;
    +import org.apache.nifi.processor.io.OutputStreamCallback;
    +import org.apache.nifi.processor.util.StandardValidators;
    +import org.apache.poi.openxml4j.opc.OPCPackage;
    +import org.apache.poi.xssf.eventusermodel.XSSFReader;
    +import org.apache.poi.xssf.model.SharedStringsTable;
    +import org.apache.poi.xssf.usermodel.XSSFRichTextString;
    +import org.xml.sax.Attributes;
    +import org.xml.sax.InputSource;
    +import org.xml.sax.SAXException;
    +import org.xml.sax.XMLReader;
    +import org.xml.sax.helpers.DefaultHandler;
    +import org.xml.sax.helpers.XMLReaderFactory;
    +
    +
    +@Tags({"excel", "csv", "poi"})
    +@CapabilityDescription("Consumes a Microsoft Excel document and converts each worksheet to csv. Each sheet from the incoming Excel " +
    +        "document will generate a new Flowfile that will be output from this processor. Each output Flowfile's contents will be formatted as a csv file " +
    +        "where the each row from the excel sheet is output as a newline in the csv file.")
    +@WritesAttributes({@WritesAttribute(attribute="sheetname", description="The name of the Excel sheet that this particular row of data came from in the Excel document"),
    +        @WritesAttribute(attribute="numrows", description="The number of rows in this Excel Sheet"),
    +        @WritesAttribute(attribute="sourcefilename", description="The name of the Excel document file that this data originated from"),
    +        @WritesAttribute(attribute="convertexceltocsvprocessor.error", description="Error message that was encountered on a per Excel sheet basis. This attribute is" +
    +                " only populated if an error was occured while processing the particular sheet. Having the error present at the sheet level will allow for the end" +
    +                " user to better understand what syntax errors in their excel doc on a larger scale caused the error.")})
    +public class ConvertExcelToCSVProcessor
    +        extends AbstractProcessor {
    +
    +    private static final String CSV_MIME_TYPE = "text/csv";
    +    public static final String SHEET_NAME = "sheetname";
    +    public static final String ROW_NUM = "numrows";
    +    public static final String SOURCE_FILE_NAME = "sourcefilename";
    +    private static final String SAX_CELL_REF = "c";
    +    private static final String SAX_CELL_TYPE = "t";
    +    private static final String SAX_CELL_STRING = "s";
    +    private static final String SAX_CELL_CONTENT_REF = "v";
    +    private static final String SAX_ROW_REF = "row";
    +    private static final String SAX_SHEET_NAME_REF = "sheetPr";
    +    private static final String DESIRED_SHEETS_DELIMITER = ",";
    +    private static final String UNKNOWN_SHEET_NAME = "UNKNOWN";
    +
    +    public static final PropertyDescriptor DESIRED_SHEETS = new PropertyDescriptor
    +            .Builder().name("extract-sheets")
    +            .displayName("Sheets to Extract")
    +            .description("Comma separated list of Excel document sheet names that should be extracted from the excel document. If this property" +
    +                    " is left blank then all of the sheets will be extracted from the Excel document. The list of names is case in-sensitive. Any sheets not " +
    +                    "specified in this value will be ignored.")
    +            .required(false)
    +            .expressionLanguageSupported(true)
    +            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
    +            .build();
    +
    +    public static final Relationship ORIGINAL = new Relationship.Builder()
    +            .name("original")
    +            .description("Original Excel document received by this processor")
    +            .build();
    +
    +    public static final Relationship SUCCESS = new Relationship.Builder()
    +            .name("success")
    +            .description("Excel data converted to csv")
    +            .build();
    +
    +    public static final Relationship FAILURE = new Relationship.Builder()
    +            .name("failure")
    +            .description("Failed to parse the Excel document")
    +            .build();
    +
    +    private List<PropertyDescriptor> descriptors;
    +
    +    private Set<Relationship> relationships;
    +
    +    @Override
    +    protected void init(final ProcessorInitializationContext context) {
    +        final List<PropertyDescriptor> descriptors = new ArrayList<>();
    +        descriptors.add(DESIRED_SHEETS);
    +        this.descriptors = Collections.unmodifiableList(descriptors);
    +
    +        final Set<Relationship> relationships = new HashSet<>();
    +        relationships.add(ORIGINAL);
    +        relationships.add(SUCCESS);
    +        relationships.add(FAILURE);
    +        this.relationships = Collections.unmodifiableSet(relationships);
    +    }
    +
    +    @Override
    +    public Set<Relationship> getRelationships() {
    +        return this.relationships;
    +    }
    +
    +    @Override
    +    public final List<PropertyDescriptor> getSupportedPropertyDescriptors() {
    +        return descriptors;
    +    }
    +
    +    @Override
    +    public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
    +        final FlowFile flowFile = session.get();
    +        if ( flowFile == null ) {
    +            return;
    +        }
    +
    +        try {
    +
    +            session.read(flowFile, new InputStreamCallback() {
    +                @Override
    +                public void process(InputStream inputStream) throws IOException {
    +
    +                    try {
    +                        String desiredSheetsDelimited = context.getProperty(DESIRED_SHEETS)
    +                                .evaluateAttributeExpressions().getValue();
    +
    +                        OPCPackage pkg = OPCPackage.open(inputStream);
    +                        XSSFReader r = new XSSFReader(pkg);
    +                        SharedStringsTable sst = r.getSharedStringsTable();
    +                        XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) r.getSheetsData();
    +
    +                        if (desiredSheetsDelimited != null) {
    +
    +                            String[] desiredSheets = StringUtils
    +                                    .split(desiredSheetsDelimited, DESIRED_SHEETS_DELIMITER);
    +
    +                            if (desiredSheets != null) {
    +
    +                                while (iter.hasNext()) {
    +                                    InputStream sheet = iter.next();
    +                                    String sheetName = iter.getSheetName();
    +
    +                                    for (int i = 0; i < desiredSheets.length; i++) {
    +                                        //If the sheetName is a desired one parse it
    +                                        if (sheetName.equalsIgnoreCase(desiredSheets[i])) {
    +                                            handleExcelSheet(session, flowFile, sst, sheet, sheetName);
    +                                            break;
    +                                        }
    +                                    }
    +                                }
    +                            } else {
    +                                getLogger().debug("Excel document was parsed but no sheets with the specified desired names were found.");
    +                            }
    +
    +                        } else {
    +                            //Get all of the sheets in the document.
    +                            while (iter.hasNext()) {
    +                                handleExcelSheet(session, flowFile, sst, iter.next(), iter.getSheetName());
    +                            }
    +                        }
    +                    } catch (Exception ex) {
    +                        getLogger().error(ex.getMessage());
    --- End diff --
    
    Please include a comment and the full exception trace, like `getLogger().error("Attempt to do X failed", ex)`.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---