You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hop.apache.org by ha...@apache.org on 2023/03/08 19:27:51 UTC
[hop] branch master updated: Allow reading of large xlsx files, fixes #2307
This is an automated email from the ASF dual-hosted git repository.
hansva pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hop.git
The following commit(s) were added to refs/heads/master by this push:
new 227ca98eb2 Allow reading of large xlsx files, fixes #2307
new 26aea863c8 Merge pull request #2527 from hansva/master
227ca98eb2 is described below
commit 227ca98eb234161089675cafa02bccd713869033
Author: Hans Van Akelyen <ha...@gmail.com>
AuthorDate: Wed Mar 8 17:31:07 2023 +0100
Allow reading of large xlsx files, fixes #2307
---
.../spreadsheet/0008-read-large-xlsx-file.hpl | 193 +++++++++++++++++++++
.../spreadsheet/main-0008-read-large-xlsx.hwf | 92 ++++++++++
.../excelinput/staxpoi/StaxPoiWorkbook.java | 2 +
3 files changed, 287 insertions(+)
diff --git a/integration-tests/spreadsheet/0008-read-large-xlsx-file.hpl b/integration-tests/spreadsheet/0008-read-large-xlsx-file.hpl
new file mode 100644
index 0000000000..e3d4da3b78
--- /dev/null
+++ b/integration-tests/spreadsheet/0008-read-large-xlsx-file.hpl
@@ -0,0 +1,193 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements. See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to You under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+-->
+<pipeline>
+ <info>
+ <name>0008-read-large-xlsx-file</name>
+ <name_sync_with_filename>Y</name_sync_with_filename>
+ <description/>
+ <extended_description/>
+ <pipeline_version/>
+ <pipeline_type>Normal</pipeline_type>
+ <parameters>
+ </parameters>
+ <capture_transform_performance>N</capture_transform_performance>
+ <transform_performance_capturing_delay>1000</transform_performance_capturing_delay>
+ <transform_performance_capturing_size_limit>100</transform_performance_capturing_size_limit>
+ <created_user>-</created_user>
+ <created_date>2023/03/08 16:23:08.688</created_date>
+ <modified_user>-</modified_user>
+ <modified_date>2023/03/08 16:23:08.688</modified_date>
+ </info>
+ <notepads>
+ </notepads>
+ <order>
+ </order>
+ <transform>
+ <name>Read large xlsx file</name>
+ <type>ExcelInput</type>
+ <description/>
+ <distribute>Y</distribute>
+ <custom_distribution/>
+ <copies>1</copies>
+ <partitioning>
+ <method>none</method>
+ <schema_name/>
+ </partitioning>
+ <accept_field/>
+ <accept_filenames>N</accept_filenames>
+ <accept_transform_name/>
+ <add_to_result_filenames>Y</add_to_result_filenames>
+ <bad_line_files_destination_directory/>
+ <bad_line_files_extension>warning</bad_line_files_extension>
+ <encoding/>
+ <error_ignored>N</error_ignored>
+ <error_line_files_destination_directory/>
+ <error_line_files_extension>error</error_line_files_extension>
+ <error_line_skipped>N</error_line_skipped>
+ <extensionFieldName/>
+ <fields>
+ <field>
+ <currency/>
+ <decimal/>
+ <format/>
+ <group/>
+ <length>-1</length>
+ <name>first_name</name>
+ <precision>-1</precision>
+ <repeat>N</repeat>
+ <trim_type>none</trim_type>
+ <type>String</type>
+ </field>
+ <field>
+ <currency/>
+ <decimal/>
+ <format/>
+ <group/>
+ <length>-1</length>
+ <name>last_name</name>
+ <precision>-1</precision>
+ <repeat>N</repeat>
+ <trim_type>none</trim_type>
+ <type>String</type>
+ </field>
+ <field>
+ <currency/>
+ <decimal/>
+ <format/>
+ <group/>
+ <length>-1</length>
+ <name>full_name</name>
+ <precision>-1</precision>
+ <repeat>N</repeat>
+ <trim_type>none</trim_type>
+ <type>String</type>
+ </field>
+ <field>
+ <currency/>
+ <decimal/>
+ <format/>
+ <group/>
+ <length>-1</length>
+ <name>Age</name>
+ <precision>-1</precision>
+ <repeat>N</repeat>
+ <trim_type>none</trim_type>
+ <type>String</type>
+ </field>
+ <field>
+ <currency/>
+ <decimal/>
+ <format/>
+ <group/>
+ <length>-1</length>
+ <name>Phone</name>
+ <precision>-1</precision>
+ <repeat>N</repeat>
+ <trim_type>none</trim_type>
+ <type>String</type>
+ </field>
+ <field>
+ <currency/>
+ <decimal/>
+ <format/>
+ <group/>
+ <length>-1</length>
+ <name>Street</name>
+ <precision>-1</precision>
+ <repeat>N</repeat>
+ <trim_type>none</trim_type>
+ <type>String</type>
+ </field>
+ <field>
+ <currency/>
+ <decimal/>
+ <format/>
+ <group/>
+ <length>-1</length>
+ <name>City</name>
+ <precision>-1</precision>
+ <repeat>N</repeat>
+ <trim_type>none</trim_type>
+ <type>String</type>
+ </field>
+ </fields>
+ <file>
+ <exclude_filemask/>
+ <file_required>N</file_required>
+ <filemask/>
+ <include_subfolders>N</include_subfolders>
+ <name>gs://apache-hop-it/input/large_excel_file.xlsx</name>
+ </file>
+ <filefield/>
+ <header>Y</header>
+ <hiddenFieldName/>
+ <lastModificationTimeFieldName/>
+ <limit>0</limit>
+ <line_number_files_destination_directory/>
+ <line_number_files_extension>line</line_number_files_extension>
+ <noempty>Y</noempty>
+ <pathFieldName/>
+ <rootUriNameFieldName/>
+ <rownumfield/>
+ <sheetfield/>
+ <sheetrownumfield/>
+ <sheets>
+ <sheet>
+ <name>Sheet1</name>
+ <startcol>0</startcol>
+ <startrow>0</startrow>
+ </sheet>
+ </sheets>
+ <shortFileFieldName/>
+ <sizeFieldName/>
+ <spreadsheet_type>SAX_POI</spreadsheet_type>
+ <stoponempty>N</stoponempty>
+ <strict_types>N</strict_types>
+ <uriNameFieldName/>
+ <attributes/>
+ <GUI>
+ <xloc>144</xloc>
+ <yloc>64</yloc>
+ </GUI>
+ </transform>
+ <transform_error_handling>
+ </transform_error_handling>
+ <attributes/>
+</pipeline>
diff --git a/integration-tests/spreadsheet/main-0008-read-large-xlsx.hwf b/integration-tests/spreadsheet/main-0008-read-large-xlsx.hwf
new file mode 100644
index 0000000000..2f108fb6b7
--- /dev/null
+++ b/integration-tests/spreadsheet/main-0008-read-large-xlsx.hwf
@@ -0,0 +1,92 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements. See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to You under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+-->
+<workflow>
+ <name>main-0008-read-large-xlsx</name>
+ <name_sync_with_filename>Y</name_sync_with_filename>
+ <description/>
+ <extended_description/>
+ <workflow_version/>
+ <created_user>-</created_user>
+ <created_date>2021/01/05 13:04:10.956</created_date>
+ <modified_user>-</modified_user>
+ <modified_date>2021/01/05 13:04:10.956</modified_date>
+ <parameters>
+ </parameters>
+ <actions>
+ <action>
+ <name>Start</name>
+ <description/>
+ <type>SPECIAL</type>
+ <attributes/>
+ <DayOfMonth>1</DayOfMonth>
+ <hour>12</hour>
+ <intervalMinutes>60</intervalMinutes>
+ <intervalSeconds>0</intervalSeconds>
+ <minutes>0</minutes>
+ <repeat>N</repeat>
+ <schedulerType>0</schedulerType>
+ <weekDay>1</weekDay>
+ <parallel>N</parallel>
+ <xloc>50</xloc>
+ <yloc>50</yloc>
+ <attributes_hac/>
+ </action>
+ <action>
+ <name>0008-read-large-xlsx-file.hpl</name>
+ <description/>
+ <type>PIPELINE</type>
+ <attributes/>
+ <filename>${PROJECT_HOME}/0008-read-large-xlsx-file.hpl</filename>
+ <params_from_previous>N</params_from_previous>
+ <exec_per_row>N</exec_per_row>
+ <clear_rows>N</clear_rows>
+ <clear_files>N</clear_files>
+ <set_logfile>N</set_logfile>
+ <logfile/>
+ <logext/>
+ <add_date>N</add_date>
+ <add_time>N</add_time>
+ <loglevel>Basic</loglevel>
+ <set_append_logfile>N</set_append_logfile>
+ <wait_until_finished>Y</wait_until_finished>
+ <create_parent_folder>N</create_parent_folder>
+ <run_configuration>local</run_configuration>
+ <parameters>
+ <pass_all_parameters>Y</pass_all_parameters>
+ </parameters>
+ <parallel>N</parallel>
+ <xloc>240</xloc>
+ <yloc>48</yloc>
+ <attributes_hac/>
+ </action>
+ </actions>
+ <hops>
+ <hop>
+ <from>Start</from>
+ <to>0008-read-large-xlsx-file.hpl</to>
+ <enabled>Y</enabled>
+ <evaluation>Y</evaluation>
+ <unconditional>Y</unconditional>
+ </hop>
+ </hops>
+ <notepads>
+ </notepads>
+ <attributes/>
+</workflow>
diff --git a/plugins/transforms/excel/src/main/java/org/apache/hop/pipeline/transforms/excelinput/staxpoi/StaxPoiWorkbook.java b/plugins/transforms/excel/src/main/java/org/apache/hop/pipeline/transforms/excelinput/staxpoi/StaxPoiWorkbook.java
index 2b71f5bf47..fe49124a5e 100644
--- a/plugins/transforms/excel/src/main/java/org/apache/hop/pipeline/transforms/excelinput/staxpoi/StaxPoiWorkbook.java
+++ b/plugins/transforms/excel/src/main/java/org/apache/hop/pipeline/transforms/excelinput/staxpoi/StaxPoiWorkbook.java
@@ -24,6 +24,7 @@ import org.apache.hop.core.spreadsheet.IKSheet;
import org.apache.hop.core.spreadsheet.IKWorkbook;
import org.apache.hop.core.vfs.HopVfs;
import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.util.IOUtils;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import javax.xml.stream.XMLInputFactory;
@@ -61,6 +62,7 @@ public class StaxPoiWorkbook implements IKWorkbook {
protected StaxPoiWorkbook() {
openSheetsMap = new HashMap<>();
+ IOUtils.setByteArrayMaxOverride(Integer.MAX_VALUE);
this.log = HopLogStore.getLogChannelFactory().create(this);
}