You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hop.apache.org by ha...@apache.org on 2023/03/08 19:27:51 UTC

[hop] branch master updated: Allow reading of large xlsx files, fixes #2307

This is an automated email from the ASF dual-hosted git repository.

hansva pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hop.git


The following commit(s) were added to refs/heads/master by this push:
     new 227ca98eb2 Allow reading of large xlsx files, fixes #2307
     new 26aea863c8 Merge pull request #2527 from hansva/master
227ca98eb2 is described below

commit 227ca98eb234161089675cafa02bccd713869033
Author: Hans Van Akelyen <ha...@gmail.com>
AuthorDate: Wed Mar 8 17:31:07 2023 +0100

    Allow reading of large xlsx files, fixes #2307
---
 .../spreadsheet/0008-read-large-xlsx-file.hpl      | 193 +++++++++++++++++++++
 .../spreadsheet/main-0008-read-large-xlsx.hwf      |  92 ++++++++++
 .../excelinput/staxpoi/StaxPoiWorkbook.java        |   2 +
 3 files changed, 287 insertions(+)

diff --git a/integration-tests/spreadsheet/0008-read-large-xlsx-file.hpl b/integration-tests/spreadsheet/0008-read-large-xlsx-file.hpl
new file mode 100644
index 0000000000..e3d4da3b78
--- /dev/null
+++ b/integration-tests/spreadsheet/0008-read-large-xlsx-file.hpl
@@ -0,0 +1,193 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements.  See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to You under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+-->
+<pipeline>
+  <info>
+    <name>0008-read-large-xlsx-file</name>
+    <name_sync_with_filename>Y</name_sync_with_filename>
+    <description/>
+    <extended_description/>
+    <pipeline_version/>
+    <pipeline_type>Normal</pipeline_type>
+    <parameters>
+    </parameters>
+    <capture_transform_performance>N</capture_transform_performance>
+    <transform_performance_capturing_delay>1000</transform_performance_capturing_delay>
+    <transform_performance_capturing_size_limit>100</transform_performance_capturing_size_limit>
+    <created_user>-</created_user>
+    <created_date>2023/03/08 16:23:08.688</created_date>
+    <modified_user>-</modified_user>
+    <modified_date>2023/03/08 16:23:08.688</modified_date>
+  </info>
+  <notepads>
+  </notepads>
+  <order>
+  </order>
+  <transform>
+    <name>Read large xlsx file</name>
+    <type>ExcelInput</type>
+    <description/>
+    <distribute>Y</distribute>
+    <custom_distribution/>
+    <copies>1</copies>
+    <partitioning>
+      <method>none</method>
+      <schema_name/>
+    </partitioning>
+    <accept_field/>
+    <accept_filenames>N</accept_filenames>
+    <accept_transform_name/>
+    <add_to_result_filenames>Y</add_to_result_filenames>
+    <bad_line_files_destination_directory/>
+    <bad_line_files_extension>warning</bad_line_files_extension>
+    <encoding/>
+    <error_ignored>N</error_ignored>
+    <error_line_files_destination_directory/>
+    <error_line_files_extension>error</error_line_files_extension>
+    <error_line_skipped>N</error_line_skipped>
+    <extensionFieldName/>
+    <fields>
+      <field>
+        <currency/>
+        <decimal/>
+        <format/>
+        <group/>
+        <length>-1</length>
+        <name>first_name</name>
+        <precision>-1</precision>
+        <repeat>N</repeat>
+        <trim_type>none</trim_type>
+        <type>String</type>
+      </field>
+      <field>
+        <currency/>
+        <decimal/>
+        <format/>
+        <group/>
+        <length>-1</length>
+        <name>last_name</name>
+        <precision>-1</precision>
+        <repeat>N</repeat>
+        <trim_type>none</trim_type>
+        <type>String</type>
+      </field>
+      <field>
+        <currency/>
+        <decimal/>
+        <format/>
+        <group/>
+        <length>-1</length>
+        <name>full_name</name>
+        <precision>-1</precision>
+        <repeat>N</repeat>
+        <trim_type>none</trim_type>
+        <type>String</type>
+      </field>
+      <field>
+        <currency/>
+        <decimal/>
+        <format/>
+        <group/>
+        <length>-1</length>
+        <name>Age</name>
+        <precision>-1</precision>
+        <repeat>N</repeat>
+        <trim_type>none</trim_type>
+        <type>String</type>
+      </field>
+      <field>
+        <currency/>
+        <decimal/>
+        <format/>
+        <group/>
+        <length>-1</length>
+        <name>Phone</name>
+        <precision>-1</precision>
+        <repeat>N</repeat>
+        <trim_type>none</trim_type>
+        <type>String</type>
+      </field>
+      <field>
+        <currency/>
+        <decimal/>
+        <format/>
+        <group/>
+        <length>-1</length>
+        <name>Street</name>
+        <precision>-1</precision>
+        <repeat>N</repeat>
+        <trim_type>none</trim_type>
+        <type>String</type>
+      </field>
+      <field>
+        <currency/>
+        <decimal/>
+        <format/>
+        <group/>
+        <length>-1</length>
+        <name>City</name>
+        <precision>-1</precision>
+        <repeat>N</repeat>
+        <trim_type>none</trim_type>
+        <type>String</type>
+      </field>
+    </fields>
+    <file>
+      <exclude_filemask/>
+      <file_required>N</file_required>
+      <filemask/>
+      <include_subfolders>N</include_subfolders>
+      <name>gs://apache-hop-it/input/large_excel_file.xlsx</name>
+    </file>
+    <filefield/>
+    <header>Y</header>
+    <hiddenFieldName/>
+    <lastModificationTimeFieldName/>
+    <limit>0</limit>
+    <line_number_files_destination_directory/>
+    <line_number_files_extension>line</line_number_files_extension>
+    <noempty>Y</noempty>
+    <pathFieldName/>
+    <rootUriNameFieldName/>
+    <rownumfield/>
+    <sheetfield/>
+    <sheetrownumfield/>
+    <sheets>
+      <sheet>
+        <name>Sheet1</name>
+        <startcol>0</startcol>
+        <startrow>0</startrow>
+      </sheet>
+    </sheets>
+    <shortFileFieldName/>
+    <sizeFieldName/>
+    <spreadsheet_type>SAX_POI</spreadsheet_type>
+    <stoponempty>N</stoponempty>
+    <strict_types>N</strict_types>
+    <uriNameFieldName/>
+    <attributes/>
+    <GUI>
+      <xloc>144</xloc>
+      <yloc>64</yloc>
+    </GUI>
+  </transform>
+  <transform_error_handling>
+  </transform_error_handling>
+  <attributes/>
+</pipeline>
diff --git a/integration-tests/spreadsheet/main-0008-read-large-xlsx.hwf b/integration-tests/spreadsheet/main-0008-read-large-xlsx.hwf
new file mode 100644
index 0000000000..2f108fb6b7
--- /dev/null
+++ b/integration-tests/spreadsheet/main-0008-read-large-xlsx.hwf
@@ -0,0 +1,92 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements.  See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to You under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+-->
+<workflow>
+  <name>main-0008-read-large-xlsx</name>
+  <name_sync_with_filename>Y</name_sync_with_filename>
+  <description/>
+  <extended_description/>
+  <workflow_version/>
+  <created_user>-</created_user>
+  <created_date>2021/01/05 13:04:10.956</created_date>
+  <modified_user>-</modified_user>
+  <modified_date>2021/01/05 13:04:10.956</modified_date>
+  <parameters>
+    </parameters>
+  <actions>
+    <action>
+      <name>Start</name>
+      <description/>
+      <type>SPECIAL</type>
+      <attributes/>
+      <DayOfMonth>1</DayOfMonth>
+      <hour>12</hour>
+      <intervalMinutes>60</intervalMinutes>
+      <intervalSeconds>0</intervalSeconds>
+      <minutes>0</minutes>
+      <repeat>N</repeat>
+      <schedulerType>0</schedulerType>
+      <weekDay>1</weekDay>
+      <parallel>N</parallel>
+      <xloc>50</xloc>
+      <yloc>50</yloc>
+      <attributes_hac/>
+    </action>
+    <action>
+      <name>0008-read-large-xlsx-file.hpl</name>
+      <description/>
+      <type>PIPELINE</type>
+      <attributes/>
+      <filename>${PROJECT_HOME}/0008-read-large-xlsx-file.hpl</filename>
+      <params_from_previous>N</params_from_previous>
+      <exec_per_row>N</exec_per_row>
+      <clear_rows>N</clear_rows>
+      <clear_files>N</clear_files>
+      <set_logfile>N</set_logfile>
+      <logfile/>
+      <logext/>
+      <add_date>N</add_date>
+      <add_time>N</add_time>
+      <loglevel>Basic</loglevel>
+      <set_append_logfile>N</set_append_logfile>
+      <wait_until_finished>Y</wait_until_finished>
+      <create_parent_folder>N</create_parent_folder>
+      <run_configuration>local</run_configuration>
+      <parameters>
+        <pass_all_parameters>Y</pass_all_parameters>
+      </parameters>
+      <parallel>N</parallel>
+      <xloc>240</xloc>
+      <yloc>48</yloc>
+      <attributes_hac/>
+    </action>
+  </actions>
+  <hops>
+    <hop>
+      <from>Start</from>
+      <to>0008-read-large-xlsx-file.hpl</to>
+      <enabled>Y</enabled>
+      <evaluation>Y</evaluation>
+      <unconditional>Y</unconditional>
+    </hop>
+  </hops>
+  <notepads>
+  </notepads>
+  <attributes/>
+</workflow>
diff --git a/plugins/transforms/excel/src/main/java/org/apache/hop/pipeline/transforms/excelinput/staxpoi/StaxPoiWorkbook.java b/plugins/transforms/excel/src/main/java/org/apache/hop/pipeline/transforms/excelinput/staxpoi/StaxPoiWorkbook.java
index 2b71f5bf47..fe49124a5e 100644
--- a/plugins/transforms/excel/src/main/java/org/apache/hop/pipeline/transforms/excelinput/staxpoi/StaxPoiWorkbook.java
+++ b/plugins/transforms/excel/src/main/java/org/apache/hop/pipeline/transforms/excelinput/staxpoi/StaxPoiWorkbook.java
@@ -24,6 +24,7 @@ import org.apache.hop.core.spreadsheet.IKSheet;
 import org.apache.hop.core.spreadsheet.IKWorkbook;
 import org.apache.hop.core.vfs.HopVfs;
 import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.util.IOUtils;
 import org.apache.poi.xssf.eventusermodel.XSSFReader;
 
 import javax.xml.stream.XMLInputFactory;
@@ -61,6 +62,7 @@ public class StaxPoiWorkbook implements IKWorkbook {
 
   protected StaxPoiWorkbook() {
     openSheetsMap = new HashMap<>();
+    IOUtils.setByteArrayMaxOverride(Integer.MAX_VALUE);
     this.log = HopLogStore.getLogChannelFactory().create(this);
   }