You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by do...@apache.org on 2021/08/04 09:11:47 UTC

[orc] branch main updated: ORC-909: Remove commons-io v2.1 dependency (#815)

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/main by this push:
     new a9a5286  ORC-909: Remove commons-io v2.1 dependency (#815)
a9a5286 is described below

commit a9a528693828ab4addf4607bb8da15a6027fe21b
Author: Dongjoon Hyun <do...@apache.org>
AuthorDate: Wed Aug 4 02:11:40 2021 -0700

    ORC-909: Remove commons-io v2.1 dependency (#815)
    
    ### What changes were proposed in this pull request?
    
    This PR aims to remove the `commons-io` v2.1 dependency, which comes from `Hadoop 2.2.0`.
    - https://mvnrepository.com/artifact/org.apache.orc/orc-core/1.6.9
    
    ### Why are the changes needed?
    
    This is a hidden dependency due to four lines using a single function `IOUtils.skipFully`. We had better cut this.
    ```
    $ git grep org.apache.commons.io
    core/src/java/org/apache/orc/impl/RunLengthByteReader.java:import org.apache.commons.io.IOUtils;
    core/src/java/org/apache/orc/impl/SerializationUtils.java:import org.apache.commons.io.IOUtils;
    ```
    
    ```
    $ git grep IOUtils | grep -v import
    core/src/java/org/apache/orc/impl/RunLengthByteReader.java:        IOUtils.skipFully(input,1);
    core/src/java/org/apache/orc/impl/RunLengthByteReader.java:        IOUtils.skipFully(input, numSkipRows);
    core/src/java/org/apache/orc/impl/SerializationUtils.java:    IOUtils.skipFully(in, numOfFloats * 4L);
    core/src/java/org/apache/orc/impl/SerializationUtils.java:    IOUtils.skipFully(in, numOfDoubles * 8L);
    ```
    
    ### How was this patch tested?
    
    Pass the CIs.
---
 .../core/src/java/org/apache/orc/impl/IOUtils.java | 143 +++++++++++++++++++++
 .../org/apache/orc/impl/RunLengthByteReader.java   |   1 -
 .../org/apache/orc/impl/SerializationUtils.java    |   1 -
 3 files changed, 143 insertions(+), 2 deletions(-)

diff --git a/java/core/src/java/org/apache/orc/impl/IOUtils.java b/java/core/src/java/org/apache/orc/impl/IOUtils.java
new file mode 100644
index 0000000..2dbe735
--- /dev/null
+++ b/java/core/src/java/org/apache/orc/impl/IOUtils.java
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.impl;
+
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * This is copied from the commons-io project to cut the dependency
+ * that comes from old Hadoop.
+ */
+public final class IOUtils {
+
+  public static final int DEFAULT_BUFFER_SIZE = 8192;
+
+  /**
+   * Returns a new byte array of size {@link #DEFAULT_BUFFER_SIZE}.
+   *
+   * @return a new byte array of size {@link #DEFAULT_BUFFER_SIZE}.
+   * @since 2.9.0
+   */
+  public static byte[] byteArray() {
+    return byteArray(DEFAULT_BUFFER_SIZE);
+  }
+
+  /**
+   * Returns a new byte array of the given size.
+   *
+   * TODO Consider guarding or warning against large allocations...
+   *
+   * @param size array size.
+   * @return a new byte array of the given size.
+   * @since 2.9.0
+   */
+  public static byte[] byteArray(final int size) {
+    return new byte[size];
+  }
+
+  /**
+   * Internal byte array buffer.
+   */
+  private static final ThreadLocal<byte[]> SKIP_BYTE_BUFFER =
+      ThreadLocal.withInitial(IOUtils::byteArray);
+
+  /**
+   * Gets the thread local byte array.
+   *
+   * @return the thread local byte array.
+   */
+  static byte[] getByteArray() {
+    return SKIP_BYTE_BUFFER.get();
+  }
+
+  /**
+   * Skips the requested number of bytes or fails if there are not enough left.
+   * <p>
+   * This allows for the possibility that {@link InputStream#skip(long)} may
+   * not skip as many bytes as requested (most likely because of reaching EOF).
+   * <p>
+   * Note that the implementation uses {@link #skip(InputStream, long)}.
+   * This means that the method may be considerably less efficient than using the actual skip implementation,
+   * this is done to guarantee that the correct number of bytes are skipped.
+   * </p>
+   *
+   * @param input stream to skip
+   * @param toSkip the number of bytes to skip
+   * @throws IOException              if there is a problem reading the file
+   * @throws IllegalArgumentException if toSkip is negative
+   * @throws EOFException             if the number of bytes skipped was incorrect
+   * @see InputStream#skip(long)
+   * @since 2.0
+   */
+  public static void skipFully(final InputStream input, final long toSkip)
+      throws IOException {
+    if (toSkip < 0) {
+      throw new IllegalArgumentException("Bytes to skip must not be negative: " + toSkip);
+    }
+    final long skipped = skip(input, toSkip);
+    if (skipped != toSkip) {
+      throw new EOFException("Bytes to skip: " + toSkip + " actual: " + skipped);
+    }
+  }
+
+  /**
+   * Skips bytes from an input byte stream.
+   * This implementation guarantees that it will read as many bytes
+   * as possible before giving up; this may not always be the case for
+   * skip() implementations in subclasses of {@link InputStream}.
+   * <p>
+   * Note that the implementation uses {@link InputStream#read(byte[], int, int)} rather
+   * than delegating to {@link InputStream#skip(long)}.
+   * This means that the method may be considerably less efficient than using the actual skip implementation,
+   * this is done to guarantee that the correct number of bytes are skipped.
+   * </p>
+   *
+   * @param input byte stream to skip
+   * @param toSkip number of bytes to skip.
+   * @return number of bytes actually skipped.
+   * @throws IOException              if there is a problem reading the file
+   * @throws IllegalArgumentException if toSkip is negative
+   * @see <a href="https://issues.apache.org/jira/browse/IO-203">IO-203 - Add skipFully() method for InputStreams</a>
+   * @since 2.0
+   */
+  public static long skip(final InputStream input, final long toSkip) throws IOException {
+    if (toSkip < 0) {
+      throw new IllegalArgumentException("Skip count must be non-negative, actual: " + toSkip);
+    }
+    /*
+     * N.B. no need to synchronize access to SKIP_BYTE_BUFFER: - we don't care if the buffer is created multiple
+     * times (the data is ignored) - we always use the same size buffer, so if it is recreated it will still be
+     * OK (if the buffer size were variable, we would need to synch. to ensure some other thread did not create a
+     * smaller one)
+     */
+    long remain = toSkip;
+    while (remain > 0) {
+      // See https://issues.apache.org/jira/browse/IO-203 for why we use read() rather than delegating to skip()
+      final byte[] byteArray = getByteArray();
+      final long n = input.read(byteArray, 0, (int) Math.min(remain, byteArray.length));
+      if (n < 0) { // EOF
+          break;
+      }
+      remain -= n;
+    }
+    return toSkip - remain;
+  }
+}
diff --git a/java/core/src/java/org/apache/orc/impl/RunLengthByteReader.java b/java/core/src/java/org/apache/orc/impl/RunLengthByteReader.java
index 9f4c236..805aaae 100644
--- a/java/core/src/java/org/apache/orc/impl/RunLengthByteReader.java
+++ b/java/core/src/java/org/apache/orc/impl/RunLengthByteReader.java
@@ -20,7 +20,6 @@ package org.apache.orc.impl;
 import java.io.EOFException;
 import java.io.IOException;
 
-import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
 
 /**
diff --git a/java/core/src/java/org/apache/orc/impl/SerializationUtils.java b/java/core/src/java/org/apache/orc/impl/SerializationUtils.java
index 283ffc6..3afa3eb 100644
--- a/java/core/src/java/org/apache/orc/impl/SerializationUtils.java
+++ b/java/core/src/java/org/apache/orc/impl/SerializationUtils.java
@@ -18,7 +18,6 @@
 
 package org.apache.orc.impl;
 
-import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
 import org.apache.orc.CompressionCodec;
 import org.apache.orc.OrcFile;