You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by do...@apache.org on 2021/08/04 09:14:04 UTC
[orc] branch branch-1.7 updated: ORC-909: Remove commons-io v2.1 dependency (#815)
This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-1.7
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/branch-1.7 by this push:
new c1b1ad7 ORC-909: Remove commons-io v2.1 dependency (#815)
c1b1ad7 is described below
commit c1b1ad7a0606dbd0dc954cbca66ca7329c77a89a
Author: Dongjoon Hyun <do...@apache.org>
AuthorDate: Wed Aug 4 02:11:40 2021 -0700
ORC-909: Remove commons-io v2.1 dependency (#815)
### What changes were proposed in this pull request?
This PR aims to remove `commons-io` v2.1 dependency which comes from `Hadoop 2.2.0`.
- https://mvnrepository.com/artifact/org.apache.orc/orc-core/1.6.9
### Why are the changes needed?
This is a hidden dependency due to four lines using a single function `IOUtils.skipFully`. We had better cut this.
```
$ git grep org.apache.commons.io
core/src/java/org/apache/orc/impl/RunLengthByteReader.java:import org.apache.commons.io.IOUtils;
core/src/java/org/apache/orc/impl/SerializationUtils.java:import org.apache.commons.io.IOUtils;
```
```
$ git grep IOUtils | grep -v import
core/src/java/org/apache/orc/impl/RunLengthByteReader.java: IOUtils.skipFully(input,1);
core/src/java/org/apache/orc/impl/RunLengthByteReader.java: IOUtils.skipFully(input, numSkipRows);
core/src/java/org/apache/orc/impl/SerializationUtils.java: IOUtils.skipFully(in, numOfFloats * 4L);
core/src/java/org/apache/orc/impl/SerializationUtils.java: IOUtils.skipFully(in, numOfDoubles * 8L);
```
### How was this patch tested?
Pass the CIs.
(cherry picked from commit a9a528693828ab4addf4607bb8da15a6027fe21b)
Signed-off-by: Dongjoon Hyun <do...@apache.org>
---
.../core/src/java/org/apache/orc/impl/IOUtils.java | 143 +++++++++++++++++++++
.../org/apache/orc/impl/RunLengthByteReader.java | 1 -
.../org/apache/orc/impl/SerializationUtils.java | 1 -
3 files changed, 143 insertions(+), 2 deletions(-)
diff --git a/java/core/src/java/org/apache/orc/impl/IOUtils.java b/java/core/src/java/org/apache/orc/impl/IOUtils.java
new file mode 100644
index 0000000..2dbe735
--- /dev/null
+++ b/java/core/src/java/org/apache/orc/impl/IOUtils.java
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.impl;
+
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Stream-skipping helpers copied from the Apache Commons IO project so that ORC
+ * no longer needs the commons-io dependency that comes from old Hadoop.
+ */
+public final class IOUtils {
+
+ public static final int DEFAULT_BUFFER_SIZE = 8192;
+
+ /**
+ * Returns a new byte array of size {@link #DEFAULT_BUFFER_SIZE}.
+ *
+ * @return a new byte array of size {@link #DEFAULT_BUFFER_SIZE}.
+ * @since 2.9.0 (presumably the commons-io version this was copied from, not an ORC release)
+ */
+ public static byte[] byteArray() {
+ return byteArray(DEFAULT_BUFFER_SIZE);
+ }
+
+ /**
+ * Returns a new byte array of the given size.
+ *
+ * TODO Consider guarding or warning against large allocations...
+ *
+ * @param size array size; no validation is done here, the value is passed straight to the allocation.
+ * @return a new byte array of the given size.
+ * @since 2.9.0 (presumably the commons-io version this was copied from, not an ORC release)
+ */
+ public static byte[] byteArray(final int size) {
+ return new byte[size];
+ }
+
+ /**
+ * Per-thread scratch buffer of {@link #DEFAULT_BUFFER_SIZE} bytes; skipped data is read into it and ignored.
+ */
+ private static final ThreadLocal<byte[]> SKIP_BYTE_BUFFER =
+ ThreadLocal.withInitial(IOUtils::byteArray);
+
+ /**
+ * Gets the calling thread's scratch buffer used by {@link #skip(InputStream, long)}.
+ *
+ * @return the thread local byte array.
+ */
+ static byte[] getByteArray() {
+ return SKIP_BYTE_BUFFER.get();
+ }
+
+ /**
+ * Skips the requested number of bytes, or fails if there are not enough left.
+ * <p>
+ * This allows for the possibility that {@link InputStream#skip(long)} may
+ * not skip as many bytes as requested (most likely because of reaching EOF).
+ * <p>
+ * Note that the implementation uses {@link #skip(InputStream, long)}.
+ * This means that the method may be considerably less efficient than using the actual skip implementation,
+ * this is done to guarantee that the correct number of bytes are skipped.
+ * </p>
+ *
+ * @param input stream to skip
+ * @param toSkip the number of bytes to skip
+ * @throws IOException if there is a problem reading the stream
+ * @throws IllegalArgumentException if toSkip is negative
+ * @throws EOFException if fewer than toSkip bytes could be skipped before the stream ended
+ * @see InputStream#skip(long)
+ * @since 2.0 (presumably the commons-io version this was copied from, not an ORC release)
+ */
+ public static void skipFully(final InputStream input, final long toSkip)
+ throws IOException {
+ if (toSkip < 0) {
+ throw new IllegalArgumentException("Bytes to skip must not be negative: " + toSkip);
+ }
+ final long skipped = skip(input, toSkip);
+ if (skipped != toSkip) {
+ throw new EOFException("Bytes to skip: " + toSkip + " actual: " + skipped);
+ }
+ }
+
+ /**
+ * Skips bytes from an input byte stream.
+ * This implementation guarantees that it will read as many bytes
+ * as possible before giving up; this may not always be the case for
+ * skip() implementations in subclasses of {@link InputStream}.
+ * <p>
+ * Note that the implementation uses {@link InputStream#read(byte[], int, int)} rather
+ * than delegating to {@link InputStream#skip(long)}.
+ * This means that the method may be considerably less efficient than using the actual skip implementation,
+ * this is done to guarantee that the correct number of bytes are skipped.
+ * </p>
+ *
+ * @param input byte stream to skip
+ * @param toSkip number of bytes to skip.
+ * @return number of bytes actually skipped; smaller than toSkip only when EOF was reached first.
+ * @throws IOException if there is a problem reading the stream
+ * @throws IllegalArgumentException if toSkip is negative
+ * @see <a href="https://issues.apache.org/jira/browse/IO-203">IO-203 - Add skipFully() method for InputStreams</a>
+ * @since 2.0 (presumably the commons-io version this was copied from, not an ORC release)
+ */
+ public static long skip(final InputStream input, final long toSkip) throws IOException {
+ if (toSkip < 0) {
+ throw new IllegalArgumentException("Skip count must be non-negative, actual: " + toSkip);
+ }
+ /*
+ * N.B. no need to synchronize access to SKIP_BYTE_BUFFER: - it is a ThreadLocal, so every thread reads into
+ * its own buffer - the data read into the buffer is ignored - the buffer always has the same size
+ * (DEFAULT_BUFFER_SIZE), so each read below requests at most that many bytes, and the (int) cast of the
+ * Math.min(...) result cannot overflow
+ */
+ long remain = toSkip;
+ while (remain > 0) {
+ // See https://issues.apache.org/jira/browse/IO-203 for why we use read() rather than delegating to skip()
+ final byte[] byteArray = getByteArray();
+ final long n = input.read(byteArray, 0, (int) Math.min(remain, byteArray.length));
+ if (n < 0) { // EOF: stop early, the caller learns about it from the returned count
+ break;
+ }
+ remain -= n;
+ }
+ return toSkip - remain;
+ }
+}
diff --git a/java/core/src/java/org/apache/orc/impl/RunLengthByteReader.java b/java/core/src/java/org/apache/orc/impl/RunLengthByteReader.java
index 9f4c236..805aaae 100644
--- a/java/core/src/java/org/apache/orc/impl/RunLengthByteReader.java
+++ b/java/core/src/java/org/apache/orc/impl/RunLengthByteReader.java
@@ -20,7 +20,6 @@ package org.apache.orc.impl;
import java.io.EOFException;
import java.io.IOException;
-import org.apache.commons.io.IOUtils;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
/**
diff --git a/java/core/src/java/org/apache/orc/impl/SerializationUtils.java b/java/core/src/java/org/apache/orc/impl/SerializationUtils.java
index fbf6dc0..f7024d0 100644
--- a/java/core/src/java/org/apache/orc/impl/SerializationUtils.java
+++ b/java/core/src/java/org/apache/orc/impl/SerializationUtils.java
@@ -18,7 +18,6 @@
package org.apache.orc.impl;
-import org.apache.commons.io.IOUtils;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.orc.CompressionCodec;
import org.apache.orc.OrcFile;