You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by yc...@apache.org on 2021/09/13 18:12:00 UTC
[hive] branch master updated: HIVE-24936 - Fix file name parsing
and copy file move (#2628) (Harish Jaiprakash, reviewed by Kishen Das)
This is an automated email from the ASF dual-hosted git repository.
ychena pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new aefbf40 HIVE-24936 - Fix file name parsing and copy file move (#2628) (Harish Jaiprakash, reviewed by Kishen Das)
aefbf40 is described below
commit aefbf40f0b1b2e07ede073418200f8940f91dba7
Author: Harish Jaiprakash <ha...@users.noreply.github.com>
AuthorDate: Mon Sep 13 23:41:45 2021 +0530
HIVE-24936 - Fix file name parsing and copy file move (#2628) (Harish Jaiprakash, reviewed by Kishen Das)
* HIVE-24936 - Fix file name parsing and copy file move.
* HIVE-25130: handle spark inserted files during alter table concat.
---
.../hive/ql/exec/AbstractFileMergeOperator.java | 36 ++---
.../hadoop/hive/ql/exec/ParsedOutputFileName.java | 156 ++++++++++++++++++
.../org/apache/hadoop/hive/ql/exec/Utilities.java | 168 ++++++++++---------
.../hive/ql/exec/ParsedOutputFileNameTest.java | 177 +++++++++++++++++++++
.../apache/hadoop/hive/ql/exec/TestUtilities.java | 45 ++++++
5 files changed, 473 insertions(+), 109 deletions(-)
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractFileMergeOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractFileMergeOperator.java
index 86cb983..264573b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractFileMergeOperator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractFileMergeOperator.java
@@ -275,39 +275,33 @@ public abstract class AbstractFileMergeOperator<T extends FileMergeDesc>
throw new HiveException("Incompatible files should not happen in MM tables.");
}
Path destDir = finalPath.getParent();
- Path destPath = destDir;
// move any incompatible files to final path
if (incompatFileSet != null && !incompatFileSet.isEmpty()) {
for (Path incompatFile : incompatFileSet) {
- // check if path conforms to Hive's file name convention. Hive expects filenames to be in specific format
+ // Hive expects filenames to be in specific format
// like 000000_0, but "LOAD DATA" commands can let you add any files to any partitions/tables without
- // renaming. This can cause MoveTask to remove files in some cases where MoveTask assumes the files are
- // are generated by speculatively executed tasks.
+ // renaming.
+ // This can cause a few issues:
+ // MoveTask will remove files in some cases where MoveTask assumes the files are are generated by
+ // speculatively executed tasks.
// Example: MoveTask thinks the following files are same
// part-m-00000_1417075294718
// part-m-00001_1417075294718
// Assumes 1417075294718 as taskId and retains only large file supposedly generated by speculative execution.
- // This can result in data loss in case of CONCATENATE/merging. Filter out files that does not match Hive's
- // filename convention.
- if (!Utilities.isHiveManagedFile(incompatFile)) {
- // rename un-managed files to conform to Hive's naming standard
- // Example:
- // /warehouse/table/part-m-00000_1417075294718 will get renamed to /warehouse/table/.hive-staging/000000_0
- // If staging directory already contains the file, taskId_copy_N naming will be used.
- final String taskId = Utilities.getTaskId(jc);
- Path destFilePath = new Path(destDir, new Path(taskId));
- for (int counter = 1; fs.exists(destFilePath); counter++) {
- destFilePath = new Path(destDir, taskId + (Utilities.COPY_KEYWORD + counter));
- }
- LOG.warn("Path doesn't conform to Hive's expectation. Renaming {} to {}", incompatFile, destFilePath);
- destPath = destFilePath;
- }
+ // This can result in data loss in case of CONCATENATE/merging.
+ // If filename is consistent with XXXXXX_N and another task with same task-id runs after this move, then
+ // the same file name is used in the other task which will result in task failure and retry of task and
+ // subsequent removal of this file as duplicate.
+ // Example: if the file name is 000001_0 and another task runs with taskid 000001_0, it will fail to create
+ // the file and next attempt will create 000001_1, both the files will be considered as output of same task
+ // and only 000001_1 will be picked resulting it loss of existing file 000001_0.
+ final String destFileName = Utilities.getTaskId(jc) + Utilities.COPY_KEYWORD + 1;
try {
- Utilities.renameOrMoveFiles(fs, incompatFile, destPath);
+ Path destPath = Utilities.moveFile(fs, incompatFile, destDir, destFileName);
LOG.info("Moved incompatible file " + incompatFile + " to " + destPath);
} catch (HiveException e) {
- LOG.error("Unable to move " + incompatFile + " to " + destPath);
+ LOG.error("Unable to move " + incompatFile + " to " + destDir + ", " + destFileName);
throw new IOException(e);
}
}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/ParsedOutputFileName.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/ParsedOutputFileName.java
new file mode 100644
index 0000000..4fc827e
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/ParsedOutputFileName.java
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec;
+
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+
+/**
+ * Helper class to match hive filenames and extract taskId, taskAttemptId, copyIndex.
+ *
+ * Matches following:
+ * 00001_02
+ * 00001_02.gz
+ * 00001_02.zlib.gz
+ * 00001_02_copy_1
+ * 00001_02_copy_1.gz
+ * <p>
+ * All the components are here:
+ * tmp_(taskPrefix)00001_02_copy_1.zlib.gz
+ *
+ * Spark output file:
+ * part-00026-23003837-facb-49ec-b1c4-eeda902cacf3.c000.zlib.orc
+ */
+public class ParsedOutputFileName {
+ private static final Pattern COPY_FILE_NAME_TO_TASK_ID_REGEX = Pattern.compile(
+ "^(.*?)?" + // any prefix
+ "(\\(.*\\))?" + // taskId prefix
+ "([0-9]+)" + // taskId
+ "(?:_([0-9]{1,6}))?" + // _<attemptId> (limited to 6 digits)
+ "(?:_copy_([0-9]{1,6}))?" + // copy file index
+ "(\\..*)?$"); // any suffix/file extension
+
+ private static final Pattern SPARK_FILE_NAME =
+ Pattern.compile("^part-(\\d+)-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\\.[a-f0-9]+(.*)$");
+
+ public static ParsedOutputFileName parse(String fileName) {
+ return new ParsedOutputFileName(fileName);
+ }
+
+ private final boolean matches;
+ private final String taskIdPrefix;
+ private final String taskId;
+ private final String attemptId;
+ private final String copyIndex;
+ private final String suffix;
+ private final CharSequence filePrefixForCopy;
+
+ private ParsedOutputFileName(CharSequence fileName) {
+ Matcher m = SPARK_FILE_NAME.matcher(fileName);
+ if (m.matches()) {
+ matches = true;
+ taskIdPrefix = null;
+ taskId = m.group(1);
+ attemptId = "1";
+ copyIndex = null;
+ String s = m.group(2);
+ suffix = (s == null || s.isEmpty() ? null : s);
+ filePrefixForCopy = null;
+ } else {
+ m = COPY_FILE_NAME_TO_TASK_ID_REGEX.matcher(fileName);
+ matches = m.matches();
+ if (matches) {
+ taskIdPrefix = m.group(2);
+ taskId = m.group(3);
+ attemptId = m.group(4);
+ copyIndex = m.group(5);
+ suffix = m.group(6);
+ filePrefixForCopy = m.end(4) >= 0 ? fileName.subSequence(0, m.end(4)) : null;
+ } else {
+ taskIdPrefix = null;
+ taskId = null;
+ attemptId = null;
+ copyIndex = null;
+ suffix = null;
+ filePrefixForCopy = null;
+ }
+ }
+ }
+
+ public boolean matches() {
+ return matches;
+ }
+
+ public String getTaskIdPrefix() {
+ return taskIdPrefix;
+ }
+
+ public String getTaskId() {
+ return taskId;
+ }
+
+ public String getPrefixedTaskId() {
+ String prefix = getTaskIdPrefix();
+ String taskId = getTaskId();
+ if (prefix != null && taskId != null) {
+ return prefix + taskId;
+ } else {
+ return taskId;
+ }
+ }
+
+ public String getAttemptId() {
+ return attemptId;
+ }
+
+ public boolean isCopyFile() {
+ return copyIndex != null;
+ }
+
+ public String getCopyIndex() {
+ return copyIndex;
+ }
+
+ public String getSuffix() {
+ return suffix;
+ }
+
+ /**
+ * Create a copy file using the same file name as this and the given index. It will keep the prefixes but drop any
+ * suffixes.
+ * Ex: 00001_02 will be converted to 00001_02_copy_3 for idx = 3.
+ * tmp_(prefix)00001_02_copy_1.snappy.orc will be converted to tmp_(prefix)00001_02_copy_3 for idx = 3
+ * @param idx The index required.
+ * @return
+ */
+ public String makeFilenameWithCopyIndex(int idx) throws HiveException {
+ if (filePrefixForCopy == null) {
+ throw new HiveException("Not expected to make copy files of spark output files.");
+ }
+ return filePrefixForCopy + "_copy_" + idx;
+ }
+
+ public String toString() {
+ return "[taskId: " + getPrefixedTaskId() + ", taskAttemptId: " + getAttemptId() +
+ ", copyIndex: " + getCopyIndex() + "]";
+ }
+}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
index b6e8ed5..01f0967 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
@@ -1098,7 +1098,7 @@ public final class Utilities {
}
}
- private static void moveFile(FileSystem fs, FileStatus file, Path dst) throws IOException,
+ private static void moveFileOrDir(FileSystem fs, FileStatus file, Path dst) throws IOException,
HiveException {
Path srcFilePath = file.getPath();
String fileName = srcFilePath.getName();
@@ -1106,32 +1106,61 @@ public final class Utilities {
if (file.isDir()) {
renameOrMoveFiles(fs, srcFilePath, dstFilePath);
} else {
- if (fs.exists(dstFilePath)) {
- int suffix = 0;
- do {
- suffix++;
- dstFilePath = new Path(dst, fileName + "_" + suffix);
- } while (fs.exists(dstFilePath));
- }
-
- if (!fs.rename(srcFilePath, dstFilePath)) {
- throw new HiveException("Unable to move: " + srcFilePath + " to: " + dst);
- }
+ moveFile(fs, srcFilePath, dst, fileName);
}
}
/**
* Rename src to dst, or in the case dst already exists, move files in src to dst. If there is an
- * existing file with the same name, the new file's name will be appended with "_1", "_2", etc.
- *
+ * existing file with the same name, the new file's name will be generated based on the file name.
+ * If the file name confirms to hive managed file NNNNNN_Y(_copy_YY) then it will create NNNNN_Y_copy_XX
+ * else it will append _1, _2, ....
* @param fs
* the FileSystem where src and dst are on.
- * @param src
- * the src directory
- * @param dst
+ * @param srcFile
+ * the src file
+ * @param destDir
* the target directory
- * @throws IOException
+ * @param destFileName
+ * the target filename
+ * @return The final path the file was moved to.
+ * @throws IOException, HiveException
*/
+ public static Path moveFile(FileSystem fs, Path srcFile, Path destDir, String destFileName)
+ throws IOException, HiveException {
+ Path dstFilePath = new Path(destDir, destFileName);
+ if (fs.exists(dstFilePath)) {
+ ParsedOutputFileName parsedFileName = ParsedOutputFileName.parse(destFileName);
+ int suffix = 0;
+ do {
+ suffix++;
+ if (parsedFileName.matches()) {
+ dstFilePath = new Path(destDir, parsedFileName.makeFilenameWithCopyIndex(suffix));
+ } else {
+ dstFilePath = new Path(destDir, destFileName + "_" + suffix);
+ }
+ } while (fs.exists(dstFilePath));
+ }
+ if (!fs.rename(srcFile, dstFilePath)) {
+ throw new HiveException("Unable to move: " + srcFile + " to: " + dstFilePath);
+ }
+ return dstFilePath;
+ }
+
+ /**
+ * Rename src to dst, or in the case dst already exists, move files in src to dst. If there is an
+ * existing file with the same name, the new file's name will be generated based on the file name.
+ * If the file name confirms to hive managed file NNNNNN_Y(_copy_YY) then it will create NNNNN_Y_copy_XX
+ * else it will append _1, _2, ....
+ *
+ * @param fs
+ * the FileSystem where src and dst are on.
+ * @param src
+ * the src directory
+ * @param dst
+ * the target directory
+ * @throws IOException
+ */
public static void renameOrMoveFiles(FileSystem fs, Path src, Path dst) throws IOException,
HiveException {
if (!fs.exists(dst)) {
@@ -1142,7 +1171,7 @@ public final class Utilities {
// move file by file
FileStatus[] files = fs.listStatus(src);
for (FileStatus file : files) {
- Utilities.moveFile(fs, file, dst);
+ Utilities.moveFileOrDir(fs, file, dst);
}
}
}
@@ -1180,7 +1209,7 @@ public final class Utilities {
@Override
public Void call() throws HiveException {
try {
- Utilities.moveFile(fs, file, dst);
+ Utilities.moveFileOrDir(fs, file, dst);
} catch (Exception e) {
throw new HiveException(e);
}
@@ -1194,39 +1223,7 @@ public final class Utilities {
}
}
- /**
- * The first group will contain the task id. The second group is the optional extension. The file
- * name looks like: "0_0" or "0_0.gz". There may be a leading prefix (tmp_). Since getTaskId() can
- * return an integer only - this should match a pure integer as well. {1,6} is used to limit
- * matching for attempts #'s 0-999999.
- */
- private static final Pattern FILE_NAME_TO_TASK_ID_REGEX =
- Pattern.compile("^.*?([0-9]+)(_[0-9]{1,6})?(\\..*)?$");
-
- /**
- * Some jobs like "INSERT INTO" jobs create copies of files like 0000001_0_copy_2.
- * For such files,
- * Group 1: 00000001 [taskId]
- * Group 3: 0 [task attempId]
- * Group 4: _copy_2 [copy suffix]
- * Group 6: copy [copy keyword]
- * Group 8: 2 [copy file index]
- */
public static final String COPY_KEYWORD = "_copy_"; // copy keyword
- private static final Pattern COPY_FILE_NAME_TO_TASK_ID_REGEX =
- Pattern.compile("^.*?"+ // any prefix
- "([0-9]+)"+ // taskId
- "(_)"+ // separator
- "([0-9]{1,6})?"+ // attemptId (limited to 6 digits)
- "((_)(\\Bcopy\\B)(_)" +
- "([0-9]{1,6})$)?"+ // copy file index
- "(\\..*)?$"); // any suffix/file extension
-
- /**
- * This retruns prefix part + taskID for bucket join for partitioned table
- */
- private static final Pattern FILE_NAME_PREFIXED_TASK_ID_REGEX =
- Pattern.compile("^.*?((\\(.*\\))?[0-9]+)(_[0-9]{1,6})?(\\..*)?$");
/**
* This breaks a prefixed bucket number into the prefix and the taskID
@@ -1247,7 +1244,7 @@ public final class Utilities {
* filename to extract taskid from
*/
public static String getTaskIdFromFilename(String filename) {
- return getTaskIdFromFilename(filename, FILE_NAME_TO_TASK_ID_REGEX);
+ return getIdFromFilename(filename, false, false);
}
/**
@@ -1258,33 +1255,38 @@ public final class Utilities {
* filename to extract taskid from
*/
private static String getPrefixedTaskIdFromFilename(String filename) {
- return getTaskIdFromFilename(filename, FILE_NAME_PREFIXED_TASK_ID_REGEX);
- }
-
- private static String getTaskIdFromFilename(String filename, Pattern pattern) {
- return getIdFromFilename(filename, pattern, 1);
+ return getIdFromFilename(filename, true, false);
}
private static int getAttemptIdFromFilename(String filename) {
- String attemptStr = getIdFromFilename(filename, FILE_NAME_PREFIXED_TASK_ID_REGEX, 3);
- return Integer.parseInt(attemptStr.substring(1));
+ return Integer.parseInt(getIdFromFilename(filename, true, true));
}
- private static String getIdFromFilename(String filename, Pattern pattern, int group) {
- String taskId = filename;
- int dirEnd = filename.lastIndexOf(Path.SEPARATOR);
+ private static String getIdFromFilename(String filepath, boolean isPrefixed, boolean isTaskAttempt) {
+ String filename = filepath;
+ int dirEnd = filepath.lastIndexOf(Path.SEPARATOR);
if (dirEnd != -1) {
- taskId = filename.substring(dirEnd + 1);
+ filename = filepath.substring(dirEnd + 1);
}
- Matcher m = pattern.matcher(taskId);
- if (!m.matches()) {
+ ParsedOutputFileName parsedOutputFileName = ParsedOutputFileName.parse(filename);
+ String taskId;
+ if (parsedOutputFileName.matches()) {
+ if (isTaskAttempt) {
+ taskId = parsedOutputFileName.getAttemptId();
+ } else {
+ taskId = isPrefixed ? parsedOutputFileName.getPrefixedTaskId() : parsedOutputFileName.getTaskId();
+ }
+ } else {
+ taskId = filename;
LOG.warn("Unable to get task id from file name: {}. Using last component {}"
- + " as task id.", filename, taskId);
+ + " as task id.", filepath, taskId);
+ }
+ if (isTaskAttempt) {
+ LOG.debug("TaskAttemptId for {} = {}", filepath, taskId);
} else {
- taskId = m.group(group);
+ LOG.debug("TaskId for {} = {}", filepath, taskId);
}
- LOG.debug("TaskId for {} = {}", filename, taskId);
return taskId;
}
@@ -1570,7 +1572,7 @@ public final class Utilities {
public Void call() throws HiveException {
try {
LOG.debug("Moving from {} to {} ", fileStatus.getPath(), dst);
- Utilities.moveFile(fs, fileStatus, dst);
+ Utilities.moveFileOrDir(fs, fileStatus, dst);
} catch (Exception e) {
throw new HiveException(e);
}
@@ -2029,27 +2031,17 @@ public final class Utilities {
return toRetain;
}
- public static boolean isCopyFile(String filename) {
- String taskId = filename;
- String copyFileSuffix = null;
- int dirEnd = filename.lastIndexOf(Path.SEPARATOR);
+ public static boolean isCopyFile(String filepath) {
+ String filename = filepath;
+ int dirEnd = filepath.lastIndexOf(Path.SEPARATOR);
if (dirEnd != -1) {
- taskId = filename.substring(dirEnd + 1);
- }
- Matcher m = COPY_FILE_NAME_TO_TASK_ID_REGEX.matcher(taskId);
- if (!m.matches()) {
- LOG.warn("Unable to verify if file name {} has _copy_ suffix.", filename);
- } else {
- taskId = m.group(1);
- copyFileSuffix = m.group(4);
+ filename = filepath.substring(dirEnd + 1);
}
-
- LOG.debug("Filename: {} TaskId: {} CopySuffix: {}", filename, taskId, copyFileSuffix);
- if (taskId != null && copyFileSuffix != null) {
- return true;
+ ParsedOutputFileName parsedFileName = ParsedOutputFileName.parse(filename);
+ if (!parsedFileName.matches()) {
+ LOG.warn("Unable to verify if file name {} has _copy_ suffix.", filepath);
}
-
- return false;
+ return parsedFileName.isCopyFile();
}
public static String getBucketFileNameFromPathSubString(String bucketName) {
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/ParsedOutputFileNameTest.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/ParsedOutputFileNameTest.java
new file mode 100644
index 0000000..50c3615
--- /dev/null
+++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/ParsedOutputFileNameTest.java
@@ -0,0 +1,177 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec;
+
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class ParsedOutputFileNameTest {
+ @Test
+ public void testStandardNoAttemptId() {
+ ParsedOutputFileName p = ParsedOutputFileName.parse("00001");
+ Assert.assertTrue(p.matches());
+ Assert.assertNull(p.getTaskIdPrefix());
+ Assert.assertEquals("00001", p.getTaskId());
+ Assert.assertEquals("00001", p.getPrefixedTaskId());
+ Assert.assertNull(p.getAttemptId());
+ Assert.assertNull(p.getCopyIndex());
+ Assert.assertFalse(p.isCopyFile());
+ Assert.assertNull(p.getSuffix());
+ }
+
+ @Test
+ public void testStandard() throws Exception {
+ ParsedOutputFileName p = ParsedOutputFileName.parse("00001_02");
+ Assert.assertTrue(p.matches());
+ Assert.assertNull(p.getTaskIdPrefix());
+ Assert.assertEquals("00001", p.getTaskId());
+ Assert.assertEquals("00001", p.getPrefixedTaskId());
+ Assert.assertEquals("02", p.getAttemptId());
+ Assert.assertNull(p.getCopyIndex());
+ Assert.assertFalse(p.isCopyFile());
+ Assert.assertNull(p.getSuffix());
+ Assert.assertEquals("00001_02_copy_3", p.makeFilenameWithCopyIndex(3));
+ }
+
+ @Test
+ public void testStandardPrefix() throws Exception {
+ ParsedOutputFileName p = ParsedOutputFileName.parse("(prefix)00001_02");
+ Assert.assertTrue(p.matches());
+ Assert.assertEquals("(prefix)", p.getTaskIdPrefix());
+ Assert.assertEquals("00001", p.getTaskId());
+ Assert.assertEquals("(prefix)00001", p.getPrefixedTaskId());
+ Assert.assertEquals("02", p.getAttemptId());
+ Assert.assertNull(p.getCopyIndex());
+ Assert.assertFalse(p.isCopyFile());
+ Assert.assertNull(p.getSuffix());
+ Assert.assertEquals("(prefix)00001_02_copy_3", p.makeFilenameWithCopyIndex(3));
+ }
+
+ @Test
+ public void testStandardSuffix() throws Exception {
+ ParsedOutputFileName p = ParsedOutputFileName.parse("00001_02.snappy.orc");
+ Assert.assertTrue(p.matches());
+ Assert.assertNull(p.getTaskIdPrefix());
+ Assert.assertEquals("00001", p.getTaskId());
+ Assert.assertEquals("00001", p.getPrefixedTaskId());
+ Assert.assertEquals("02", p.getAttemptId());
+ Assert.assertNull(p.getCopyIndex());
+ Assert.assertFalse(p.isCopyFile());
+ Assert.assertEquals(".snappy.orc", p.getSuffix());
+ Assert.assertEquals("00001_02_copy_3", p.makeFilenameWithCopyIndex(3));
+ }
+
+ @Test
+ public void testPrefixAndSuffix() throws Exception {
+ ParsedOutputFileName p = ParsedOutputFileName.parse("tmp_(prefix)00001_02.snappy.orc");
+ Assert.assertTrue(p.matches());
+ Assert.assertEquals("(prefix)", p.getTaskIdPrefix());
+ Assert.assertEquals("00001", p.getTaskId());
+ Assert.assertEquals("(prefix)00001", p.getPrefixedTaskId());
+ Assert.assertEquals("02", p.getAttemptId());
+ Assert.assertNull(p.getCopyIndex());
+ Assert.assertFalse(p.isCopyFile());
+ Assert.assertEquals(".snappy.orc", p.getSuffix());
+ Assert.assertEquals("tmp_(prefix)00001_02_copy_3", p.makeFilenameWithCopyIndex(3));
+ }
+
+ @Test
+ public void testCopy() throws Exception {
+ ParsedOutputFileName p = ParsedOutputFileName.parse("00001_02_copy_3");
+ Assert.assertTrue(p.matches());
+ Assert.assertNull(p.getTaskIdPrefix());
+ Assert.assertEquals("00001", p.getTaskId());
+ Assert.assertEquals("00001", p.getPrefixedTaskId());
+ Assert.assertEquals("02", p.getAttemptId());
+ Assert.assertEquals("3", p.getCopyIndex());
+ Assert.assertTrue(p.isCopyFile());
+ Assert.assertNull(p.getSuffix());
+ Assert.assertEquals("00001_02_copy_4", p.makeFilenameWithCopyIndex(4));
+ }
+
+ @Test
+ public void testCopyAllParts() throws Exception {
+ ParsedOutputFileName p = ParsedOutputFileName.parse("tmp_(prefix)00001_02_copy_3.snappy.orc");
+ Assert.assertTrue(p.matches());
+ Assert.assertEquals("(prefix)", p.getTaskIdPrefix());
+ Assert.assertEquals("00001", p.getTaskId());
+ Assert.assertEquals("(prefix)00001", p.getPrefixedTaskId());
+ Assert.assertEquals("02", p.getAttemptId());
+ Assert.assertEquals("3", p.getCopyIndex());
+ Assert.assertTrue(p.isCopyFile());
+ Assert.assertEquals(".snappy.orc", p.getSuffix());
+ Assert.assertEquals("tmp_(prefix)00001_02_copy_4", p.makeFilenameWithCopyIndex(4));
+ }
+
+ @Test
+ public void testSparkFileNameSuffixed() {
+ ParsedOutputFileName p = ParsedOutputFileName.parse(
+ "part-00026-23003837-facb-49ec-b1c4-eeda902cacf3.c000.zlib.orc");
+ Assert.assertTrue(p.matches());
+ Assert.assertEquals("00026", p.getTaskId());
+ Assert.assertEquals("00026", p.getPrefixedTaskId());
+ Assert.assertNull(p.getTaskIdPrefix());
+ Assert.assertEquals("1", p.getAttemptId());
+ Assert.assertNull(p.getCopyIndex());
+ Assert.assertFalse(p.isCopyFile());
+ Assert.assertEquals(".zlib.orc", p.getSuffix());
+ try {
+ p.makeFilenameWithCopyIndex(1);
+ Assert.fail("Expected HiveException");
+ } catch(HiveException e) {
+ }
+ }
+
+ @Test
+ public void testSparkFileName() {
+ ParsedOutputFileName p = ParsedOutputFileName.parse("part-00003-c6acfdee-0c32-492e-b209-c2f1cf477770.c000");
+ Assert.assertTrue(p.matches());
+ Assert.assertEquals("00003", p.getTaskId());
+ Assert.assertEquals("00003", p.getPrefixedTaskId());
+ Assert.assertNull(p.getTaskIdPrefix());
+ Assert.assertEquals("1", p.getAttemptId());
+ Assert.assertNull(p.getCopyIndex());
+ Assert.assertFalse(p.isCopyFile());
+ Assert.assertNull(p.getSuffix());
+ try {
+ p.makeFilenameWithCopyIndex(1);
+ Assert.fail("Expected HiveException");
+ } catch(HiveException e) {
+ }
+ }
+
+ @Test
+ public void testNoMatch() {
+ ParsedOutputFileName p = ParsedOutputFileName.parse("ZfsLke");
+ Assert.assertFalse(p.matches());
+ Assert.assertNull(p.getTaskId());
+ Assert.assertNull(p.getPrefixedTaskId());
+ Assert.assertNull(p.getTaskIdPrefix());
+ Assert.assertNull(p.getAttemptId());
+ Assert.assertNull(p.getCopyIndex());
+ Assert.assertFalse(p.isCopyFile());
+ Assert.assertNull(p.getSuffix());
+ try {
+ p.makeFilenameWithCopyIndex(1);
+ Assert.fail("Expected HiveException");
+ } catch(HiveException e) {
+ }
+ }
+}
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/TestUtilities.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/TestUtilities.java
index c46e83b..2cdebe5 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/exec/TestUtilities.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/TestUtilities.java
@@ -201,6 +201,51 @@ public class TestUtilities {
assertEquals(NUM_BUCKETS, paths.size());
}
+ @Test
+ public void testRenameFilesNotExists() throws Exception {
+ FileSystem fs = mock(FileSystem.class);
+ Path src = new Path("src");
+ Path dest = new Path("dir");
+ when(fs.exists(dest)).thenReturn(false);
+ when(fs.rename(src, dest)).thenReturn(true);
+ Utilities.renameOrMoveFiles(fs, src, dest);
+ verify(fs, times(1)).rename(src, dest);
+ }
+
+ @Test
+ public void testRenameFileExistsNonHive() throws Exception {
+ FileSystem fs = mock(FileSystem.class);
+ Path src = new Path("src");
+ Path dest = new Path("dir1");
+ Path finalPath = new Path(dest, "src_2");
+ FileStatus status = new FileStatus();
+ status.setPath(src);
+ when(fs.listStatus(src)).thenReturn(new FileStatus[]{status});
+ when(fs.exists(dest)).thenReturn(true);
+ when(fs.exists(new Path(dest, "src"))).thenReturn(true);
+ when(fs.exists(new Path(dest,"src_1"))).thenReturn(true);
+ when(fs.rename(src, finalPath)).thenReturn(true);
+ Utilities.renameOrMoveFiles(fs, src, dest);
+ verify(fs, times(1)).rename(src, finalPath);
+ }
+
+ @Test
+ public void testRenameFileExistsHivePath() throws Exception {
+ FileSystem fs = mock(FileSystem.class);
+ Path src = new Path("00001_02");
+ Path dest = new Path("dir1");
+ Path finalPath = new Path(dest, "00001_02_copy_2");
+ FileStatus status = new FileStatus();
+ status.setPath(src);
+ when(fs.listStatus(src)).thenReturn(new FileStatus[]{status});
+ when(fs.exists(dest)).thenReturn(true);
+ when(fs.exists(new Path(dest, "00001_02"))).thenReturn(true);
+ when(fs.exists(new Path(dest,"00001_02_copy_1"))).thenReturn(true);
+ when(fs.rename(src, finalPath)).thenReturn(true);
+ Utilities.renameOrMoveFiles(fs, src, dest);
+ verify(fs, times(1)).rename(src, finalPath);
+ }
+
private List<Path> runRemoveTempOrDuplicateFilesTestCase(String executionEngine, boolean dPEnabled)
throws Exception {
Configuration hconf = new HiveConf(this.getClass());