You are viewing a plain text version of this content. The canonical link for it is here.
Posted to mapreduce-commits@hadoop.apache.org by ka...@apache.org on 2014/06/06 20:39:29 UTC
svn commit: r1600979 - in
/hadoop/common/branches/branch-2/hadoop-mapreduce-project: ./
hadoop-mapreduce-client/hadoop-mapreduce-client-core/
hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/
hadoop-mapreduce-...
Author: kasha
Date: Fri Jun 6 18:39:28 2014
New Revision: 1600979
URL: http://svn.apache.org/r1600979
Log:
MAPREDUCE-5777. Support utf-8 text with Byte Order Marker. (Zhihai Xu via kasha)
Added:
hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/resources/testBOM.txt
- copied unchanged from r1600977, hadoop/common/trunk/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/resources/testBOM.txt
Modified:
hadoop/common/branches/branch-2/hadoop-mapreduce-project/CHANGES.txt
hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/pom.xml
hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/LineRecordReader.java
hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/LineRecordReader.java
hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java
hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/input/TestLineRecordReader.java
Modified: hadoop/common/branches/branch-2/hadoop-mapreduce-project/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2/hadoop-mapreduce-project/CHANGES.txt?rev=1600979&r1=1600978&r2=1600979&view=diff
==============================================================================
--- hadoop/common/branches/branch-2/hadoop-mapreduce-project/CHANGES.txt (original)
+++ hadoop/common/branches/branch-2/hadoop-mapreduce-project/CHANGES.txt Fri Jun 6 18:39:28 2014
@@ -105,6 +105,9 @@ Release 2.5.0 - UNRELEASED
MAPREDUCE-5895. Close streams properly to avoid leakage in TaskLog.
(Kousuke Saruta via devaraj)
+ MAPREDUCE-5777. Support utf-8 text with Byte Order Marker.
+ (Zhihai Xu via kasha)
+
Release 2.4.1 - UNRELEASED
INCOMPATIBLE CHANGES
Modified: hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/pom.xml
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/pom.xml?rev=1600979&r1=1600978&r2=1600979&view=diff
==============================================================================
--- hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/pom.xml (original)
+++ hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/pom.xml Fri Jun 6 18:39:28 2014
@@ -91,6 +91,7 @@
<configuration>
<excludes>
<exclude>src/test/resources/recordSpanningMultipleSplits.txt</exclude>
+ <exclude>src/test/resources/testBOM.txt</exclude>
</excludes>
</configuration>
</plugin>
Modified: hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/LineRecordReader.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/LineRecordReader.java?rev=1600979&r1=1600978&r2=1600979&view=diff
==============================================================================
--- hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/LineRecordReader.java (original)
+++ hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/LineRecordReader.java Fri Jun 6 18:39:28 2014
@@ -197,6 +197,39 @@ public class LineRecordReader implements
return retVal;
}
+ private int skipUtfByteOrderMark(Text value) throws IOException {
+ // Strip BOM(Byte Order Mark)
+ // Text only supports UTF-8, so we only need to check for the UTF-8 BOM
+ // (0xEF,0xBB,0xBF) at the start of the text stream.
+ int newMaxLineLength = (int) Math.min(3L + (long) maxLineLength,
+ Integer.MAX_VALUE);
+ int newSize = in.readLine(value, newMaxLineLength, maxBytesToConsume(pos));
+ // Even though we read 3 extra bytes for the first line,
+ // we won't alter existing behavior (no backwards-incompatibility issue).
+ // Because the newSize is less than maxLineLength and
+ // the number of bytes copied to Text is always no more than newSize.
+ // If the return size from readLine is not less than maxLineLength,
+ // we will discard the current line and read the next line.
+ pos += newSize;
+ int textLength = value.getLength();
+ byte[] textBytes = value.getBytes();
+ if ((textLength >= 3) && (textBytes[0] == (byte)0xEF) &&
+ (textBytes[1] == (byte)0xBB) && (textBytes[2] == (byte)0xBF)) {
+ // found UTF-8 BOM; strip it.
+ LOG.info("Found UTF-8 BOM and skipped it");
+ textLength -= 3;
+ newSize -= 3;
+ if (textLength > 0) {
+ // It may work to use the same buffer and not do the copyBytes
+ textBytes = value.copyBytes();
+ value.set(textBytes, 3, textLength);
+ } else {
+ value.clear();
+ }
+ }
+ return newSize;
+ }
+
/** Read a line. */
public synchronized boolean next(LongWritable key, Text value)
throws IOException {
@@ -206,11 +239,17 @@ public class LineRecordReader implements
while (getFilePosition() <= end || in.needAdditionalRecordAfterSplit()) {
key.set(pos);
- int newSize = in.readLine(value, maxLineLength, maxBytesToConsume(pos));
+ int newSize = 0;
+ if (pos == 0) {
+ newSize = skipUtfByteOrderMark(value);
+ } else {
+ newSize = in.readLine(value, maxLineLength, maxBytesToConsume(pos));
+ pos += newSize;
+ }
+
if (newSize == 0) {
return false;
}
- pos += newSize;
if (newSize < maxLineLength) {
return true;
}
Modified: hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/LineRecordReader.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/LineRecordReader.java?rev=1600979&r1=1600978&r2=1600979&view=diff
==============================================================================
--- hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/LineRecordReader.java (original)
+++ hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/LineRecordReader.java Fri Jun 6 18:39:28 2014
@@ -134,6 +134,39 @@ public class LineRecordReader extends Re
return retVal;
}
+ private int skipUtfByteOrderMark() throws IOException {
+ // Strip BOM(Byte Order Mark)
+ // Text only supports UTF-8, so we only need to check for the UTF-8 BOM
+ // (0xEF,0xBB,0xBF) at the start of the text stream.
+ int newMaxLineLength = (int) Math.min(3L + (long) maxLineLength,
+ Integer.MAX_VALUE);
+ int newSize = in.readLine(value, newMaxLineLength, maxBytesToConsume(pos));
+ // Even though we read 3 extra bytes for the first line,
+ // we won't alter existing behavior (no backwards-incompatibility issue).
+ // Because the newSize is less than maxLineLength and
+ // the number of bytes copied to Text is always no more than newSize.
+ // If the return size from readLine is not less than maxLineLength,
+ // we will discard the current line and read the next line.
+ pos += newSize;
+ int textLength = value.getLength();
+ byte[] textBytes = value.getBytes();
+ if ((textLength >= 3) && (textBytes[0] == (byte)0xEF) &&
+ (textBytes[1] == (byte)0xBB) && (textBytes[2] == (byte)0xBF)) {
+ // found UTF-8 BOM; strip it.
+ LOG.info("Found UTF-8 BOM and skipped it");
+ textLength -= 3;
+ newSize -= 3;
+ if (textLength > 0) {
+ // It may work to use the same buffer and not do the copyBytes
+ textBytes = value.copyBytes();
+ value.set(textBytes, 3, textLength);
+ } else {
+ value.clear();
+ }
+ }
+ return newSize;
+ }
+
public boolean nextKeyValue() throws IOException {
if (key == null) {
key = new LongWritable();
@@ -146,9 +179,14 @@ public class LineRecordReader extends Re
// We always read one extra line, which lies outside the upper
// split limit i.e. (end - 1)
while (getFilePosition() <= end || in.needAdditionalRecordAfterSplit()) {
- newSize = in.readLine(value, maxLineLength, maxBytesToConsume(pos));
- pos += newSize;
- if (newSize < maxLineLength) {
+ if (pos == 0) {
+ newSize = skipUtfByteOrderMark();
+ } else {
+ newSize = in.readLine(value, maxLineLength, maxBytesToConsume(pos));
+ pos += newSize;
+ }
+
+ if ((newSize == 0) || (newSize < maxLineLength)) {
break;
}
Modified: hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java?rev=1600979&r1=1600978&r2=1600979&view=diff
==============================================================================
--- hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java (original)
+++ hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java Fri Jun 6 18:39:28 2014
@@ -188,4 +188,41 @@ public class TestLineRecordReader {
checkRecordSpanningMultipleSplits("recordSpanningMultipleSplits.txt.bz2",
200 * 1000, true);
}
+
+ @Test
+ public void testStripBOM() throws IOException {
+ // the test data contains a BOM at the start of the file
+ // confirm the BOM is skipped by LineRecordReader
+ String UTF8_BOM = "\uFEFF";
+ URL testFileUrl = getClass().getClassLoader().getResource("testBOM.txt");
+ assertNotNull("Cannot find testBOM.txt", testFileUrl);
+ File testFile = new File(testFileUrl.getFile());
+ Path testFilePath = new Path(testFile.getAbsolutePath());
+ long testFileSize = testFile.length();
+ Configuration conf = new Configuration();
+ conf.setInt(org.apache.hadoop.mapreduce.lib.input.
+ LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE);
+
+ // read the data and check whether BOM is skipped
+ FileSplit split = new FileSplit(testFilePath, 0, testFileSize,
+ (String[])null);
+ LineRecordReader reader = new LineRecordReader(conf, split);
+ LongWritable key = new LongWritable();
+ Text value = new Text();
+ int numRecords = 0;
+ boolean firstLine = true;
+ boolean skipBOM = true;
+ while (reader.next(key, value)) {
+ if (firstLine) {
+ firstLine = false;
+ if (value.toString().startsWith(UTF8_BOM)) {
+ skipBOM = false;
+ }
+ }
+ ++numRecords;
+ }
+ reader.close();
+
+ assertTrue("BOM is not skipped", skipBOM);
+ }
}
Modified: hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/input/TestLineRecordReader.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/input/TestLineRecordReader.java?rev=1600979&r1=1600978&r2=1600979&view=diff
==============================================================================
--- hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/input/TestLineRecordReader.java (original)
+++ hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/input/TestLineRecordReader.java Fri Jun 6 18:39:28 2014
@@ -193,4 +193,42 @@ public class TestLineRecordReader {
200 * 1000,
true);
}
+
+ @Test
+ public void testStripBOM() throws IOException {
+ // the test data contains a BOM at the start of the file
+ // confirm the BOM is skipped by LineRecordReader
+ String UTF8_BOM = "\uFEFF";
+ URL testFileUrl = getClass().getClassLoader().getResource("testBOM.txt");
+ assertNotNull("Cannot find testBOM.txt", testFileUrl);
+ File testFile = new File(testFileUrl.getFile());
+ Path testFilePath = new Path(testFile.getAbsolutePath());
+ long testFileSize = testFile.length();
+ Configuration conf = new Configuration();
+ conf.setInt(org.apache.hadoop.mapreduce.lib.input.
+ LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE);
+
+ TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
+
+ // read the data and check whether BOM is skipped
+ FileSplit split = new FileSplit(testFilePath, 0, testFileSize,
+ (String[])null);
+ LineRecordReader reader = new LineRecordReader();
+ reader.initialize(split, context);
+ int numRecords = 0;
+ boolean firstLine = true;
+ boolean skipBOM = true;
+ while (reader.nextKeyValue()) {
+ if (firstLine) {
+ firstLine = false;
+ if (reader.getCurrentValue().toString().startsWith(UTF8_BOM)) {
+ skipBOM = false;
+ }
+ }
+ ++numRecords;
+ }
+ reader.close();
+
+ assertTrue("BOM is not skipped", skipBOM);
+ }
}