You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@crunch.apache.org by mk...@apache.org on 2014/02/05 20:58:07 UTC
git commit: CRUNCH-331: Altered the default for disabling combining
files to true and enabled the combining for sources we know could benefit
from the behavior
Updated Branches:
refs/heads/master 715128b93 -> 752fed58e
CRUNCH-331: Altered the default for disabling combining
files to true and enabled the combining for sources we know could benefit
from the behavior
Project: http://git-wip-us.apache.org/repos/asf/crunch/repo
Commit: http://git-wip-us.apache.org/repos/asf/crunch/commit/752fed58
Tree: http://git-wip-us.apache.org/repos/asf/crunch/tree/752fed58
Diff: http://git-wip-us.apache.org/repos/asf/crunch/diff/752fed58
Branch: refs/heads/master
Commit: 752fed58ee1d9b8b09ae959536af06b1114edf7f
Parents: 715128b
Author: Micah Whitacre <mk...@apache.org>
Authored: Tue Feb 4 22:04:36 2014 -0600
Committer: Micah Whitacre <mk...@apache.org>
Committed: Tue Feb 4 22:04:36 2014 -0600
----------------------------------------------------------------------
.../java/org/apache/crunch/impl/mr/run/CrunchInputFormat.java | 2 +-
.../java/org/apache/crunch/impl/mr/run/RuntimeParameters.java | 5 +++++
.../main/java/org/apache/crunch/io/avro/AvroFileSource.java | 4 +++-
.../java/org/apache/crunch/io/avro/trevni/TrevniKeySource.java | 2 ++
.../src/main/java/org/apache/crunch/io/seq/SeqFileSource.java | 6 +++++-
.../main/java/org/apache/crunch/io/text/TextFileSource.java | 6 +++++-
6 files changed, 21 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/crunch/blob/752fed58/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchInputFormat.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchInputFormat.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchInputFormat.java
index 0c6f5e1..8f1c853 100644
--- a/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchInputFormat.java
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchInputFormat.java
@@ -52,7 +52,7 @@ public class CrunchInputFormat<K, V> extends InputFormat<K, V> {
Job jobCopy = new Job(conf);
InputFormat<?, ?> format = (InputFormat<?, ?>) ReflectionUtils.newInstance(inputBundle.getFormatClass(),
jobCopy.getConfiguration());
- if (format instanceof FileInputFormat && !conf.getBoolean(RuntimeParameters.DISABLE_COMBINE_FILE, false)) {
+ if (format instanceof FileInputFormat && !conf.getBoolean(RuntimeParameters.DISABLE_COMBINE_FILE, true)) {
format = new CrunchCombineFileInputFormat<Object, Object>(job);
}
for (Map.Entry<Integer, List<Path>> nodeEntry : entry.getValue().entrySet()) {
http://git-wip-us.apache.org/repos/asf/crunch/blob/752fed58/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/RuntimeParameters.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/RuntimeParameters.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/RuntimeParameters.java
index 0c9f229..07abf11 100644
--- a/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/RuntimeParameters.java
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/RuntimeParameters.java
@@ -28,6 +28,11 @@ public final class RuntimeParameters {
public static final String LOG_JOB_PROGRESS = "crunch.log.job.progress";
+ /**
+ * Runtime property which, when {@code true}, disables combining small input files of a
+ * {@link org.apache.crunch.Source} into larger splits (combining reduces per-split overhead). Unless overridden by
+ * the {@code Source} implementation it defaults to {@code true}, i.e. combining is disabled.
+ */
public static final String DISABLE_COMBINE_FILE = "crunch.disable.combine.file";
public static final String COMBINE_FILE_BLOCK_SIZE = "crunch.combine.file.block.size";
http://git-wip-us.apache.org/repos/asf/crunch/blob/752fed58/crunch-core/src/main/java/org/apache/crunch/io/avro/AvroFileSource.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/avro/AvroFileSource.java b/crunch-core/src/main/java/org/apache/crunch/io/avro/AvroFileSource.java
index 17f47d7..1b6b27b 100644
--- a/crunch-core/src/main/java/org/apache/crunch/io/avro/AvroFileSource.java
+++ b/crunch-core/src/main/java/org/apache/crunch/io/avro/AvroFileSource.java
@@ -23,6 +23,7 @@ import java.util.List;
import org.apache.avro.io.DatumReader;
import org.apache.avro.mapred.AvroJob;
import org.apache.crunch.ReadableData;
+import org.apache.crunch.impl.mr.run.RuntimeParameters;
import org.apache.crunch.io.FormatBundle;
import org.apache.crunch.io.ReadableSource;
import org.apache.crunch.io.impl.FileSourceImpl;
@@ -39,7 +40,8 @@ public class AvroFileSource<T> extends FileSourceImpl<T> implements ReadableSour
FormatBundle bundle = FormatBundle.forInput(AvroInputFormat.class)
.set(AvroJob.INPUT_IS_REFLECT, String.valueOf(ptype.hasReflect()))
.set(AvroJob.INPUT_SCHEMA, ptype.getSchema().toString())
- .set(Avros.REFLECT_DATA_FACTORY_CLASS, Avros.REFLECT_DATA_FACTORY.getClass().getName());
+ .set(Avros.REFLECT_DATA_FACTORY_CLASS, Avros.REFLECT_DATA_FACTORY.getClass().getName())
+ .set(RuntimeParameters.DISABLE_COMBINE_FILE, Boolean.FALSE.toString());
AvroMode.fromType(ptype).configure(bundle);
return bundle;
}
http://git-wip-us.apache.org/repos/asf/crunch/blob/752fed58/crunch-core/src/main/java/org/apache/crunch/io/avro/trevni/TrevniKeySource.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/avro/trevni/TrevniKeySource.java b/crunch-core/src/main/java/org/apache/crunch/io/avro/trevni/TrevniKeySource.java
index 3f387d7..fb0e8fe 100644
--- a/crunch-core/src/main/java/org/apache/crunch/io/avro/trevni/TrevniKeySource.java
+++ b/crunch-core/src/main/java/org/apache/crunch/io/avro/trevni/TrevniKeySource.java
@@ -19,6 +19,7 @@ package org.apache.crunch.io.avro.trevni;
import java.util.List;
import org.apache.avro.mapred.AvroJob;
+import org.apache.crunch.impl.mr.run.RuntimeParameters;
import org.apache.crunch.io.FormatBundle;
import org.apache.crunch.io.ReadableSource;
import org.apache.crunch.ReadableData;
@@ -37,6 +38,7 @@ public class TrevniKeySource<T> extends FileSourceImpl<T> implements ReadableSou
return FormatBundle.forInput(AvroTrevniKeyInputFormat.class)
.set(AvroJob.INPUT_IS_REFLECT, String.valueOf(ptype.hasReflect()))
.set(AvroJob.INPUT_SCHEMA, ptype.getSchema().toString())
+ .set(RuntimeParameters.DISABLE_COMBINE_FILE, Boolean.FALSE.toString())
.set(Avros.REFLECT_DATA_FACTORY_CLASS, Avros.REFLECT_DATA_FACTORY.getClass().getName());
}
http://git-wip-us.apache.org/repos/asf/crunch/blob/752fed58/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileSource.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileSource.java b/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileSource.java
index 1bf64e4..0c2a14c 100644
--- a/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileSource.java
+++ b/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileSource.java
@@ -19,7 +19,10 @@ package org.apache.crunch.io.seq;
import java.io.IOException;
+import java.util.Collections;
import java.util.List;
+
+import org.apache.crunch.impl.mr.run.RuntimeParameters;
import org.apache.crunch.io.ReadableSource;
import org.apache.crunch.ReadableData;
import org.apache.crunch.io.impl.FileSourceImpl;
@@ -31,11 +34,12 @@ import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
public class SeqFileSource<T> extends FileSourceImpl<T> implements ReadableSource<T> {
public SeqFileSource(Path path, PType<T> ptype) {
- super(path, ptype, SequenceFileInputFormat.class);
+ this(Collections.<Path>singletonList(path), ptype);
}
public SeqFileSource(List<Path> paths, PType<T> ptype) {
super(paths, ptype, SequenceFileInputFormat.class);
+ inputBundle.set(RuntimeParameters.DISABLE_COMBINE_FILE, Boolean.FALSE.toString());
}
@Override
http://git-wip-us.apache.org/repos/asf/crunch/blob/752fed58/crunch-core/src/main/java/org/apache/crunch/io/text/TextFileSource.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/text/TextFileSource.java b/crunch-core/src/main/java/org/apache/crunch/io/text/TextFileSource.java
index fe23c47..732288d 100644
--- a/crunch-core/src/main/java/org/apache/crunch/io/text/TextFileSource.java
+++ b/crunch-core/src/main/java/org/apache/crunch/io/text/TextFileSource.java
@@ -19,7 +19,10 @@ package org.apache.crunch.io.text;
import java.io.IOException;
+import java.util.Collections;
import java.util.List;
+
+import org.apache.crunch.impl.mr.run.RuntimeParameters;
import org.apache.crunch.io.ReadableSource;
import org.apache.crunch.ReadableData;
import org.apache.crunch.io.impl.FileSourceImpl;
@@ -42,11 +45,12 @@ public class TextFileSource<T> extends FileSourceImpl<T> implements ReadableSour
}
public TextFileSource(Path path, PType<T> ptype) {
- super(path, ptype, getInputFormat(path, ptype));
+ this(Collections.singletonList(path), ptype);
}
public TextFileSource(List<Path> paths, PType<T> ptype) {
super(paths, ptype, getInputFormat(paths.get(0), ptype));
+ inputBundle.set(RuntimeParameters.DISABLE_COMBINE_FILE, Boolean.FALSE.toString());
}
@Override