You are viewing a plain text version of this content. The canonical link for it is here.
Posted to mapreduce-commits@hadoop.apache.org by to...@apache.org on 2010/03/27 00:49:56 UTC
svn commit: r928104 - in /hadoop/mapreduce/trunk: ./ src/contrib/sqoop/
src/contrib/sqoop/src/test/org/apache/hadoop/sqoop/testutil/
src/java/org/apache/hadoop/mapreduce/lib/db/
Author: tomwhite
Date: Fri Mar 26 23:49:55 2010
New Revision: 928104
URL: http://svn.apache.org/viewvc?rev=928104&view=rev
Log:
MAPREDUCE-1489. DataDrivenDBInputFormat should not query the database when generating only one split. Contributed by Aaron Kimball.
Modified:
hadoop/mapreduce/trunk/CHANGES.txt
hadoop/mapreduce/trunk/src/contrib/sqoop/build.xml
hadoop/mapreduce/trunk/src/contrib/sqoop/src/test/org/apache/hadoop/sqoop/testutil/CommonArgs.java
hadoop/mapreduce/trunk/src/contrib/sqoop/src/test/org/apache/hadoop/sqoop/testutil/ImportJobTestCase.java
hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/db/BigDecimalSplitter.java
hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/db/DataDrivenDBInputFormat.java
hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/db/DateSplitter.java
hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/db/FloatSplitter.java
hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/db/IntegerSplitter.java
hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/db/TextSplitter.java
Modified: hadoop/mapreduce/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/CHANGES.txt?rev=928104&r1=928103&r2=928104&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/CHANGES.txt (original)
+++ hadoop/mapreduce/trunk/CHANGES.txt Fri Mar 26 23:49:55 2010
@@ -232,6 +232,9 @@ Trunk (unreleased changes)
MAPREDUCE-1629. Get rid of fakeBlockLocations() on HarFileSystem, since
it's not used (mahadev)
+ MAPREDUCE-1489. DataDrivenDBInputFormat should not query the database
+ when generating only one split. (Aaron Kimball via tomwhite)
+
OPTIMIZATIONS
MAPREDUCE-270. Fix the tasktracker to optionally send an out-of-band
Modified: hadoop/mapreduce/trunk/src/contrib/sqoop/build.xml
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/contrib/sqoop/build.xml?rev=928104&r1=928103&r2=928104&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/contrib/sqoop/build.xml (original)
+++ hadoop/mapreduce/trunk/src/contrib/sqoop/build.xml Fri Mar 26 23:49:55 2010
@@ -63,9 +63,35 @@ to call at top-level: ant deploy-contrib
<!-- ================================================================== -->
<!-- Run unit tests -->
<!-- Override with our own version so we can set hadoop.alt.classpath -->
- <!-- and Hadoop logger properties -->
+ <!-- and Hadoop logger properties. -->
+ <!-- We also want to select specific sub-batteries of tests to run -->
+ <!-- tests of third-party databases, etc. -->
+ <!-- -->
+ <!-- By default, we'll run the "normal" tests: Test*.java -->
+ <!-- To run third-party tests, run with -Dthirdparty=true -->
<!-- ================================================================== -->
- <target name="test" depends="compile-test, compile" if="test.available">
+ <target name="test" depends="compile-test, compile, test-prep, run-tests"
+ if="test.available">
+ </target>
+
+ <!-- set variables that configure the test proper -->
+ <target name="test-prep" depends="test-prep-normal, test-prep-thirdparty"/>
+
+ <target name="test-prep-normal" unless="thirdparty">
+ <!-- Set this to run all the "standard" tests -->
+ <property name="test.pattern" value="Test*" />
+ </target>
+
+ <target name="test-prep-thirdparty" if="thirdparty">
+ <!-- Run tests that *end* with the name Test, instead of starting with it;
+ this runs non-standard tests e.g. third-party database tests. -->
+ <property name="test.pattern" value="*Test" />
+ </target>
+
+
+ <!-- actually run the selected unit tests -->
+ <target name="run-tests" depends="compile-test, compile, test-prep"
+ if="test.available">
<echo message="contrib: ${name}"/>
<delete dir="${hadoop.log.dir}"/>
<mkdir dir="${hadoop.log.dir}"/>
@@ -142,7 +168,7 @@ to call at top-level: ant deploy-contrib
<formatter type="${test.junit.output.format}" />
<batchtest todir="${build.test}" unless="testcase">
<fileset dir="${src.test}"
- includes="**/Test*.java" excludes="**/${test.exclude}.java" />
+ includes="**/${test.pattern}.java" excludes="**/${test.exclude}.java" />
</batchtest>
<batchtest todir="${build.test}" if="testcase">
<fileset dir="${src.test}" includes="**/${testcase}.java"/>
Modified: hadoop/mapreduce/trunk/src/contrib/sqoop/src/test/org/apache/hadoop/sqoop/testutil/CommonArgs.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/contrib/sqoop/src/test/org/apache/hadoop/sqoop/testutil/CommonArgs.java?rev=928104&r1=928103&r2=928104&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/contrib/sqoop/src/test/org/apache/hadoop/sqoop/testutil/CommonArgs.java (original)
+++ hadoop/mapreduce/trunk/src/contrib/sqoop/src/test/org/apache/hadoop/sqoop/testutil/CommonArgs.java Fri Mar 26 23:49:55 2010
@@ -36,7 +36,7 @@ public final class CommonArgs {
args.add("-D");
args.add("mapreduce.job.maps=1");
args.add("-D");
- args.add("fs.default.name=file:///");
+ args.add("fs.defaultFS=file:///");
args.add("-D");
args.add("jobclient.completion.poll.interval=50");
args.add("-D");
Modified: hadoop/mapreduce/trunk/src/contrib/sqoop/src/test/org/apache/hadoop/sqoop/testutil/ImportJobTestCase.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/contrib/sqoop/src/test/org/apache/hadoop/sqoop/testutil/ImportJobTestCase.java?rev=928104&r1=928103&r2=928104&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/contrib/sqoop/src/test/org/apache/hadoop/sqoop/testutil/ImportJobTestCase.java (original)
+++ hadoop/mapreduce/trunk/src/contrib/sqoop/src/test/org/apache/hadoop/sqoop/testutil/ImportJobTestCase.java Fri Mar 26 23:49:55 2010
@@ -93,7 +93,7 @@ public class ImportJobTestCase extends B
args.add(getConnectString());
args.add("--as-sequencefile");
args.add("--num-mappers");
- args.add("1");
+ args.add("2");
args.addAll(getExtraArgs(conf));
Modified: hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/db/BigDecimalSplitter.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/db/BigDecimalSplitter.java?rev=928104&r1=928103&r2=928104&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/db/BigDecimalSplitter.java (original)
+++ hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/db/BigDecimalSplitter.java Fri Mar 26 23:49:55 2010
@@ -29,6 +29,7 @@ import org.apache.commons.logging.LogFac
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.JobContext;
/**
* Implement DBSplitter over BigDecimal values.
@@ -45,7 +46,7 @@ public class BigDecimalSplitter implemen
String lowClausePrefix = colName + " >= ";
String highClausePrefix = colName + " < ";
- BigDecimal numSplits = new BigDecimal(conf.getInt("mapred.map.tasks", 1));
+ BigDecimal numSplits = new BigDecimal(conf.getInt(JobContext.NUM_MAPS, 1));
if (minVal == null && maxVal == null) {
// Range is null to null. Return a null split accordingly.
Modified: hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/db/DataDrivenDBInputFormat.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/db/DataDrivenDBInputFormat.java?rev=928104&r1=928103&r2=928104&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/db/DataDrivenDBInputFormat.java (original)
+++ hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/db/DataDrivenDBInputFormat.java Fri Mar 26 23:49:55 2010
@@ -160,6 +160,16 @@ public class DataDrivenDBInputFormat<T e
/** {@inheritDoc} */
public List<InputSplit> getSplits(JobContext job) throws IOException {
+ int targetNumTasks = job.getConfiguration().getInt(JobContext.NUM_MAPS, 1);
+ if (1 == targetNumTasks) {
+ // There's no need to run a bounding-values query; just return a single
+ // split whose bounds ("1=1") do not restrict the rows. This can be
+ // considerably faster for a large table with no index.
+ List<InputSplit> singletonSplit = new ArrayList<InputSplit>();
+ singletonSplit.add(new DataDrivenDBInputSplit("1=1", "1=1"));
+ return singletonSplit;
+ }
+
ResultSet results = null;
Statement statement = null;
Connection connection = getConnection();
Modified: hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/db/DateSplitter.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/db/DateSplitter.java?rev=928104&r1=928103&r2=928104&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/db/DateSplitter.java (original)
+++ hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/db/DateSplitter.java Fri Mar 26 23:49:55 2010
@@ -32,6 +32,7 @@ import org.apache.commons.logging.LogFac
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.JobContext;
/**
* Implement DBSplitter over date/time values.
@@ -55,7 +56,7 @@ public class DateSplitter extends Intege
String lowClausePrefix = colName + " >= ";
String highClausePrefix = colName + " < ";
- int numSplits = conf.getInt("mapred.map.tasks", 1);
+ int numSplits = conf.getInt(JobContext.NUM_MAPS, 1);
if (numSplits < 1) {
numSplits = 1;
}
Modified: hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/db/FloatSplitter.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/db/FloatSplitter.java?rev=928104&r1=928103&r2=928104&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/db/FloatSplitter.java (original)
+++ hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/db/FloatSplitter.java Fri Mar 26 23:49:55 2010
@@ -23,19 +23,30 @@ import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.JobContext;
/**
* Implement DBSplitter over floating-point values.
*/
public class FloatSplitter implements DBSplitter {
+ private static final Log LOG = LogFactory.getLog(FloatSplitter.class);
+
private static final double MIN_INCREMENT = 10000 * Double.MIN_VALUE;
public List<InputSplit> split(Configuration conf, ResultSet results, String colName)
throws SQLException {
+ LOG.warn("Generating splits for a floating-point index column. Due to the");
+ LOG.warn("imprecise representation of floating-point values in Java, this");
+ LOG.warn("may result in an incomplete import.");
+ LOG.warn("You are strongly encouraged to choose an integral split column.");
+
List<InputSplit> splits = new ArrayList<InputSplit>();
if (results.getString(1) == null && results.getString(2) == null) {
@@ -50,7 +61,7 @@ public class FloatSplitter implements DB
// Use this as a hint. May need an extra task if the size doesn't
// divide cleanly.
- int numSplits = conf.getInt("mapred.map.tasks", 1);
+ int numSplits = conf.getInt(JobContext.NUM_MAPS, 1);
double splitSize = (maxVal - minVal) / (double) numSplits;
if (splitSize < MIN_INCREMENT) {
Modified: hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/db/IntegerSplitter.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/db/IntegerSplitter.java?rev=928104&r1=928103&r2=928104&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/db/IntegerSplitter.java (original)
+++ hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/db/IntegerSplitter.java Fri Mar 26 23:49:55 2010
@@ -25,6 +25,7 @@ import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.JobContext;
/**
* Implement DBSplitter over integer values.
@@ -39,7 +40,7 @@ public class IntegerSplitter implements
String lowClausePrefix = colName + " >= ";
String highClausePrefix = colName + " < ";
- int numSplits = conf.getInt("mapred.map.tasks", 1);
+ int numSplits = conf.getInt(JobContext.NUM_MAPS, 1);
if (numSplits < 1) {
numSplits = 1;
}
Modified: hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/db/TextSplitter.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/db/TextSplitter.java?rev=928104&r1=928103&r2=928104&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/db/TextSplitter.java (original)
+++ hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/db/TextSplitter.java Fri Mar 26 23:49:55 2010
@@ -30,6 +30,7 @@ import org.apache.commons.logging.LogFac
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.JobContext;
/**
* Implement DBSplitter over text strings.
@@ -61,7 +62,7 @@ public class TextSplitter extends BigDec
LOG.warn("Generating splits for a textual index column.");
LOG.warn("If your database sorts in a case-insensitive order, "
+ "this may result in a partial import or duplicate records.");
- LOG.warn("You are strongly encouraged to choose a numeric split column.");
+ LOG.warn("You are strongly encouraged to choose an integral split column.");
String minString = results.getString(1);
String maxString = results.getString(2);
@@ -86,7 +87,7 @@ public class TextSplitter extends BigDec
// Use this as a hint. May need an extra task if the size doesn't
// divide cleanly.
- int numSplits = conf.getInt("mapred.map.tasks", 1);
+ int numSplits = conf.getInt(JobContext.NUM_MAPS, 1);
String lowClausePrefix = colName + " >= '";
String highClausePrefix = colName + " < '";