You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by cd...@apache.org on 2008/06/11 02:56:25 UTC

svn commit: r666406 - in /hadoop/core/branches/branch-0.18: ./ conf/ src/core/org/apache/hadoop/util/ src/mapred/org/apache/hadoop/mapred/ src/test/org/apache/hadoop/util/

Author: cdouglas
Date: Tue Jun 10 17:56:25 2008
New Revision: 666406

URL: http://svn.apache.org/viewvc?rev=666406&view=rev
Log:
HADOOP-3442. Limit recursion depth on the stack for QuickSort to prevent
StackOverflowErrors. To avoid O(n*n) cases, when partitioning depth exceeds a
multiple of log(n), change to HeapSort.


Added:
    hadoop/core/branches/branch-0.18/src/core/org/apache/hadoop/util/HeapSort.java
Modified:
    hadoop/core/branches/branch-0.18/CHANGES.txt
    hadoop/core/branches/branch-0.18/conf/hadoop-default.xml
    hadoop/core/branches/branch-0.18/src/core/org/apache/hadoop/util/IndexedSorter.java
    hadoop/core/branches/branch-0.18/src/core/org/apache/hadoop/util/QuickSort.java
    hadoop/core/branches/branch-0.18/src/mapred/org/apache/hadoop/mapred/MapTask.java
    hadoop/core/branches/branch-0.18/src/test/org/apache/hadoop/util/TestIndexedSort.java

Modified: hadoop/core/branches/branch-0.18/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.18/CHANGES.txt?rev=666406&r1=666405&r2=666406&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.18/CHANGES.txt (original)
+++ hadoop/core/branches/branch-0.18/CHANGES.txt Tue Jun 10 17:56:25 2008
@@ -555,6 +555,10 @@
     HADOOP-3528. Metrics FilesCreated and files_deleted metrics 
     do not match. (Lohit via Mahadev)
 
+    HADOOP-3442. Limit recursion depth on the stack for QuickSort to prevent
+    StackOverflowErrors. To avoid O(n*n) cases, when partitioning depth exceeds
+    a multiple of log(n), change to HeapSort. (cdouglas)
+
 Release 0.17.0 - 2008-05-18
 
   INCOMPATIBLE CHANGES

Modified: hadoop/core/branches/branch-0.18/conf/hadoop-default.xml
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.18/conf/hadoop-default.xml?rev=666406&r1=666405&r2=666406&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.18/conf/hadoop-default.xml (original)
+++ hadoop/core/branches/branch-0.18/conf/hadoop-default.xml Tue Jun 10 17:56:25 2008
@@ -966,7 +966,7 @@
 
 <property>
   <name>map.sort.class</name>
-  <value>org.apache.hadoop.mapred.MergeSorter</value>
+  <value>org.apache.hadoop.util.QuickSort</value>
   <description>The default sort class for sorting keys.
   </description>
 </property>

Added: hadoop/core/branches/branch-0.18/src/core/org/apache/hadoop/util/HeapSort.java
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.18/src/core/org/apache/hadoop/util/HeapSort.java?rev=666406&view=auto
==============================================================================
--- hadoop/core/branches/branch-0.18/src/core/org/apache/hadoop/util/HeapSort.java (added)
+++ hadoop/core/branches/branch-0.18/src/core/org/apache/hadoop/util/HeapSort.java Tue Jun 10 17:56:25 2008
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.util;
+
+/**
+ * An implementation of the core algorithm of HeapSort.
+ */
+public final class HeapSort implements IndexedSorter {
+
+  public HeapSort() { }
+
+  private static void downHeap(final IndexedSortable s, final int b,
+      int i, final int N) {
+    for (int idx = i << 1; idx < N; idx = i << 1) {
+      if (idx + 1 < N && s.compare(b + idx, b + idx + 1) < 0) {
+        if (s.compare(b + i, b + idx + 1) < 0) {
+          s.swap(b + i, b + idx + 1);
+        } else return;
+        i = idx + 1;
+      } else if (s.compare(b + i, b + idx) < 0) {
+        s.swap(b + i, b + idx);
+        i = idx;
+      } else return;
+    }
+  }
+
+  /**
+   * Sort the given range of items using heap sort.
+   * {@inheritDoc}
+   */
+  public void sort(IndexedSortable s, int p, int r) {
+    sort(s, p, r, null);
+  }
+
+  /**
+   * {@inheritDoc}
+   */
+  public void sort(final IndexedSortable s, final int p, final int r,
+      final Progressable rep) {
+    final int N = r - p;
+    // build heap w/ reverse comparator, then write in-place from end
+    final int t = Integer.highestOneBit(N);
+    for (int i = t; i > 1; i >>>= 1) {
+      for (int j = i >>> 1; j < i; ++j) {
+        downHeap(s, p-1, j, N + 1);
+      }
+      if (null != rep) {
+        rep.progress();
+      }
+    }
+    for (int i = r - 1; i > p; --i) {
+      s.swap(p, i);
+      downHeap(s, p - 1, 1, i - p + 1);
+    }
+  }
+}

Modified: hadoop/core/branches/branch-0.18/src/core/org/apache/hadoop/util/IndexedSorter.java
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.18/src/core/org/apache/hadoop/util/IndexedSorter.java?rev=666406&r1=666405&r2=666406&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.18/src/core/org/apache/hadoop/util/IndexedSorter.java (original)
+++ hadoop/core/branches/branch-0.18/src/core/org/apache/hadoop/util/IndexedSorter.java Tue Jun 10 17:56:25 2008
@@ -35,4 +35,12 @@
    * @see IndexedSortable#swap
    */
   void sort(IndexedSortable s, int l, int r);
+
+  /**
+   * Same as {@link #sort(IndexedSortable,int,int)}, but indicate progress
+   * periodically.
+   * @see #sort(IndexedSortable,int,int)
+   */
+  void sort(IndexedSortable s, int l, int r, Progressable rep);
+
 }

Modified: hadoop/core/branches/branch-0.18/src/core/org/apache/hadoop/util/QuickSort.java
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.18/src/core/org/apache/hadoop/util/QuickSort.java?rev=666406&r1=666405&r2=666406&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.18/src/core/org/apache/hadoop/util/QuickSort.java (original)
+++ hadoop/core/branches/branch-0.18/src/core/org/apache/hadoop/util/QuickSort.java Tue Jun 10 17:56:25 2008
@@ -19,41 +19,66 @@
 
 /**
  * An implementation of the core algorithm of QuickSort.
- * See "Median-of-Three Partitioning" in Sedgewick book.
  */
 public final class QuickSort implements IndexedSorter {
 
+  private static final IndexedSorter alt = new HeapSort();
+
   public QuickSort() { }
 
-  private void fix(IndexedSortable s, int p, int r) {
+  private static void fix(IndexedSortable s, int p, int r) {
     if (s.compare(p, r) > 0) {
       s.swap(p, r);
     }
   }
 
+  /**
+   * Deepest recursion before giving up and doing a heapsort.
+   * Returns 2 * ceil(log(n)).
+   */
+  protected static int getMaxDepth(int x) {
+    if (x <= 0)
+      throw new IllegalArgumentException("Undefined for " + x);
+    return (32 - Integer.numberOfLeadingZeros(x - 1)) << 2;
+  }
+
+  /**
+   * Sort the given range of items using quick sort.
+   * {@inheritDoc} If the recursion depth falls below {@link #getMaxDepth},
+   * then switch to {@link HeapSort}.
+   */
   public void sort(IndexedSortable s, int p, int r) {
     sort(s, p, r, null);
   }
 
   /**
-   * Same as {@link #sort}, but indicate that we're making progress after
-   * each partition.
+   * {@inheritDoc}
    */
-  public void sort(final IndexedSortable s, final int p, final int r,
+  public void sort(final IndexedSortable s, int p, int r,
       final Progressable rep) {
+    sortInternal(s, p, r, rep, getMaxDepth(r - p));
+  }
+
+  private static void sortInternal(final IndexedSortable s, int p, int r,
+      final Progressable rep, int depth) {
     if (null != rep) {
       rep.progress();
     }
+    while (true) {
     if (r-p < 13) {
       for (int i = p; i < r; ++i) {
-        for (int j = i; j > p; --j) {
-          if (s.compare(j-1, j) > 0) {
-            s.swap(j, j-1);
-          }
+        for (int j = i; j > p && s.compare(j-1, j) > 0; --j) {
+          s.swap(j, j-1);
         }
       }
       return;
     }
+    if (--depth < 0) {
+      // give up
+      System.out.print("H");
+      alt.sort(s, p, r, rep);
+      return;
+    }
 
     // select, move pivot into first position
     fix(s, (p+r) >>> 1, p);
@@ -95,11 +120,12 @@
     // Recurse on smaller interval first to keep stack shallow
     assert i != j;
     if (i - p < r - j) {
-      sort(s, p, i, rep);
-      sort(s, j, r, rep);
+      sortInternal(s, p, i, rep, depth);
+      p = j;
     } else {
-      sort(s, j, r, rep);
-      sort(s, p, i, rep);
+      sortInternal(s, j, r, rep, depth);
+      r = i;
+    }
     }
   }
 

Modified: hadoop/core/branches/branch-0.18/src/mapred/org/apache/hadoop/mapred/MapTask.java
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.18/src/mapred/org/apache/hadoop/mapred/MapTask.java?rev=666406&r1=666405&r2=666406&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.18/src/mapred/org/apache/hadoop/mapred/MapTask.java (original)
+++ hadoop/core/branches/branch-0.18/src/mapred/org/apache/hadoop/mapred/MapTask.java Tue Jun 10 17:56:25 2008
@@ -51,6 +51,7 @@
 import org.apache.hadoop.mapred.IFile.Reader;
 import org.apache.hadoop.mapred.Merger.Segment;
 import org.apache.hadoop.util.IndexedSortable;
+import org.apache.hadoop.util.IndexedSorter;
 import org.apache.hadoop.util.Progress;
 import org.apache.hadoop.util.QuickSort;
 import org.apache.hadoop.util.ReflectionUtils;
@@ -323,8 +324,8 @@
     private final int softRecordLimit;
     private final int softBufferLimit;
     private final int minSpillsForCombine;
+    private final IndexedSorter sorter;
     private final Object spillLock = new Object();
-    private final QuickSort sorter = new QuickSort();
     private final BlockingBuffer bb = new BlockingBuffer();
 
     private final FileSystem localFs;
@@ -356,6 +357,9 @@
       if ((sortmb & 0x7FF) != sortmb) {
         throw new IOException("Invalid \"io.sort.mb\": " + sortmb);
       }
+      sorter = (IndexedSorter)
+        ReflectionUtils.newInstance(
+            job.getClass("map.sort.class", QuickSort.class), job);
       LOG.info("io.sort.mb = " + sortmb);
       // buffers and accounting
       int maxMemUsage = sortmb << 20;

Modified: hadoop/core/branches/branch-0.18/src/test/org/apache/hadoop/util/TestIndexedSort.java
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.18/src/test/org/apache/hadoop/util/TestIndexedSort.java?rev=666406&r1=666405&r2=666406&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.18/src/test/org/apache/hadoop/util/TestIndexedSort.java (original)
+++ hadoop/core/branches/branch-0.18/src/test/org/apache/hadoop/util/TestIndexedSort.java Tue Jun 10 17:56:25 2008
@@ -31,6 +31,146 @@
 
 public class TestIndexedSort extends TestCase {
 
+  public void sortAllEqual(IndexedSorter sorter) throws Exception {
+    final int SAMPLE = 500;
+    int[] values = new int[SAMPLE];
+    Arrays.fill(values, 10);
+    SampleSortable s = new SampleSortable(values);
+    sorter.sort(s, 0, SAMPLE);
+    int[] check = s.getSorted();
+    assertTrue(Arrays.toString(values) + "\ndoesn't match\n" +
+        Arrays.toString(check), Arrays.equals(values, check));
+    // Set random min/max, re-sort.
+    Random r = new Random();
+    int min = r.nextInt(SAMPLE);
+    int max = (min + 1 + r.nextInt(SAMPLE - 2)) % SAMPLE;
+    values[min] = 9;
+    values[max] = 11;
+    System.out.println("testAllEqual setting min/max at " + min + "/" + max +
+        "(" + sorter.getClass().getName() + ")");
+    s = new SampleSortable(values);
+    sorter.sort(s, 0, SAMPLE);
+    check = s.getSorted();
+    Arrays.sort(values);
+    assertTrue(check[0] == 9);
+    assertTrue(check[SAMPLE - 1] == 11);
+    assertTrue(Arrays.toString(values) + "\ndoesn't match\n" +
+        Arrays.toString(check), Arrays.equals(values, check));
+  }
+
+  public void sortSorted(IndexedSorter sorter) throws Exception {
+    final int SAMPLE = 500;
+    int[] values = new int[SAMPLE];
+    Random r = new Random();
+    long seed = r.nextLong();
+    r.setSeed(seed);
+    System.out.println("testSorted seed: " + seed +
+        "(" + sorter.getClass().getName() + ")");
+    for (int i = 0; i < SAMPLE; ++i) {
+      values[i] = r.nextInt(100);
+    }
+    Arrays.sort(values);
+    SampleSortable s = new SampleSortable(values);
+    sorter.sort(s, 0, SAMPLE);
+    int[] check = s.getSorted();
+    assertTrue(Arrays.toString(values) + "\ndoesn't match\n" +
+        Arrays.toString(check), Arrays.equals(values, check));
+  }
+
+  public void sortSequential(IndexedSorter sorter) throws Exception {
+    final int SAMPLE = 500;
+    int[] values = new int[SAMPLE];
+    for (int i = 0; i < SAMPLE; ++i) {
+      values[i] = i;
+    }
+    SampleSortable s = new SampleSortable(values);
+    sorter.sort(s, 0, SAMPLE);
+    int[] check = s.getSorted();
+    assertTrue(Arrays.toString(values) + "\ndoesn't match\n" +
+        Arrays.toString(check), Arrays.equals(values, check));
+  }
+
+  public void sortSingleRecord(IndexedSorter sorter) throws Exception {
+    final int SAMPLE = 1;
+    SampleSortable s = new SampleSortable(SAMPLE);
+    int[] values = s.getValues();
+    sorter.sort(s, 0, SAMPLE);
+    int[] check = s.getSorted();
+    assertTrue(Arrays.toString(values) + "\ndoesn't match\n" +
+        Arrays.toString(check), Arrays.equals(values, check));
+  }
+
+  public void sortRandom(IndexedSorter sorter) throws Exception {
+    final int SAMPLE = 256 * 1024;
+    SampleSortable s = new SampleSortable(SAMPLE);
+    long seed = s.getSeed();
+    System.out.println("sortRandom seed: " + seed +
+        "(" + sorter.getClass().getName() + ")");
+    int[] values = s.getValues();
+    Arrays.sort(values);
+    sorter.sort(s, 0, SAMPLE);
+    int[] check = s.getSorted();
+    assertTrue("seed: " + seed + "\ndoesn't match\n",
+               Arrays.equals(values, check));
+  }
+
+  public void sortWritable(IndexedSorter sorter) throws Exception {
+    final int SAMPLE = 1000;
+    WritableSortable s = new WritableSortable(SAMPLE);
+    long seed = s.getSeed();
+    System.out.println("sortWritable seed: " + seed +
+        "(" + sorter.getClass().getName() + ")");
+    String[] values = s.getValues();
+    Arrays.sort(values);
+    sorter.sort(s, 0, SAMPLE);
+    String[] check = s.getSorted();
+    assertTrue("seed: " + seed + "\ndoesn't match",
+               Arrays.equals(values, check));
+  }
+
+
+  public void testQuickSort() throws Exception {
+    QuickSort sorter = new QuickSort();
+    sortRandom(sorter);
+    sortSingleRecord(sorter);
+    sortSequential(sorter);
+    sortSorted(sorter);
+    sortAllEqual(sorter);
+    sortWritable(sorter);
+
+    // test degenerate case for median-of-three partitioning
+    // a_n, a_1, a_2, ..., a_{n-1}
+    final int DSAMPLE = 500;
+    int[] values = new int[DSAMPLE];
+    for (int i = 0; i < DSAMPLE; ++i) { values[i] = i; }
+    values[0] = values[DSAMPLE - 1] + 1;
+    SampleSortable s = new SampleSortable(values);
+    values = s.getValues();
+    final int DSS = (DSAMPLE / 2) * (DSAMPLE / 2);
+    // Worst case is (N/2)^2 comparisons, not including those effecting
+    // the median-of-three partitioning; impl should handle this case
+    MeasuredSortable m = new MeasuredSortable(s, DSS);
+    sorter.sort(m, 0, DSAMPLE);
+    System.out.println("QuickSort degen cmp/swp: " +
+        m.getCmp() + "/" + m.getSwp() +
+        "(" + sorter.getClass().getName() + ")");
+    Arrays.sort(values);
+    int[] check = s.getSorted();
+    assertTrue(Arrays.equals(values, check));
+  }
+
+  public void testHeapSort() throws Exception {
+    HeapSort sorter = new HeapSort();
+    sortRandom(sorter);
+    sortSingleRecord(sorter);
+    sortSequential(sorter);
+    sortSorted(sorter);
+    sortAllEqual(sorter);
+    sortWritable(sorter);
+  }
+
+  // Sortables //
+
   private static class SampleSortable implements IndexedSortable {
     private int[] valindex;
     private int[] valindirect;
@@ -96,6 +236,45 @@
 
   }
 
+  public static class MeasuredSortable implements IndexedSortable {
+
+    private int comparisions;
+    private int swaps;
+    private final int maxcmp;
+    private final int maxswp;
+    private IndexedSortable s;
+
+    public MeasuredSortable(IndexedSortable s) {
+      this(s, Integer.MAX_VALUE);
+    }
+
+    public MeasuredSortable(IndexedSortable s, int maxcmp) {
+      this(s, maxcmp, Integer.MAX_VALUE);
+    }
+
+    public MeasuredSortable(IndexedSortable s, int maxcmp, int maxswp) {
+      this.s = s;
+      this.maxcmp = maxcmp;
+      this.maxswp = maxswp;
+    }
+
+    public int getCmp() { return comparisions; }
+    public int getSwp() { return swaps; }
+
+    public int compare(int i, int j) {
+      assertTrue("Expected fewer than " + maxcmp + " comparisons",
+                 ++comparisions < maxcmp);
+      return s.compare(i, j);
+    }
+
+    public void swap(int i, int j) {
+      assertTrue("Expected fewer than " + maxswp + " swaps",
+                 ++swaps < maxswp);
+      s.swap(i, j);
+    }
+
+  }
+
   private static class WritableSortable implements IndexedSortable {
 
     private static Random r = new Random();
@@ -179,101 +358,4 @@
 
   }
 
-  public void testAllEqual() throws Exception {
-    final int SAMPLE = 500;
-    int[] values = new int[SAMPLE];
-    Arrays.fill(values, 10);
-    SampleSortable s = new SampleSortable(values);
-    IndexedSorter sorter = new QuickSort();
-    sorter.sort(s, 0, SAMPLE);
-    int[] check = s.getSorted();
-    assertTrue(Arrays.toString(values) + "\ndoesn't match\n" +
-        Arrays.toString(check), Arrays.equals(values, check));
-    // Set random min/max, re-sort.
-    Random r = new Random();
-    int min = r.nextInt(SAMPLE);
-    int max = (min + 1 + r.nextInt(SAMPLE - 2)) % SAMPLE;
-    values[min] = 9;
-    values[max] = 11;
-    System.out.println("testAllEqual setting min/max at " + min + "/" + max);
-    s = new SampleSortable(values);
-    sorter.sort(s, 0, SAMPLE);
-    check = s.getSorted();
-    Arrays.sort(values);
-    assertTrue(check[0] == 9);
-    assertTrue(check[SAMPLE - 1] == 11);
-    assertTrue(Arrays.toString(values) + "\ndoesn't match\n" +
-        Arrays.toString(check), Arrays.equals(values, check));
-  }
-
-  public void testSorted() throws Exception {
-    final int SAMPLE = 500;
-    int[] values = new int[SAMPLE];
-    Random r = new Random();
-    long seed = r.nextLong();
-    r.setSeed(seed);
-    System.out.println("testSorted seed: " + seed);
-    for (int i = 0; i < SAMPLE; ++i) {
-      values[i] = r.nextInt(100);
-    }
-    Arrays.sort(values);
-    SampleSortable s = new SampleSortable(values);
-    IndexedSorter sorter = new QuickSort();
-    sorter.sort(s, 0, SAMPLE);
-    int[] check = s.getSorted();
-    assertTrue(Arrays.toString(values) + "\ndoesn't match\n" +
-        Arrays.toString(check), Arrays.equals(values, check));
-  }
-
-  public void testSequential() throws Exception {
-    final int SAMPLE = 500;
-    int[] values = new int[SAMPLE];
-    for (int i = 0; i < SAMPLE; ++i) {
-      values[i] = i;
-    }
-    SampleSortable s = new SampleSortable(values);
-    IndexedSorter sorter = new QuickSort();
-    sorter.sort(s, 0, SAMPLE);
-    int[] check = s.getSorted();
-    assertTrue(Arrays.toString(values) + "\ndoesn't match\n" +
-        Arrays.toString(check), Arrays.equals(values, check));
-  }
-
-  public void testSingleRecord() throws Exception {
-    final int SAMPLE = 1;
-    SampleSortable s = new SampleSortable(SAMPLE);
-    int[] values = s.getValues();
-    IndexedSorter sorter = new QuickSort();
-    sorter.sort(s, 0, SAMPLE);
-    int[] check = s.getSorted();
-    assertTrue(Arrays.toString(values) + "\ndoesn't match\n" +
-        Arrays.toString(check), Arrays.equals(values, check));
-  }
-
-  public void testQuickSort() throws Exception {
-    final int SAMPLE = 100000;
-    SampleSortable s = new SampleSortable(SAMPLE);
-    System.out.println("testQuickSort seed: " + s.getSeed());
-    int[] values = s.getValues();
-    Arrays.sort(values);
-    IndexedSorter sorter = new QuickSort();
-    sorter.sort(s, 0, SAMPLE);
-    int[] check = s.getSorted();
-    assertTrue(Arrays.toString(values) + "\ndoesn't match\n" +
-        Arrays.toString(check), Arrays.equals(values, check));
-  }
-
-  public void testWritable() throws Exception {
-    final int SAMPLE = 1000;
-    WritableSortable s = new WritableSortable(SAMPLE);
-    System.out.println("testWritable seed: " + s.getSeed());
-    String[] values = s.getValues();
-    Arrays.sort(values);
-    IndexedSorter sorter = new QuickSort();
-    sorter.sort(s, 0, SAMPLE);
-    String[] check = s.getSorted();
-    assertTrue(Arrays.toString(values) + "\ndoesn't match\n" +
-        Arrays.toString(check), Arrays.equals(values, check));
-  }
-
 }