You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by cu...@apache.org on 2007/05/14 20:00:11 UTC

svn commit: r537926 - in /lucene/hadoop/trunk: ./ src/java/org/apache/hadoop/mapred/lib/aggregate/ src/test/org/apache/hadoop/mapred/

Author: cutting
Date: Mon May 14 11:00:10 2007
New Revision: 537926

URL: http://svn.apache.org/viewvc?view=rev&rev=537926
Log:
HADOOP-1342.  In aggregators, permit one to limit the number of unique values per key.  Contributed by Runping.

Modified:
    lucene/hadoop/trunk/CHANGES.txt
    lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/lib/aggregate/UniqValueCount.java
    lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/lib/aggregate/ValueAggregatorBaseDescriptor.java
    lucene/hadoop/trunk/src/test/org/apache/hadoop/mapred/TestAggregates.java

Modified: lucene/hadoop/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/CHANGES.txt?view=diff&rev=537926&r1=537925&r2=537926
==============================================================================
--- lucene/hadoop/trunk/CHANGES.txt (original)
+++ lucene/hadoop/trunk/CHANGES.txt Mon May 14 11:00:10 2007
@@ -11,6 +11,9 @@
 
   3. HADOOP-1344.  Add RunningJob#getJobName(). (Michael Bieniosek via cutting)
 
+  4. HADOOP-1342.  In aggregators, permit one to limit the number of
+     unique values per key.  (Runping Qi via cutting)
+
 
 Branch 0.13 (unreleased changes)
 

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/lib/aggregate/UniqValueCount.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/lib/aggregate/UniqValueCount.java?view=diff&rev=537926&r1=537925&r2=537926
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/lib/aggregate/UniqValueCount.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/lib/aggregate/UniqValueCount.java Mon May 14 11:00:10 2007
@@ -29,17 +29,49 @@
  */
 public class UniqValueCount implements ValueAggregator {
 
-  TreeMap<Object, Object> uniqItems = null;
+  private TreeMap<Object, Object> uniqItems = null;
+
+  private long numItems = 0;
+  
+  private long maxNumItems = Long.MAX_VALUE;
 
   /**
    * the default constructor
    * 
    */
   public UniqValueCount() {
+    this(Long.MAX_VALUE);
+  }
+  
+  /**
+   * constructor
+   * @param maxNum the limit in the number of unique values to keep.
+   *  
+   */
+  public UniqValueCount(long maxNum) {
     uniqItems = new TreeMap<Object, Object>();
+    this.numItems = 0;
+    maxNumItems = Long.MAX_VALUE;
+    if (maxNum > 0 ) {
+      this.maxNumItems = maxNum;
+    }
   }
 
   /**
+   * Set the limit on the number of unique values
+   * @param n the desired limit on the number of unique values
+   * @return the new limit on the number of unique values
+   */
+  public long setMaxItems(long n) {
+    if (n >= numItems) {
+      this.maxNumItems = n;
+    } else if (this.maxNumItems >= this.numItems) {
+      this.maxNumItems = this.numItems;
+    }
+    return this.maxNumItems;
+  }
+  
+  /**
    * add a value to the aggregator
    * 
    * @param val
@@ -47,8 +79,10 @@
    * 
    */
   public void addNextValue(Object val) {
-    uniqItems.put(val, "1");
-
+    if (this.numItems <= this.maxNumItems) {
+      uniqItems.put(val, "1");
+      this.numItems = this.uniqItems.size();
+    }
   }
 
   /**

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/lib/aggregate/ValueAggregatorBaseDescriptor.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/lib/aggregate/ValueAggregatorBaseDescriptor.java?view=diff&rev=537926&r1=537925&r2=537926
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/lib/aggregate/ValueAggregatorBaseDescriptor.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/lib/aggregate/ValueAggregatorBaseDescriptor.java Mon May 14 11:00:10 2007
@@ -46,7 +46,8 @@
   
   static public final String STRING_VALUE_MIN = "StringValueMin";
   
-
+  private static long maxNumItems = Long.MAX_VALUE;
+  
   public String inputFile = null;
 
   private static class MyEntry implements Entry {
@@ -106,7 +107,7 @@
     } else if (type.compareToIgnoreCase(DOUBLE_VALUE_SUM) == 0) {
       retv = new DoubleValueSum();
     } else if (type.compareToIgnoreCase(UNIQ_VALUE_COUNT) == 0) {
-      retv = new UniqValueCount();
+      retv = new UniqValueCount(maxNumItems);
     } else if (type.compareToIgnoreCase(VALUE_HISTOGRAM) == 0) {
       retv = new ValueHistogram();
     }
@@ -153,5 +154,6 @@
    */
   public void configure(JobConf job) {
     this.inputFile = job.get("map.input.file");
+    maxNumItems = job.getLong("aggregate.max.num.unique.values", Long.MAX_VALUE);
   }
 }

Modified: lucene/hadoop/trunk/src/test/org/apache/hadoop/mapred/TestAggregates.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/test/org/apache/hadoop/mapred/TestAggregates.java?view=diff&rev=537926&r1=537925&r2=537926
==============================================================================
--- lucene/hadoop/trunk/src/test/org/apache/hadoop/mapred/TestAggregates.java (original)
+++ lucene/hadoop/trunk/src/test/org/apache/hadoop/mapred/TestAggregates.java Mon May 14 11:00:10 2007
@@ -69,7 +69,7 @@
     }
     expectedOutput.append("value_as_string_max\t9\n");
     expectedOutput.append("value_as_string_min\t1\n");
-    expectedOutput.append("uniq_count\t19\n");
+    expectedOutput.append("uniq_count\t15\n");
 
 
     fileOut.write(inputData.toString().getBytes("utf-8"));
@@ -96,6 +96,7 @@
     job.setInt("aggregator.descriptor.num", 1);
     job.set("aggregator.descriptor.0", 
           "UserDefined,org.apache.hadoop.mapred.lib.aggregate.AggregatorTests");
+    job.setLong("aggregate.max.num.unique.values", 14);
 
     JobClient.runJob(job);