You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ro...@apache.org on 2010/02/08 13:51:52 UTC

svn commit: r907642 - in /lucene/mahout/trunk: core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/ examples/src/main/java/org/apache/mahout/text/ utils/src/main/java/org/apache/mahout/utils/v...

Author: robinanil
Date: Mon Feb  8 12:51:51 2010
New Revision: 907642

URL: http://svn.apache.org/viewvc?rev=907642&view=rev
Log:
Transforming code to use Mahout-math collections instead of HashMap. Only the easier cases were converted. No changes were made to public functions.

Modified:
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthMapper.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthReducer.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/TransactionSortingMapper.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FrequentPatternMaxHeap.java
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountMapper.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthMapper.java?rev=907642&r1=907641&r2=907642&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthMapper.java Mon Feb  8 12:51:51 2010
@@ -18,18 +18,18 @@
 package org.apache.mahout.fpm.pfpgrowth;
 
 import java.io.IOException;
-import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
-import java.util.Map;
-import java.util.Map.Entry;
 import java.util.Set;
+import java.util.Map.Entry;
 
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.mahout.common.Pair;
 import org.apache.mahout.common.Parameters;
+import org.apache.mahout.math.map.OpenIntLongHashMap;
+import org.apache.mahout.math.map.OpenObjectIntHashMap;
 
 /**
  * {@link ParallelFPGrowthMapper} maps each transaction to all unique items
@@ -40,7 +40,7 @@
 public class ParallelFPGrowthMapper extends
     Mapper<LongWritable,TransactionTree,LongWritable,TransactionTree> {
   
-  private final Map<Integer,Long> gListInt = new HashMap<Integer,Long>();
+  private final OpenIntLongHashMap gListInt = new OpenIntLongHashMap();
   
   @Override
   protected void map(LongWritable offset,
@@ -83,7 +83,7 @@
     Parameters params = Parameters.fromString(context.getConfiguration().get(
       "pfp.parameters", ""));
     
-    Map<String,Integer> fMap = new HashMap<String,Integer>();
+    OpenObjectIntHashMap<String> fMap = new OpenObjectIntHashMap<String>();
     int i = 0;
     for (Pair<String,Long> e : PFPGrowth.deserializeList(params, "fList",
       context.getConfiguration())) {

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthReducer.java?rev=907642&r1=907641&r2=907642&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthReducer.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthReducer.java Mon Feb  8 12:51:51 2010
@@ -21,7 +21,6 @@
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Comparator;
-import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
@@ -40,6 +39,9 @@
 import org.apache.mahout.fpm.pfpgrowth.convertors.string.TopKStringPatterns;
 import org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPGrowth;
 import org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPTreeDepthCache;
+import org.apache.mahout.math.list.IntArrayList;
+import org.apache.mahout.math.map.OpenLongObjectHashMap;
+import org.apache.mahout.math.map.OpenObjectIntHashMap;
 
 /**
  * {@link ParallelFPGrowthReducer} takes each group of transactions and runs
@@ -55,11 +57,11 @@
   
   private final List<String> featureReverseMap = new ArrayList<String>();
   
-  private final Map<String,Integer> fMap = new HashMap<String,Integer>();
+  private final OpenObjectIntHashMap<String> fMap = new OpenObjectIntHashMap<String>();
   
   private final List<String> fRMap = new ArrayList<String>();
   
-  private final Map<Long,List<Integer>> groupFeatures = new HashMap<Long,List<Integer>>();
+  private final OpenLongObjectHashMap<IntArrayList> groupFeatures = new OpenLongObjectHashMap<IntArrayList>();
   
   private int maxHeapSize = 50;
   
@@ -100,17 +102,18 @@
     });
     
     FPGrowth<Integer> fpGrowth = new FPGrowth<Integer>();
-    fpGrowth.generateTopKFrequentPatterns(
-        cTree.getIterator(),
-        localFList,
-        minSupport,
-        maxHeapSize,
-        new HashSet<Integer>(groupFeatures.get(key.get())),
-        new IntegerStringOutputConverter(
-            new ContextWriteOutputCollector<LongWritable,TransactionTree,Text,TopKStringPatterns>(
-                context), featureReverseMap),
-        new ContextStatusUpdater<LongWritable,TransactionTree,Text,TopKStringPatterns>(
-            context));
+    fpGrowth
+        .generateTopKFrequentPatterns(
+          cTree.getIterator(),
+          localFList,
+          minSupport,
+          maxHeapSize,
+          new HashSet<Integer>(groupFeatures.get(key.get()).toList()),
+          new IntegerStringOutputConverter(
+              new ContextWriteOutputCollector<LongWritable,TransactionTree,Text,TopKStringPatterns>(
+                  context), featureReverseMap),
+          new ContextStatusUpdater<LongWritable,TransactionTree,Text,TopKStringPatterns>(
+              context));
   }
   
   @Override
@@ -135,12 +138,12 @@
         .getConfiguration());
     
     for (Entry<String,Long> entry : gList.entrySet()) {
-      List<Integer> groupList = groupFeatures.get(entry.getValue());
+      IntArrayList groupList = groupFeatures.get(entry.getValue());
       Integer itemInteger = fMap.get(entry.getKey());
       if (groupList != null) {
         groupList.add(itemInteger);
       } else {
-        groupList = new ArrayList<Integer>();
+        groupList = new IntArrayList();
         groupList.add(itemInteger);
         groupFeatures.put(entry.getValue(), groupList);
       }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/TransactionSortingMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/TransactionSortingMapper.java?rev=907642&r1=907641&r2=907642&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/TransactionSortingMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/TransactionSortingMapper.java Mon Feb  8 12:51:51 2010
@@ -21,10 +21,8 @@
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
-import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
-import java.util.Map;
 import java.util.Set;
 import java.util.regex.Pattern;
 
@@ -33,6 +31,7 @@
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.mahout.common.Pair;
 import org.apache.mahout.common.Parameters;
+import org.apache.mahout.math.map.OpenObjectIntHashMap;
 
 /**
  * {@link TransactionSortingMapper} maps each transaction to all unique items
@@ -43,7 +42,7 @@
 public class TransactionSortingMapper extends
     Mapper<LongWritable,Text,LongWritable,TransactionTree> {
   
-  private final Map<String,Integer> fMap = new HashMap<String,Integer>();
+  private final OpenObjectIntHashMap<String> fMap = new OpenObjectIntHashMap<String>();
   
   private Pattern splitter;
   

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FrequentPatternMaxHeap.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FrequentPatternMaxHeap.java?rev=907642&r1=907641&r2=907642&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FrequentPatternMaxHeap.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FrequentPatternMaxHeap.java Mon Feb  8 12:51:51 2010
@@ -17,12 +17,12 @@
 
 package org.apache.mahout.fpm.pfpgrowth.fpgrowth;
 
-import java.util.HashMap;
 import java.util.HashSet;
-import java.util.Map;
 import java.util.PriorityQueue;
 import java.util.Set;
 
+import org.apache.mahout.math.map.OpenLongObjectHashMap;
+
 /** {@link FrequentPatternMaxHeap} keeps top K Attributes in a TreeSet */
 public final class FrequentPatternMaxHeap {
   
@@ -34,7 +34,7 @@
   
   private boolean subPatternCheck;
   
-  private Map<Long,Set<Pattern>> patternIndex;
+  private OpenLongObjectHashMap<Set<Pattern>> patternIndex;
   
   private PriorityQueue<Pattern> queue;
   
@@ -42,7 +42,7 @@
     maxSize = numResults;
     queue = new PriorityQueue<Pattern>(maxSize);
     this.subPatternCheck = subPatternCheck;
-    patternIndex = new HashMap<Long,Set<Pattern>>();
+    patternIndex = new OpenLongObjectHashMap<Set<Pattern>>();
     for (Pattern p : queue) {
       Long index = p.support();
       Set<Pattern> patternList;

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java?rev=907642&r1=907641&r2=907642&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java Mon Feb  8 12:51:51 2010
@@ -85,6 +85,7 @@
       maxChunkSizeInBytes = chunkSizeInMB * 1024 * 1024;
       this.outputDir = outputDir;
       fs = FileSystem.get(conf);
+      currentChunkID = 0;
       writer =
           new SequenceFile.Writer(fs, conf, getPath(currentChunkID),
               Text.class, Text.class);

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java?rev=907642&r1=907641&r2=907642&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java Mon Feb  8 12:51:51 2010
@@ -43,7 +43,6 @@
 import org.apache.mahout.common.HadoopUtil;
 import org.apache.mahout.common.StringTuple;
 import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.utils.vectors.common.PartialVectorMergeReducer;
 import org.apache.mahout.utils.vectors.common.PartialVectorMerger;
 import org.apache.mahout.utils.vectors.text.term.TFPartialVectorReducer;
 import org.apache.mahout.utils.vectors.text.term.TermCountMapper;

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java?rev=907642&r1=907641&r2=907642&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java Mon Feb  8 12:51:51 2010
@@ -31,7 +31,6 @@
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.mahout.common.StringTuple;
-import org.apache.mahout.utils.vectors.text.DictionaryVectorizer;
 import org.apache.mahout.utils.vectors.text.DocumentProcessor;
 
 /**
@@ -57,7 +56,7 @@
         document.add(new String(termAtt.termBuffer(), 0, termAtt.termLength()));
       }
     }
-    output.collect(key,document);
+    output.collect(key, document);
   }
   
   @Override

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java?rev=907642&r1=907641&r2=907642&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java Mon Feb  8 12:51:51 2010
@@ -32,7 +32,6 @@
 import org.apache.hadoop.mapred.OutputCollector;
 import org.apache.hadoop.mapred.Reducer;
 import org.apache.hadoop.mapred.Reporter;
-import org.apache.lucene.analysis.Analyzer;
 import org.apache.mahout.common.StringTuple;
 import org.apache.mahout.math.RandomAccessSparseVector;
 import org.apache.mahout.math.Vector;
@@ -44,7 +43,6 @@
  */
 public class TFPartialVectorReducer extends MapReduceBase implements
     Reducer<Text,StringTuple,Text,VectorWritable> {
-  private Analyzer analyzer;
   private final OpenObjectIntHashMap<String> dictionary = new OpenObjectIntHashMap<String>();
   
   private final VectorWritable vectorWritable = new VectorWritable();

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountMapper.java?rev=907642&r1=907641&r2=907642&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountMapper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountMapper.java Mon Feb  8 12:51:51 2010
@@ -18,8 +18,6 @@
 package org.apache.mahout.utils.vectors.text.term;
 
 import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
 import java.util.Map.Entry;
 
 import org.apache.commons.lang.mutable.MutableLong;
@@ -30,6 +28,8 @@
 import org.apache.hadoop.mapred.OutputCollector;
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.mahout.common.StringTuple;
+import org.apache.mahout.math.function.ObjectLongProcedure;
+import org.apache.mahout.math.map.OpenObjectLongHashMap;
 
 /**
  * TextVectorizer Term Count Mapper. Tokenizes a text document and outputs the
@@ -41,20 +41,24 @@
   @Override
   public void map(Text key,
                   StringTuple value,
-                  OutputCollector<Text,LongWritable> output,
-                  Reporter reporter) throws IOException {
-    
-    Map<String,MutableLong> wordCount = new HashMap<String,MutableLong>();
+                  final OutputCollector<Text,LongWritable> output,
+                  final Reporter reporter) throws IOException {
+    OpenObjectLongHashMap<String> wordCount = new OpenObjectLongHashMap<String>();
     for (String word : value.getEntries()) {
       if (wordCount.containsKey(word) == false) {
-        wordCount.put(word, new MutableLong(0));
-      }
-      wordCount.get(word).increment();
-    }
-    
-    for (Entry<String,MutableLong> entry : wordCount.entrySet()) {
-      output.collect(new Text(entry.getKey()), new LongWritable(entry
-          .getValue().longValue()));
+        wordCount.put(word, 1);
+      } else wordCount.put(word, wordCount.get(word) + 1);
     }
+    wordCount.forEachPair(new ObjectLongProcedure<String>() {
+      @Override
+      public boolean apply(String first, long second) {
+        try {
+          output.collect(new Text(first), new LongWritable(second));
+        } catch (IOException e) {
+          reporter.incrCounter("Exception", "Output IO Exception", 1);
+        }
+        return true;
+      }
+    });
   }
 }

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java?rev=907642&r1=907641&r2=907642&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java Mon Feb  8 12:51:51 2010
@@ -43,7 +43,6 @@
 import org.apache.mahout.common.HadoopUtil;
 import org.apache.mahout.common.Pair;
 import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.utils.vectors.common.PartialVectorMergeReducer;
 import org.apache.mahout.utils.vectors.common.PartialVectorMerger;
 import org.apache.mahout.utils.vectors.text.term.TermDocumentCountMapper;
 import org.apache.mahout.utils.vectors.text.term.TermDocumentCountReducer;