You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ro...@apache.org on 2010/02/08 13:51:52 UTC
svn commit: r907642 - in /lucene/mahout/trunk:
core/src/main/java/org/apache/mahout/fpm/pfpgrowth/
core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/
examples/src/main/java/org/apache/mahout/text/
utils/src/main/java/org/apache/mahout/utils/v...
Author: robinanil
Date: Mon Feb 8 12:51:51 2010
New Revision: 907642
URL: http://svn.apache.org/viewvc?rev=907642&view=rev
Log:
Transformed code to use Mahout-math collections instead of HashMap (only the easier cases). No changes were made to public functions.
Modified:
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthMapper.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthReducer.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/TransactionSortingMapper.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FrequentPatternMaxHeap.java
lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountMapper.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthMapper.java?rev=907642&r1=907641&r2=907642&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthMapper.java Mon Feb 8 12:51:51 2010
@@ -18,18 +18,18 @@
package org.apache.mahout.fpm.pfpgrowth;
import java.io.IOException;
-import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
-import java.util.Map;
-import java.util.Map.Entry;
import java.util.Set;
+import java.util.Map.Entry;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.Parameters;
+import org.apache.mahout.math.map.OpenIntLongHashMap;
+import org.apache.mahout.math.map.OpenObjectIntHashMap;
/**
* {@link ParallelFPGrowthMapper} maps each transaction to all unique items
@@ -40,7 +40,7 @@
public class ParallelFPGrowthMapper extends
Mapper<LongWritable,TransactionTree,LongWritable,TransactionTree> {
- private final Map<Integer,Long> gListInt = new HashMap<Integer,Long>();
+ private final OpenIntLongHashMap gListInt = new OpenIntLongHashMap();
@Override
protected void map(LongWritable offset,
@@ -83,7 +83,7 @@
Parameters params = Parameters.fromString(context.getConfiguration().get(
"pfp.parameters", ""));
- Map<String,Integer> fMap = new HashMap<String,Integer>();
+ OpenObjectIntHashMap<String> fMap = new OpenObjectIntHashMap<String>();
int i = 0;
for (Pair<String,Long> e : PFPGrowth.deserializeList(params, "fList",
context.getConfiguration())) {
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthReducer.java?rev=907642&r1=907641&r2=907642&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthReducer.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthReducer.java Mon Feb 8 12:51:51 2010
@@ -21,7 +21,6 @@
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
-import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
@@ -40,6 +39,9 @@
import org.apache.mahout.fpm.pfpgrowth.convertors.string.TopKStringPatterns;
import org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPGrowth;
import org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPTreeDepthCache;
+import org.apache.mahout.math.list.IntArrayList;
+import org.apache.mahout.math.map.OpenLongObjectHashMap;
+import org.apache.mahout.math.map.OpenObjectIntHashMap;
/**
* {@link ParallelFPGrowthReducer} takes each group of transactions and runs
@@ -55,11 +57,11 @@
private final List<String> featureReverseMap = new ArrayList<String>();
- private final Map<String,Integer> fMap = new HashMap<String,Integer>();
+ private final OpenObjectIntHashMap<String> fMap = new OpenObjectIntHashMap<String>();
private final List<String> fRMap = new ArrayList<String>();
- private final Map<Long,List<Integer>> groupFeatures = new HashMap<Long,List<Integer>>();
+ private final OpenLongObjectHashMap<IntArrayList> groupFeatures = new OpenLongObjectHashMap<IntArrayList>();
private int maxHeapSize = 50;
@@ -100,17 +102,18 @@
});
FPGrowth<Integer> fpGrowth = new FPGrowth<Integer>();
- fpGrowth.generateTopKFrequentPatterns(
- cTree.getIterator(),
- localFList,
- minSupport,
- maxHeapSize,
- new HashSet<Integer>(groupFeatures.get(key.get())),
- new IntegerStringOutputConverter(
- new ContextWriteOutputCollector<LongWritable,TransactionTree,Text,TopKStringPatterns>(
- context), featureReverseMap),
- new ContextStatusUpdater<LongWritable,TransactionTree,Text,TopKStringPatterns>(
- context));
+ fpGrowth
+ .generateTopKFrequentPatterns(
+ cTree.getIterator(),
+ localFList,
+ minSupport,
+ maxHeapSize,
+ new HashSet<Integer>(groupFeatures.get(key.get()).toList()),
+ new IntegerStringOutputConverter(
+ new ContextWriteOutputCollector<LongWritable,TransactionTree,Text,TopKStringPatterns>(
+ context), featureReverseMap),
+ new ContextStatusUpdater<LongWritable,TransactionTree,Text,TopKStringPatterns>(
+ context));
}
@Override
@@ -135,12 +138,12 @@
.getConfiguration());
for (Entry<String,Long> entry : gList.entrySet()) {
- List<Integer> groupList = groupFeatures.get(entry.getValue());
+ IntArrayList groupList = groupFeatures.get(entry.getValue());
Integer itemInteger = fMap.get(entry.getKey());
if (groupList != null) {
groupList.add(itemInteger);
} else {
- groupList = new ArrayList<Integer>();
+ groupList = new IntArrayList();
groupList.add(itemInteger);
groupFeatures.put(entry.getValue(), groupList);
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/TransactionSortingMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/TransactionSortingMapper.java?rev=907642&r1=907641&r2=907642&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/TransactionSortingMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/TransactionSortingMapper.java Mon Feb 8 12:51:51 2010
@@ -21,10 +21,8 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
-import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
-import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
@@ -33,6 +31,7 @@
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.Parameters;
+import org.apache.mahout.math.map.OpenObjectIntHashMap;
/**
* {@link TransactionSortingMapper} maps each transaction to all unique items
@@ -43,7 +42,7 @@
public class TransactionSortingMapper extends
Mapper<LongWritable,Text,LongWritable,TransactionTree> {
- private final Map<String,Integer> fMap = new HashMap<String,Integer>();
+ private final OpenObjectIntHashMap<String> fMap = new OpenObjectIntHashMap<String>();
private Pattern splitter;
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FrequentPatternMaxHeap.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FrequentPatternMaxHeap.java?rev=907642&r1=907641&r2=907642&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FrequentPatternMaxHeap.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FrequentPatternMaxHeap.java Mon Feb 8 12:51:51 2010
@@ -17,12 +17,12 @@
package org.apache.mahout.fpm.pfpgrowth.fpgrowth;
-import java.util.HashMap;
import java.util.HashSet;
-import java.util.Map;
import java.util.PriorityQueue;
import java.util.Set;
+import org.apache.mahout.math.map.OpenLongObjectHashMap;
+
/** {@link FrequentPatternMaxHeap} keeps top K Attributes in a TreeSet */
public final class FrequentPatternMaxHeap {
@@ -34,7 +34,7 @@
private boolean subPatternCheck;
- private Map<Long,Set<Pattern>> patternIndex;
+ private OpenLongObjectHashMap<Set<Pattern>> patternIndex;
private PriorityQueue<Pattern> queue;
@@ -42,7 +42,7 @@
maxSize = numResults;
queue = new PriorityQueue<Pattern>(maxSize);
this.subPatternCheck = subPatternCheck;
- patternIndex = new HashMap<Long,Set<Pattern>>();
+ patternIndex = new OpenLongObjectHashMap<Set<Pattern>>();
for (Pattern p : queue) {
Long index = p.support();
Set<Pattern> patternList;
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java?rev=907642&r1=907641&r2=907642&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java Mon Feb 8 12:51:51 2010
@@ -85,6 +85,7 @@
maxChunkSizeInBytes = chunkSizeInMB * 1024 * 1024;
this.outputDir = outputDir;
fs = FileSystem.get(conf);
+ currentChunkID = 0;
writer =
new SequenceFile.Writer(fs, conf, getPath(currentChunkID),
Text.class, Text.class);
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java?rev=907642&r1=907641&r2=907642&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java Mon Feb 8 12:51:51 2010
@@ -43,7 +43,6 @@
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.StringTuple;
import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.utils.vectors.common.PartialVectorMergeReducer;
import org.apache.mahout.utils.vectors.common.PartialVectorMerger;
import org.apache.mahout.utils.vectors.text.term.TFPartialVectorReducer;
import org.apache.mahout.utils.vectors.text.term.TermCountMapper;
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java?rev=907642&r1=907641&r2=907642&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java Mon Feb 8 12:51:51 2010
@@ -31,7 +31,6 @@
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.mahout.common.StringTuple;
-import org.apache.mahout.utils.vectors.text.DictionaryVectorizer;
import org.apache.mahout.utils.vectors.text.DocumentProcessor;
/**
@@ -57,7 +56,7 @@
document.add(new String(termAtt.termBuffer(), 0, termAtt.termLength()));
}
}
- output.collect(key,document);
+ output.collect(key, document);
}
@Override
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java?rev=907642&r1=907641&r2=907642&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java Mon Feb 8 12:51:51 2010
@@ -32,7 +32,6 @@
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
-import org.apache.lucene.analysis.Analyzer;
import org.apache.mahout.common.StringTuple;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
@@ -44,7 +43,6 @@
*/
public class TFPartialVectorReducer extends MapReduceBase implements
Reducer<Text,StringTuple,Text,VectorWritable> {
- private Analyzer analyzer;
private final OpenObjectIntHashMap<String> dictionary = new OpenObjectIntHashMap<String>();
private final VectorWritable vectorWritable = new VectorWritable();
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountMapper.java?rev=907642&r1=907641&r2=907642&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountMapper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountMapper.java Mon Feb 8 12:51:51 2010
@@ -18,8 +18,6 @@
package org.apache.mahout.utils.vectors.text.term;
import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
import java.util.Map.Entry;
import org.apache.commons.lang.mutable.MutableLong;
@@ -30,6 +28,8 @@
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.mahout.common.StringTuple;
+import org.apache.mahout.math.function.ObjectLongProcedure;
+import org.apache.mahout.math.map.OpenObjectLongHashMap;
/**
* TextVectorizer Term Count Mapper. Tokenizes a text document and outputs the
@@ -41,20 +41,24 @@
@Override
public void map(Text key,
StringTuple value,
- OutputCollector<Text,LongWritable> output,
- Reporter reporter) throws IOException {
-
- Map<String,MutableLong> wordCount = new HashMap<String,MutableLong>();
+ final OutputCollector<Text,LongWritable> output,
+ final Reporter reporter) throws IOException {
+ OpenObjectLongHashMap<String> wordCount = new OpenObjectLongHashMap<String>();
for (String word : value.getEntries()) {
if (wordCount.containsKey(word) == false) {
- wordCount.put(word, new MutableLong(0));
- }
- wordCount.get(word).increment();
- }
-
- for (Entry<String,MutableLong> entry : wordCount.entrySet()) {
- output.collect(new Text(entry.getKey()), new LongWritable(entry
- .getValue().longValue()));
+ wordCount.put(word, 1);
+ } else wordCount.put(word, wordCount.get(word) + 1);
}
+ wordCount.forEachPair(new ObjectLongProcedure<String>() {
+ @Override
+ public boolean apply(String first, long second) {
+ try {
+ output.collect(new Text(first), new LongWritable(second));
+ } catch (IOException e) {
+ reporter.incrCounter("Exception", "Output IO Exception", 1);
+ }
+ return true;
+ }
+ });
}
}
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java?rev=907642&r1=907641&r2=907642&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java Mon Feb 8 12:51:51 2010
@@ -43,7 +43,6 @@
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.Pair;
import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.utils.vectors.common.PartialVectorMergeReducer;
import org.apache.mahout.utils.vectors.common.PartialVectorMerger;
import org.apache.mahout.utils.vectors.text.term.TermDocumentCountMapper;
import org.apache.mahout.utils.vectors.text.term.TermDocumentCountReducer;