You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2010/04/01 13:11:44 UTC
svn commit: r929926 - in /lucene/mahout/trunk/core/src:
main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPGrowth.java
test/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthTest.java
Author: srowen
Date: Thu Apr 1 11:11:44 2010
New Revision: 929926
URL: http://svn.apache.org/viewvc?rev=929926&view=rev
Log:
MAHOUT-355
Modified:
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPGrowth.java
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthTest.java
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPGrowth.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPGrowth.java?rev=929926&r1=929925&r2=929926&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPGrowth.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPGrowth.java Thu Apr 1 11:11:44 2010
@@ -48,20 +48,20 @@ import org.slf4j.LoggerFactory;
/**
* Implementation of PFGrowth Algorithm with FP-Bonsai pruning
- *
+ *
* Generic parameter A is the object type used as the cell items in a transaction list.
- *
+ *
* @param <A>
* the type used
*/
public class FPGrowth<A extends Comparable<? super A>> {
private static final Logger log = LoggerFactory.getLogger(FPGrowth.class);
-
+
public static List<Pair<String,TopKStringPatterns>> readFrequentPattern(FileSystem fs,
Configuration conf,
Path path) throws IOException {
-
+
List<Pair<String,TopKStringPatterns>> ret = new ArrayList<Pair<String,TopKStringPatterns>>();
Text key = new Text();
TopKStringPatterns value = new TopKStringPatterns();
@@ -73,11 +73,11 @@ public class FPGrowth<A extends Comparab
}
return ret;
}
-
+
/**
* Generate the Feature Frequency list from the given transaction whose
* frequency > minSupport
- *
+ *
* @param transactions
* Iterator over the transaction database
* @param minSupport
@@ -86,7 +86,7 @@ public class FPGrowth<A extends Comparab
*/
public final List<Pair<A,Long>> generateFList(Iterator<Pair<List<A>,Long>> transactions,
int minSupport) {
-
+
Map<A,MutableLong> attributeSupport = new HashMap<A,MutableLong>();
// int count = 0;
while (transactions.hasNext()) {
@@ -106,9 +106,9 @@ public class FPGrowth<A extends Comparab
for (Entry<A,MutableLong> e : attributeSupport.entrySet()) {
fList.add(new Pair<A,Long>(e.getKey(), e.getValue().longValue()));
}
-
+
Collections.sort(fList, new Comparator<Pair<A,Long>>() {
-
+
@Override
public int compare(Pair<A,Long> o1, Pair<A,Long> o2) {
int ret = o2.getSecond().compareTo(o1.getSecond());
@@ -117,16 +117,16 @@ public class FPGrowth<A extends Comparab
}
return o1.getFirst().compareTo(o2.getFirst());
}
-
+
});
-
+
return fList;
}
-
+
/**
* Generate Top K Frequent Patterns for every feature in returnableFeatures
* given a stream of transactions and the minimum support
- *
+ *
* @param transactionStream
* Iterator of transaction
* @param frequencyList
@@ -137,7 +137,7 @@ public class FPGrowth<A extends Comparab
* Number of top frequent patterns to keep
* @param returnableFeatures
* set of features for which the frequent patterns are mined. If the
- * set is null, then top K patterns for every frequent item (an item
+ * set is empty or null, then top K patterns for every frequent item (an item
* whose support> minSupport) is generated
* @param output
* The output collector to which the the generated patterns are
@@ -151,10 +151,10 @@ public class FPGrowth<A extends Comparab
Set<A> returnableFeatures,
OutputCollector<A,List<Pair<List<A>,Long>>> output,
StatusUpdater updater) throws IOException {
-
+
Map<Integer,A> reverseMapping = new HashMap<Integer,A>();
Map<A,Integer> attributeIdMapping = new HashMap<A,Integer>();
-
+
int id = 0;
for (Pair<A,Long> feature : frequencyList) {
A attrib = feature.getFirst();
@@ -165,7 +165,7 @@ public class FPGrowth<A extends Comparab
attributeIdMapping.put(attrib, id);
reverseMapping.put(id++, attrib);
}
-
+
long[] attributeFrequency = new long[attributeIdMapping.size()];
for (Pair<A,Long> feature : frequencyList) {
A attrib = feature.getFirst();
@@ -175,11 +175,11 @@ public class FPGrowth<A extends Comparab
}
attributeFrequency[attributeIdMapping.get(attrib)] = frequency;
}
-
+
log.info("Number of unique items {}", frequencyList.size());
-
+
Set<Integer> returnFeatures = new HashSet<Integer>();
- if (returnableFeatures.isEmpty() == false) {
+ if (returnableFeatures != null && !returnableFeatures.isEmpty()) {
for (A attrib : returnableFeatures) {
if (attributeIdMapping.containsKey(attrib)) {
returnFeatures.add(attributeIdMapping.get(attrib));
@@ -192,18 +192,18 @@ public class FPGrowth<A extends Comparab
returnFeatures.add(j);
}
}
-
+
log.info("Number of unique pruned items {}", attributeIdMapping.size());
generateTopKFrequentPatterns(new TransactionIterator<A>(transactionStream,
attributeIdMapping), attributeFrequency, minSupport, k, reverseMapping
.size(), returnFeatures, new TopKPatternsOutputConverter<A>(output,
reverseMapping), updater);
-
+
}
-
+
/**
* Top K FpGrowth Algorithm
- *
+ *
* @param tree
* to be mined
* @param minSupportMutable
@@ -223,9 +223,9 @@ public class FPGrowth<A extends Comparab
Set<Integer> requiredFeatures,
TopKPatternsOutputConverter<A> outputCollector,
StatusUpdater updater) throws IOException {
-
+
long minSupportValue = minSupportMutable.longValue();
-
+
Map<Integer,FrequentPatternMaxHeap> patterns = new HashMap<Integer,FrequentPatternMaxHeap>();
FPTreeDepthCache treeCache = new FPTreeDepthCache();
for (int i = tree.getHeaderTableCount() - 1; i >= 0; i--) {
@@ -239,7 +239,7 @@ public class FPGrowth<A extends Comparab
treeCache, 0, attribute, updater);
patterns.put(attribute, frequentPatterns);
outputCollector.collect(attribute, frequentPatterns);
-
+
minSupportValue = Math.max(minSupportValue, minSupport.longValue() / 2);
log.info("Found {} Patterns with Least Support {}", patterns.get(
attribute).count(), patterns.get(attribute).leastSupport());
@@ -248,13 +248,13 @@ public class FPGrowth<A extends Comparab
treeCache.getHits(), treeCache.getMisses());
return patterns;
}
-
+
private static FrequentPatternMaxHeap generateSinglePathPatterns(FPTree tree,
int k,
MutableLong minSupportMutable) {
FrequentPatternMaxHeap frequentPatterns = new FrequentPatternMaxHeap(k,
false);
-
+
int tempNode = FPTree.ROOTNODEID;
Pattern frequentItem = new Pattern();
while (tree.childCount(tempNode) != 0) {
@@ -271,14 +271,14 @@ public class FPGrowth<A extends Comparab
if (frequentItem.length() > 0) {
frequentPatterns.insert(frequentItem);
}
-
+
return frequentPatterns;
}
-
+
/**
* Internal TopKFrequentPattern Generation algorithm, which represents the A's
* as integers and transforms features to use only integers
- *
+ *
* @param transactions
* Transaction database Iterator
* @param attributeFrequency
@@ -301,12 +301,12 @@ public class FPGrowth<A extends Comparab
long[] attributeFrequency, long minSupport, int k, int featureSetSize,
Set<Integer> returnFeatures, TopKPatternsOutputConverter<A> topKPatternsOutputCollector,
StatusUpdater updater) throws IOException {
-
+
FPTree tree = new FPTree(featureSetSize);
for (int i = 0; i < featureSetSize; i++) {
tree.addHeaderCount(i, attributeFrequency[i]);
}
-
+
// Constructing initial FPTree from the list of transactions
MutableLong minSupportMutable = new MutableLong(minSupport);
int nodecount = 0;
@@ -323,13 +323,13 @@ public class FPGrowth<A extends Comparab
log.info("FPTree Building: Read {} Transactions", i);
}
}
-
+
log.info("Number of Nodes in the FP Tree: {}", nodecount);
-
+
return fpGrowth(tree, minSupportMutable, k, returnFeatures,
topKPatternsOutputCollector, updater);
}
-
+
private static FrequentPatternMaxHeap growth(FPTree tree,
MutableLong minSupportMutable,
int k,
@@ -337,18 +337,18 @@ public class FPGrowth<A extends Comparab
int level,
int currentAttribute,
StatusUpdater updater) {
-
+
FrequentPatternMaxHeap frequentPatterns = new FrequentPatternMaxHeap(k,
true);
-
+
int i = Arrays.binarySearch(tree.getHeaderTableAttributes(),
currentAttribute);
if (i < 0) {
return frequentPatterns;
}
-
+
int headerTableCount = tree.getHeaderTableCount();
-
+
while (i < headerTableCount) {
int attribute = tree.getAttributeAtIndex(i);
long count = tree.getHeaderSupportCount(attribute);
@@ -362,15 +362,15 @@ public class FPGrowth<A extends Comparab
traverseAndBuildConditionalFPTreeData(tree.getHeaderNext(attribute),
minSupportMutable, conditionalTree, tree);
// printTree(conditionalTree);
-
+
}
-
+
FrequentPatternMaxHeap returnedPatterns;
if (attribute == currentAttribute) {
-
+
returnedPatterns = growthTopDown(conditionalTree, minSupportMutable, k,
treeCache, level + 1, true, currentAttribute, updater);
-
+
frequentPatterns = mergeHeap(frequentPatterns, returnedPatterns,
attribute, count, true, true);
} else {
@@ -386,10 +386,10 @@ public class FPGrowth<A extends Comparab
}
i++;
}
-
+
return frequentPatterns;
}
-
+
private static FrequentPatternMaxHeap growthBottomUp(FPTree tree,
MutableLong minSupportMutable,
int k,
@@ -398,10 +398,10 @@ public class FPGrowth<A extends Comparab
boolean conditionalOfCurrentAttribute,
int currentAttribute,
StatusUpdater updater) {
-
+
FrequentPatternMaxHeap frequentPatterns = new FrequentPatternMaxHeap(k,
false);
-
+
if (conditionalOfCurrentAttribute == false) {
int index = Arrays.binarySearch(tree.getHeaderTableAttributes(),
currentAttribute);
@@ -415,11 +415,11 @@ public class FPGrowth<A extends Comparab
}
}
}
-
+
if (tree.singlePath()) {
return generateSinglePathPatterns(tree, k, minSupportMutable);
}
-
+
updater.update("Bottom Up FP Growth");
for (int i = tree.getHeaderTableCount() - 1; i >= 0; i--) {
int attribute = tree.getAttributeAtIndex(i);
@@ -428,14 +428,14 @@ public class FPGrowth<A extends Comparab
continue;
}
FPTree conditionalTree = treeCache.getTree(level);
-
+
FrequentPatternMaxHeap returnedPatterns;
if (conditionalOfCurrentAttribute) {
traverseAndBuildConditionalFPTreeData(tree.getHeaderNext(attribute),
minSupportMutable, conditionalTree, tree);
returnedPatterns = growthBottomUp(conditionalTree, minSupportMutable,
k, treeCache, level + 1, true, currentAttribute, updater);
-
+
frequentPatterns = mergeHeap(frequentPatterns, returnedPatterns,
attribute, count, true, false);
} else {
@@ -444,7 +444,7 @@ public class FPGrowth<A extends Comparab
minSupportMutable, conditionalTree, tree);
returnedPatterns = growthBottomUp(conditionalTree, minSupportMutable,
k, treeCache, level + 1, true, currentAttribute, updater);
-
+
frequentPatterns = mergeHeap(frequentPatterns, returnedPatterns,
attribute, count, true, false);
} else if (attribute > currentAttribute) {
@@ -456,17 +456,17 @@ public class FPGrowth<A extends Comparab
attribute, count, false, false);
}
}
-
+
if (frequentPatterns.isFull()) {
if (minSupportMutable.intValue() < frequentPatterns.leastSupport()) {
minSupportMutable.setValue(frequentPatterns.leastSupport());
}
}
}
-
+
return frequentPatterns;
}
-
+
private static FrequentPatternMaxHeap growthTopDown(FPTree tree,
MutableLong minSupportMutable,
int k,
@@ -475,10 +475,10 @@ public class FPGrowth<A extends Comparab
boolean conditionalOfCurrentAttribute,
int currentAttribute,
StatusUpdater updater) {
-
+
FrequentPatternMaxHeap frequentPatterns = new FrequentPatternMaxHeap(k,
true);
-
+
if (conditionalOfCurrentAttribute == false) {
int index = Arrays.binarySearch(tree.getHeaderTableAttributes(),
currentAttribute);
@@ -492,32 +492,32 @@ public class FPGrowth<A extends Comparab
}
}
}
-
+
if (tree.singlePath()) {
return generateSinglePathPatterns(tree, k, minSupportMutable);
}
-
+
updater.update("Top Down Growth:");
-
+
for (int i = 0; i < tree.getHeaderTableCount(); i++) {
int attribute = tree.getAttributeAtIndex(i);
long count = tree.getHeaderSupportCount(attribute);
if (count < minSupportMutable.longValue()) {
continue;
}
-
+
FPTree conditionalTree = treeCache.getTree(level);
-
+
FrequentPatternMaxHeap returnedPatterns;
if (conditionalOfCurrentAttribute) {
traverseAndBuildConditionalFPTreeData(tree.getHeaderNext(attribute),
minSupportMutable, conditionalTree, tree);
-
+
returnedPatterns = growthBottomUp(conditionalTree, minSupportMutable,
k, treeCache, level + 1, true, currentAttribute, updater);
frequentPatterns = mergeHeap(frequentPatterns, returnedPatterns,
attribute, count, true, true);
-
+
} else {
if (attribute == currentAttribute) {
traverseAndBuildConditionalFPTreeData(tree.getHeaderNext(attribute),
@@ -526,7 +526,7 @@ public class FPGrowth<A extends Comparab
k, treeCache, level + 1, true, currentAttribute, updater);
frequentPatterns = mergeHeap(frequentPatterns, returnedPatterns,
attribute, count, true, false);
-
+
} else if (attribute > currentAttribute) {
traverseAndBuildConditionalFPTreeData(tree.getHeaderNext(attribute),
minSupportMutable, conditionalTree, tree);
@@ -534,7 +534,7 @@ public class FPGrowth<A extends Comparab
k, treeCache, level + 1, false, currentAttribute, updater);
frequentPatterns = mergeHeap(frequentPatterns, returnedPatterns,
attribute, count, false, true);
-
+
}
}
if (frequentPatterns.isFull()) {
@@ -543,10 +543,10 @@ public class FPGrowth<A extends Comparab
}
}
}
-
+
return frequentPatterns;
}
-
+
private static FrequentPatternMaxHeap mergeHeap(FrequentPatternMaxHeap frequentPatterns,
FrequentPatternMaxHeap returnedPatterns,
int attribute,
@@ -559,23 +559,23 @@ public class FPGrowth<A extends Comparab
p.add(attribute, count);
frequentPatterns.insert(p);
}
-
+
return frequentPatterns;
}
-
+
private static void traverseAndBuildConditionalFPTreeData(int firstConditionalNode,
MutableLong minSupportMutable,
FPTree conditionalTree,
FPTree tree) {
-
+
// Build Subtable
int conditionalNode = firstConditionalNode;
-
+
while (conditionalNode != -1) {
long nextNodeCount = tree.count(conditionalNode);
int pathNode = tree.parent(conditionalNode);
int prevConditional = -1;
-
+
while (pathNode != 0) { // dummy root node
int attribute = tree.attribute(pathNode);
if (tree.getHeaderSupportCount(attribute) < minSupportMutable
@@ -585,10 +585,10 @@ public class FPGrowth<A extends Comparab
}
// update and increment the headerTable Counts
conditionalTree.addHeaderCount(attribute, nextNodeCount);
-
+
int conditional = tree.conditional(pathNode);
// if its a new conditional tree node
-
+
if (conditional == 0) {
tree.setConditional(pathNode, conditionalTree.createConditionalNode(
attribute, 0));
@@ -597,35 +597,35 @@ public class FPGrowth<A extends Comparab
} else {
conditionalTree.setSinglePath(false);
}
-
+
if (prevConditional != -1) { // if there is a child element
conditionalTree.setParent(prevConditional, conditional);
}
-
+
conditionalTree.addCount(conditional, nextNodeCount);
prevConditional = conditional;
-
+
pathNode = tree.parent(pathNode);
-
+
}
if (prevConditional != -1) {
conditionalTree.setParent(prevConditional, FPTree.ROOTNODEID);
if (conditionalTree.childCount(FPTree.ROOTNODEID) > 1
&& conditionalTree.singlePath()) {
conditionalTree.setSinglePath(false);
-
+
}
}
conditionalNode = tree.next(conditionalNode);
}
-
+
tree.clearConditional();
conditionalTree.reorderHeaderTable();
pruneFPTree(minSupportMutable, conditionalTree);
// prune Conditional Tree
-
+
}
-
+
private static void pruneFPTree(MutableLong minSupportMutable, FPTree tree) {
for (int i = 0; i < tree.getHeaderTableCount(); i++) {
int currentAttribute = tree.getAttributeAtIndex(i);
@@ -634,31 +634,31 @@ public class FPGrowth<A extends Comparab
int nextNode = tree.getHeaderNext(currentAttribute);
tree.removeHeaderNext(currentAttribute);
while (nextNode != -1) {
-
+
int mychildCount = tree.childCount(nextNode);
-
+
int parentNode = tree.parent(nextNode);
-
+
for (int j = 0; j < mychildCount; j++) {
Integer myChildId = tree.childAtIndex(nextNode, j);
tree.replaceChild(parentNode, nextNode, myChildId);
}
nextNode = tree.next(nextNode);
}
-
+
}
}
-
+
for (int i = 0; i < tree.getHeaderTableCount(); i++) {
int currentAttribute = tree.getAttributeAtIndex(i);
int nextNode = tree.getHeaderNext(currentAttribute);
-
+
OpenIntIntHashMap prevNode = new OpenIntIntHashMap();
int justPrevNode = -1;
while (nextNode != -1) {
-
+
int parent = tree.parent(nextNode);
-
+
if (prevNode.containsKey(parent) == false) {
prevNode.put(parent, nextNode);
} else {
@@ -677,15 +677,15 @@ public class FPGrowth<A extends Comparab
nextNode = tree.next(nextNode);
}
}
-
+
// prune Conditional Tree
-
+
}
-
+
/**
* Create FPTree with node counts incremented by addCount variable given the
* root node and the List of Attributes in transaction sorted by support
- *
+ *
* @param tree
* object to which the transaction has to be added to
* @param myList
@@ -704,11 +704,11 @@ public class FPGrowth<A extends Comparab
long addCount,
MutableLong minSupport,
long[] attributeFrequency) {
-
+
int temp = FPTree.ROOTNODEID;
int ret = 0;
boolean addCountMode = true;
-
+
for (int attribute : myList) {
if (attributeFrequency[attribute] < minSupport.intValue()) {
return ret;
@@ -729,8 +729,8 @@ public class FPGrowth<A extends Comparab
ret++;
}
}
-
+
return ret;
-
+
}
}
Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthTest.java?rev=929926&r1=929925&r2=929926&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthTest.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthTest.java Thu Apr 1 11:11:44 2010
@@ -30,6 +30,7 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.OutputCollector;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.Pair;
import org.apache.mahout.fpm.pfpgrowth.convertors.ContextStatusUpdater;
@@ -84,4 +85,30 @@ public class FPGrowthTest extends Mahout
frequentPatterns.toString());
}
+
+ /**
+ * Trivial test for MAHOUT-355
+ */
+ public void testNoNullPointerExceptionWhenReturnableFeaturesIsNull() throws IOException {
+
+ FPGrowth<String> fp = new FPGrowth<String>();
+
+ Collection<Pair<List<String>,Long>> transactions = new ArrayList<Pair<List<String>,Long>>();
+ transactions.add(new Pair<List<String>,Long>(Arrays.asList("E", "A", "D", "B"), 1L));
+
+ OutputCollector<String, List<Pair<List<String>, Long>>> noOutput = new OutputCollector<String,List<Pair<List<String>,Long>>>() {
+ @Override
+ public void collect(String arg0, List<Pair<List<String>, Long>> arg1) {
+ }
+ };
+
+ fp.generateTopKFrequentPatterns(
+ transactions.iterator(),
+ fp.generateFList(transactions.iterator(), 3),
+ 3,
+ 100,
+ null,
+ noOutput,
+ new ContextStatusUpdater(null));
+ }
}