You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2010/04/01 13:11:44 UTC

svn commit: r929926 - in /lucene/mahout/trunk/core/src: main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPGrowth.java test/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthTest.java

Author: srowen
Date: Thu Apr  1 11:11:44 2010
New Revision: 929926

URL: http://svn.apache.org/viewvc?rev=929926&view=rev
Log:
MAHOUT-355

Modified:
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPGrowth.java
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthTest.java

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPGrowth.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPGrowth.java?rev=929926&r1=929925&r2=929926&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPGrowth.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPGrowth.java Thu Apr  1 11:11:44 2010
@@ -48,20 +48,20 @@ import org.slf4j.LoggerFactory;
 
 /**
  * Implementation of PFGrowth Algorithm with FP-Bonsai pruning
- * 
+ *
  * Generic parameter A is the object type used as the cell items in a transaction list.
- * 
+ *
  * @param <A>
  *          the type used
  */
 public class FPGrowth<A extends Comparable<? super A>> {
 
   private static final Logger log = LoggerFactory.getLogger(FPGrowth.class);
-  
+
   public static List<Pair<String,TopKStringPatterns>> readFrequentPattern(FileSystem fs,
     Configuration conf,
     Path path) throws IOException {
-    
+
     List<Pair<String,TopKStringPatterns>> ret = new ArrayList<Pair<String,TopKStringPatterns>>();
     Text key = new Text();
     TopKStringPatterns value = new TopKStringPatterns();
@@ -73,11 +73,11 @@ public class FPGrowth<A extends Comparab
     }
     return ret;
   }
-  
+
   /**
    * Generate the Feature Frequency list from the given transaction whose
    * frequency > minSupport
-   * 
+   *
    * @param transactions
    *          Iterator over the transaction database
    * @param minSupport
@@ -86,7 +86,7 @@ public class FPGrowth<A extends Comparab
    */
   public final List<Pair<A,Long>> generateFList(Iterator<Pair<List<A>,Long>> transactions,
     int minSupport) {
-    
+
     Map<A,MutableLong> attributeSupport = new HashMap<A,MutableLong>();
     // int count = 0;
     while (transactions.hasNext()) {
@@ -106,9 +106,9 @@ public class FPGrowth<A extends Comparab
     for (Entry<A,MutableLong> e : attributeSupport.entrySet()) {
       fList.add(new Pair<A,Long>(e.getKey(), e.getValue().longValue()));
     }
-    
+
     Collections.sort(fList, new Comparator<Pair<A,Long>>() {
-      
+
       @Override
       public int compare(Pair<A,Long> o1, Pair<A,Long> o2) {
         int ret = o2.getSecond().compareTo(o1.getSecond());
@@ -117,16 +117,16 @@ public class FPGrowth<A extends Comparab
         }
         return o1.getFirst().compareTo(o2.getFirst());
       }
-      
+
     });
-    
+
     return fList;
   }
-  
+
   /**
    * Generate Top K Frequent Patterns for every feature in returnableFeatures
    * given a stream of transactions and the minimum support
-   * 
+   *
    * @param transactionStream
    *          Iterator of transaction
    * @param frequencyList
@@ -137,7 +137,7 @@ public class FPGrowth<A extends Comparab
    *          Number of top frequent patterns to keep
    * @param returnableFeatures
    *          set of features for which the frequent patterns are mined. If the
-   *          set is null, then top K patterns for every frequent item (an item
+   *          set is empty or null, then top K patterns for every frequent item (an item
    *          whose support> minSupport) is generated
    * @param output
    *          The output collector to which the the generated patterns are
@@ -151,10 +151,10 @@ public class FPGrowth<A extends Comparab
                                                  Set<A> returnableFeatures,
                                                  OutputCollector<A,List<Pair<List<A>,Long>>> output,
                                                  StatusUpdater updater) throws IOException {
-    
+
     Map<Integer,A> reverseMapping = new HashMap<Integer,A>();
     Map<A,Integer> attributeIdMapping = new HashMap<A,Integer>();
-    
+
     int id = 0;
     for (Pair<A,Long> feature : frequencyList) {
       A attrib = feature.getFirst();
@@ -165,7 +165,7 @@ public class FPGrowth<A extends Comparab
       attributeIdMapping.put(attrib, id);
       reverseMapping.put(id++, attrib);
     }
-    
+
     long[] attributeFrequency = new long[attributeIdMapping.size()];
     for (Pair<A,Long> feature : frequencyList) {
       A attrib = feature.getFirst();
@@ -175,11 +175,11 @@ public class FPGrowth<A extends Comparab
       }
       attributeFrequency[attributeIdMapping.get(attrib)] = frequency;
     }
-    
+
     log.info("Number of unique items {}", frequencyList.size());
-    
+
     Set<Integer> returnFeatures = new HashSet<Integer>();
-    if (returnableFeatures.isEmpty() == false) {
+    if (returnableFeatures != null && !returnableFeatures.isEmpty()) {
       for (A attrib : returnableFeatures) {
         if (attributeIdMapping.containsKey(attrib)) {
           returnFeatures.add(attributeIdMapping.get(attrib));
@@ -192,18 +192,18 @@ public class FPGrowth<A extends Comparab
         returnFeatures.add(j);
       }
     }
-    
+
     log.info("Number of unique pruned items {}", attributeIdMapping.size());
     generateTopKFrequentPatterns(new TransactionIterator<A>(transactionStream,
         attributeIdMapping), attributeFrequency, minSupport, k, reverseMapping
         .size(), returnFeatures, new TopKPatternsOutputConverter<A>(output,
             reverseMapping), updater);
-    
+
   }
-  
+
   /**
    * Top K FpGrowth Algorithm
-   * 
+   *
    * @param tree
    *          to be mined
    * @param minSupportMutable
@@ -223,9 +223,9 @@ public class FPGrowth<A extends Comparab
     Set<Integer> requiredFeatures,
     TopKPatternsOutputConverter<A> outputCollector,
     StatusUpdater updater) throws IOException {
-    
+
     long minSupportValue = minSupportMutable.longValue();
-    
+
     Map<Integer,FrequentPatternMaxHeap> patterns = new HashMap<Integer,FrequentPatternMaxHeap>();
     FPTreeDepthCache treeCache = new FPTreeDepthCache();
     for (int i = tree.getHeaderTableCount() - 1; i >= 0; i--) {
@@ -239,7 +239,7 @@ public class FPGrowth<A extends Comparab
         treeCache, 0, attribute, updater);
       patterns.put(attribute, frequentPatterns);
       outputCollector.collect(attribute, frequentPatterns);
-      
+
       minSupportValue = Math.max(minSupportValue, minSupport.longValue() / 2);
       log.info("Found {} Patterns with Least Support {}", patterns.get(
         attribute).count(), patterns.get(attribute).leastSupport());
@@ -248,13 +248,13 @@ public class FPGrowth<A extends Comparab
       treeCache.getHits(), treeCache.getMisses());
     return patterns;
   }
-  
+
   private static FrequentPatternMaxHeap generateSinglePathPatterns(FPTree tree,
                                                                    int k,
                                                                    MutableLong minSupportMutable) {
     FrequentPatternMaxHeap frequentPatterns = new FrequentPatternMaxHeap(k,
       false);
-    
+
     int tempNode = FPTree.ROOTNODEID;
     Pattern frequentItem = new Pattern();
     while (tree.childCount(tempNode) != 0) {
@@ -271,14 +271,14 @@ public class FPGrowth<A extends Comparab
     if (frequentItem.length() > 0) {
       frequentPatterns.insert(frequentItem);
     }
-    
+
     return frequentPatterns;
   }
-  
+
   /**
    * Internal TopKFrequentPattern Generation algorithm, which represents the A's
    * as integers and transforms features to use only integers
-   * 
+   *
    * @param transactions
    *          Transaction database Iterator
    * @param attributeFrequency
@@ -301,12 +301,12 @@ public class FPGrowth<A extends Comparab
     long[] attributeFrequency, long minSupport, int k, int featureSetSize,
     Set<Integer> returnFeatures, TopKPatternsOutputConverter<A> topKPatternsOutputCollector,
     StatusUpdater updater) throws IOException {
-    
+
     FPTree tree = new FPTree(featureSetSize);
     for (int i = 0; i < featureSetSize; i++) {
       tree.addHeaderCount(i, attributeFrequency[i]);
     }
-    
+
     // Constructing initial FPTree from the list of transactions
     MutableLong minSupportMutable = new MutableLong(minSupport);
     int nodecount = 0;
@@ -323,13 +323,13 @@ public class FPGrowth<A extends Comparab
         log.info("FPTree Building: Read {} Transactions", i);
       }
     }
-    
+
     log.info("Number of Nodes in the FP Tree: {}", nodecount);
-    
+
     return fpGrowth(tree, minSupportMutable, k, returnFeatures,
       topKPatternsOutputCollector, updater);
   }
-  
+
   private static FrequentPatternMaxHeap growth(FPTree tree,
                                                MutableLong minSupportMutable,
                                                int k,
@@ -337,18 +337,18 @@ public class FPGrowth<A extends Comparab
                                                int level,
                                                int currentAttribute,
                                                StatusUpdater updater) {
-    
+
     FrequentPatternMaxHeap frequentPatterns = new FrequentPatternMaxHeap(k,
       true);
-    
+
     int i = Arrays.binarySearch(tree.getHeaderTableAttributes(),
       currentAttribute);
     if (i < 0) {
       return frequentPatterns;
     }
-    
+
     int headerTableCount = tree.getHeaderTableCount();
-    
+
     while (i < headerTableCount) {
       int attribute = tree.getAttributeAtIndex(i);
       long count = tree.getHeaderSupportCount(attribute);
@@ -362,15 +362,15 @@ public class FPGrowth<A extends Comparab
         traverseAndBuildConditionalFPTreeData(tree.getHeaderNext(attribute),
           minSupportMutable, conditionalTree, tree);
         // printTree(conditionalTree);
-        
+
       }
-      
+
       FrequentPatternMaxHeap returnedPatterns;
       if (attribute == currentAttribute) {
-        
+
         returnedPatterns = growthTopDown(conditionalTree, minSupportMutable, k,
           treeCache, level + 1, true, currentAttribute, updater);
-        
+
         frequentPatterns = mergeHeap(frequentPatterns, returnedPatterns,
           attribute, count, true, true);
       } else {
@@ -386,10 +386,10 @@ public class FPGrowth<A extends Comparab
       }
       i++;
     }
-    
+
     return frequentPatterns;
   }
-  
+
   private static FrequentPatternMaxHeap growthBottomUp(FPTree tree,
                                                        MutableLong minSupportMutable,
                                                        int k,
@@ -398,10 +398,10 @@ public class FPGrowth<A extends Comparab
                                                        boolean conditionalOfCurrentAttribute,
                                                        int currentAttribute,
                                                        StatusUpdater updater) {
-    
+
     FrequentPatternMaxHeap frequentPatterns = new FrequentPatternMaxHeap(k,
       false);
-    
+
     if (conditionalOfCurrentAttribute == false) {
       int index = Arrays.binarySearch(tree.getHeaderTableAttributes(),
         currentAttribute);
@@ -415,11 +415,11 @@ public class FPGrowth<A extends Comparab
         }
       }
     }
-    
+
     if (tree.singlePath()) {
       return generateSinglePathPatterns(tree, k, minSupportMutable);
     }
-    
+
     updater.update("Bottom Up FP Growth");
     for (int i = tree.getHeaderTableCount() - 1; i >= 0; i--) {
       int attribute = tree.getAttributeAtIndex(i);
@@ -428,14 +428,14 @@ public class FPGrowth<A extends Comparab
         continue;
       }
       FPTree conditionalTree = treeCache.getTree(level);
-      
+
       FrequentPatternMaxHeap returnedPatterns;
       if (conditionalOfCurrentAttribute) {
         traverseAndBuildConditionalFPTreeData(tree.getHeaderNext(attribute),
           minSupportMutable, conditionalTree, tree);
         returnedPatterns = growthBottomUp(conditionalTree, minSupportMutable,
           k, treeCache, level + 1, true, currentAttribute, updater);
-        
+
         frequentPatterns = mergeHeap(frequentPatterns, returnedPatterns,
           attribute, count, true, false);
       } else {
@@ -444,7 +444,7 @@ public class FPGrowth<A extends Comparab
             minSupportMutable, conditionalTree, tree);
           returnedPatterns = growthBottomUp(conditionalTree, minSupportMutable,
             k, treeCache, level + 1, true, currentAttribute, updater);
-          
+
           frequentPatterns = mergeHeap(frequentPatterns, returnedPatterns,
             attribute, count, true, false);
         } else if (attribute > currentAttribute) {
@@ -456,17 +456,17 @@ public class FPGrowth<A extends Comparab
             attribute, count, false, false);
         }
       }
-      
+
       if (frequentPatterns.isFull()) {
         if (minSupportMutable.intValue() < frequentPatterns.leastSupport()) {
           minSupportMutable.setValue(frequentPatterns.leastSupport());
         }
       }
     }
-    
+
     return frequentPatterns;
   }
-  
+
   private static FrequentPatternMaxHeap growthTopDown(FPTree tree,
                                                       MutableLong minSupportMutable,
                                                       int k,
@@ -475,10 +475,10 @@ public class FPGrowth<A extends Comparab
                                                       boolean conditionalOfCurrentAttribute,
                                                       int currentAttribute,
                                                       StatusUpdater updater) {
-    
+
     FrequentPatternMaxHeap frequentPatterns = new FrequentPatternMaxHeap(k,
       true);
-    
+
     if (conditionalOfCurrentAttribute == false) {
       int index = Arrays.binarySearch(tree.getHeaderTableAttributes(),
         currentAttribute);
@@ -492,32 +492,32 @@ public class FPGrowth<A extends Comparab
         }
       }
     }
-    
+
     if (tree.singlePath()) {
       return generateSinglePathPatterns(tree, k, minSupportMutable);
     }
-    
+
     updater.update("Top Down Growth:");
-    
+
     for (int i = 0; i < tree.getHeaderTableCount(); i++) {
       int attribute = tree.getAttributeAtIndex(i);
       long count = tree.getHeaderSupportCount(attribute);
       if (count < minSupportMutable.longValue()) {
         continue;
       }
-      
+
       FPTree conditionalTree = treeCache.getTree(level);
-      
+
       FrequentPatternMaxHeap returnedPatterns;
       if (conditionalOfCurrentAttribute) {
         traverseAndBuildConditionalFPTreeData(tree.getHeaderNext(attribute),
           minSupportMutable, conditionalTree, tree);
-        
+
         returnedPatterns = growthBottomUp(conditionalTree, minSupportMutable,
           k, treeCache, level + 1, true, currentAttribute, updater);
         frequentPatterns = mergeHeap(frequentPatterns, returnedPatterns,
           attribute, count, true, true);
-        
+
       } else {
         if (attribute == currentAttribute) {
           traverseAndBuildConditionalFPTreeData(tree.getHeaderNext(attribute),
@@ -526,7 +526,7 @@ public class FPGrowth<A extends Comparab
             k, treeCache, level + 1, true, currentAttribute, updater);
           frequentPatterns = mergeHeap(frequentPatterns, returnedPatterns,
             attribute, count, true, false);
-          
+
         } else if (attribute > currentAttribute) {
           traverseAndBuildConditionalFPTreeData(tree.getHeaderNext(attribute),
             minSupportMutable, conditionalTree, tree);
@@ -534,7 +534,7 @@ public class FPGrowth<A extends Comparab
             k, treeCache, level + 1, false, currentAttribute, updater);
           frequentPatterns = mergeHeap(frequentPatterns, returnedPatterns,
             attribute, count, false, true);
-          
+
         }
       }
       if (frequentPatterns.isFull()) {
@@ -543,10 +543,10 @@ public class FPGrowth<A extends Comparab
         }
       }
     }
-    
+
     return frequentPatterns;
   }
-  
+
   private static FrequentPatternMaxHeap mergeHeap(FrequentPatternMaxHeap frequentPatterns,
                                                   FrequentPatternMaxHeap returnedPatterns,
                                                   int attribute,
@@ -559,23 +559,23 @@ public class FPGrowth<A extends Comparab
       p.add(attribute, count);
       frequentPatterns.insert(p);
     }
-    
+
     return frequentPatterns;
   }
-  
+
   private static void traverseAndBuildConditionalFPTreeData(int firstConditionalNode,
                                                             MutableLong minSupportMutable,
                                                             FPTree conditionalTree,
                                                             FPTree tree) {
-    
+
     // Build Subtable
     int conditionalNode = firstConditionalNode;
-    
+
     while (conditionalNode != -1) {
       long nextNodeCount = tree.count(conditionalNode);
       int pathNode = tree.parent(conditionalNode);
       int prevConditional = -1;
-      
+
       while (pathNode != 0) { // dummy root node
         int attribute = tree.attribute(pathNode);
         if (tree.getHeaderSupportCount(attribute) < minSupportMutable
@@ -585,10 +585,10 @@ public class FPGrowth<A extends Comparab
         }
         // update and increment the headerTable Counts
         conditionalTree.addHeaderCount(attribute, nextNodeCount);
-        
+
         int conditional = tree.conditional(pathNode);
         // if its a new conditional tree node
-        
+
         if (conditional == 0) {
           tree.setConditional(pathNode, conditionalTree.createConditionalNode(
             attribute, 0));
@@ -597,35 +597,35 @@ public class FPGrowth<A extends Comparab
         } else {
           conditionalTree.setSinglePath(false);
         }
-        
+
         if (prevConditional != -1) { // if there is a child element
           conditionalTree.setParent(prevConditional, conditional);
         }
-        
+
         conditionalTree.addCount(conditional, nextNodeCount);
         prevConditional = conditional;
-        
+
         pathNode = tree.parent(pathNode);
-        
+
       }
       if (prevConditional != -1) {
         conditionalTree.setParent(prevConditional, FPTree.ROOTNODEID);
         if (conditionalTree.childCount(FPTree.ROOTNODEID) > 1
             && conditionalTree.singlePath()) {
           conditionalTree.setSinglePath(false);
-          
+
         }
       }
       conditionalNode = tree.next(conditionalNode);
     }
-    
+
     tree.clearConditional();
     conditionalTree.reorderHeaderTable();
     pruneFPTree(minSupportMutable, conditionalTree);
     // prune Conditional Tree
-    
+
   }
-  
+
   private static void pruneFPTree(MutableLong minSupportMutable, FPTree tree) {
     for (int i = 0; i < tree.getHeaderTableCount(); i++) {
       int currentAttribute = tree.getAttributeAtIndex(i);
@@ -634,31 +634,31 @@ public class FPGrowth<A extends Comparab
         int nextNode = tree.getHeaderNext(currentAttribute);
         tree.removeHeaderNext(currentAttribute);
         while (nextNode != -1) {
-          
+
           int mychildCount = tree.childCount(nextNode);
-          
+
           int parentNode = tree.parent(nextNode);
-          
+
           for (int j = 0; j < mychildCount; j++) {
             Integer myChildId = tree.childAtIndex(nextNode, j);
             tree.replaceChild(parentNode, nextNode, myChildId);
           }
           nextNode = tree.next(nextNode);
         }
-        
+
       }
     }
-    
+
     for (int i = 0; i < tree.getHeaderTableCount(); i++) {
       int currentAttribute = tree.getAttributeAtIndex(i);
       int nextNode = tree.getHeaderNext(currentAttribute);
-      
+
       OpenIntIntHashMap prevNode = new OpenIntIntHashMap();
       int justPrevNode = -1;
       while (nextNode != -1) {
-        
+
         int parent = tree.parent(nextNode);
-        
+
         if (prevNode.containsKey(parent) == false) {
           prevNode.put(parent, nextNode);
         } else {
@@ -677,15 +677,15 @@ public class FPGrowth<A extends Comparab
         nextNode = tree.next(nextNode);
       }
     }
-    
+
     // prune Conditional Tree
-    
+
   }
-  
+
   /**
    * Create FPTree with node counts incremented by addCount variable given the
    * root node and the List of Attributes in transaction sorted by support
-   * 
+   *
    * @param tree
    *          object to which the transaction has to be added to
    * @param myList
@@ -704,11 +704,11 @@ public class FPGrowth<A extends Comparab
                                   long addCount,
                                   MutableLong minSupport,
                                   long[] attributeFrequency) {
-    
+
     int temp = FPTree.ROOTNODEID;
     int ret = 0;
     boolean addCountMode = true;
-    
+
     for (int attribute : myList) {
       if (attributeFrequency[attribute] < minSupport.intValue()) {
         return ret;
@@ -729,8 +729,8 @@ public class FPGrowth<A extends Comparab
         ret++;
       }
     }
-    
+
     return ret;
-    
+
   }
 }

Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthTest.java?rev=929926&r1=929925&r2=929926&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthTest.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthTest.java Thu Apr  1 11:11:44 2010
@@ -30,6 +30,7 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.OutputCollector;
 import org.apache.mahout.common.MahoutTestCase;
 import org.apache.mahout.common.Pair;
 import org.apache.mahout.fpm.pfpgrowth.convertors.ContextStatusUpdater;
@@ -84,4 +85,30 @@ public class FPGrowthTest extends Mahout
       frequentPatterns.toString());
 
   }
+
+  /**
+   * Trivial test for MAHOUT-355
+   */
+  public void testNoNullPointerExceptionWhenReturnableFeaturesIsNull() throws IOException {
+
+    FPGrowth<String> fp = new FPGrowth<String>();
+
+    Collection<Pair<List<String>,Long>> transactions = new ArrayList<Pair<List<String>,Long>>();
+    transactions.add(new Pair<List<String>,Long>(Arrays.asList("E", "A", "D", "B"), 1L));
+
+    OutputCollector<String, List<Pair<List<String>, Long>>> noOutput = new OutputCollector<String,List<Pair<List<String>,Long>>>() {
+      @Override
+      public void collect(String arg0, List<Pair<List<String>, Long>> arg1) { 
+      }
+    };
+
+    fp.generateTopKFrequentPatterns(
+        transactions.iterator(),
+        fp.generateFList(transactions.iterator(), 3),
+        3,
+        100,
+        null,
+        noOutput,
+        new ContextStatusUpdater(null));
+  }
 }