You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ro...@apache.org on 2010/02/05 12:00:26 UTC
svn commit: r906896 [2/2] - in /lucene/mahout/trunk:
core/src/main/java/org/apache/mahout/fpm/pfpgrowth/
core/src/main/java/org/apache/mahout/fpm/pfpgrowth/convertors/
core/src/main/java/org/apache/mahout/fpm/pfpgrowth/convertors/integer/
core/src/main...
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/convertors/string/TopKStringPatterns.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/convertors/string/TopKStringPatterns.java?rev=906896&r1=906895&r2=906896&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/convertors/string/TopKStringPatterns.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/convertors/string/TopKStringPatterns.java Fri Feb 5 11:00:25 2010
@@ -28,32 +28,36 @@
import org.apache.hadoop.io.Writable;
import org.apache.mahout.common.Pair;
+/**
+ * A class which collects Top K string patterns
+ *
+ */
public final class TopKStringPatterns implements Writable {
- private List<Pair<List<String>, Long>> frequentPatterns = null;
-
+ private List<Pair<List<String>,Long>> frequentPatterns;
+
public TopKStringPatterns() {
- frequentPatterns = new ArrayList<Pair<List<String>, Long>>();
+ frequentPatterns = new ArrayList<Pair<List<String>,Long>>();
}
-
- public TopKStringPatterns(List<Pair<List<String>, Long>> patterns) {
- frequentPatterns = new ArrayList<Pair<List<String>, Long>>();
+
+ public TopKStringPatterns(List<Pair<List<String>,Long>> patterns) {
+ frequentPatterns = new ArrayList<Pair<List<String>,Long>>();
frequentPatterns.addAll(patterns);
}
-
- public Iterator<Pair<List<String>, Long>> iterator() {
+
+ public Iterator<Pair<List<String>,Long>> iterator() {
return frequentPatterns.iterator();
}
-
- public List<Pair<List<String>, Long>> getPatterns() {
+
+ public List<Pair<List<String>,Long>> getPatterns() {
return frequentPatterns;
}
-
+
public TopKStringPatterns merge(TopKStringPatterns pattern, int heapSize) {
- List<Pair<List<String>, Long>> patterns = new ArrayList<Pair<List<String>, Long>>();
- Iterator<Pair<List<String>, Long>> myIterator = frequentPatterns.iterator();
- Iterator<Pair<List<String>, Long>> otherIterator = pattern.iterator();
- Pair<List<String>, Long> myItem = null;
- Pair<List<String>, Long> otherItem = null;
+ List<Pair<List<String>,Long>> patterns = new ArrayList<Pair<List<String>,Long>>();
+ Iterator<Pair<List<String>,Long>> myIterator = frequentPatterns.iterator();
+ Iterator<Pair<List<String>,Long>> otherIterator = pattern.iterator();
+ Pair<List<String>,Long> myItem = null;
+ Pair<List<String>,Long> otherItem = null;
for (int i = 0; i < heapSize; i++) {
if (myItem == null && myIterator.hasNext()) {
myItem = myIterator.next();
@@ -68,7 +72,7 @@
if (cmp == 0) {
for (int j = 0; j < myItem.getFirst().size(); j++) {
cmp = myItem.getFirst().get(j).compareTo(
- otherItem.getFirst().get(j));
+ otherItem.getFirst().get(j));
if (cmp != 0) {
break;
}
@@ -97,7 +101,7 @@
}
return new TopKStringPatterns(patterns);
}
-
+
@Override
public void readFields(DataInput in) throws IOException {
frequentPatterns.clear();
@@ -112,14 +116,14 @@
in.readFully(data);
items.add(Bytes.toString(data));
}
- frequentPatterns.add(new Pair<List<String>, Long>(items, support));
+ frequentPatterns.add(new Pair<List<String>,Long>(items, support));
}
}
-
+
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(frequentPatterns.size());
- for (Pair<List<String>, Long> pattern : frequentPatterns) {
+ for (Pair<List<String>,Long> pattern : frequentPatterns) {
out.writeInt(pattern.getFirst().size());
out.writeLong(pattern.getSecond());
for (String item : pattern.getFirst()) {
@@ -129,18 +133,18 @@
}
}
}
-
+
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
String sep = "";
- for (Pair<List<String>, Long> pattern : frequentPatterns) {
+ for (Pair<List<String>,Long> pattern : frequentPatterns) {
sb.append(sep);
sb.append(pattern.toString());
sep = ", ";
-
+
}
return sb.toString();
-
+
}
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPGrowth.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPGrowth.java?rev=906896&r1=906895&r2=906896&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPGrowth.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPGrowth.java Fri Feb 5 11:00:25 2010
@@ -27,8 +27,8 @@
import java.util.Iterator;
import java.util.List;
import java.util.Map;
-import java.util.Set;
import java.util.Map.Entry;
+import java.util.Set;
import org.apache.commons.lang.mutable.MutableLong;
import org.apache.hadoop.conf.Configuration;
@@ -40,97 +40,121 @@
import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
import org.apache.mahout.common.Pair;
import org.apache.mahout.fpm.pfpgrowth.convertors.StatusUpdater;
-import org.apache.mahout.fpm.pfpgrowth.convertors.TopKPatternsOutputConvertor;
+import org.apache.mahout.fpm.pfpgrowth.convertors.TopKPatternsOutputConverter;
import org.apache.mahout.fpm.pfpgrowth.convertors.TransactionIterator;
import org.apache.mahout.fpm.pfpgrowth.convertors.string.TopKStringPatterns;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-/** PFPGrowth Class has both vanilla FPGrowth and Top K FPGrowth */
+/**
+ * Implementation of PFGrowth Algorithm with FP-Bonsai pruning
+ *
+ * @param A
+ * The object type used as the cell items in a transaction list
+ */
public class FPGrowth<A extends Comparable<? super A>> {
-
+
private static final Logger log = LoggerFactory.getLogger(FPGrowth.class);
-
- public static List<Pair<String, TopKStringPatterns>> readFrequentPattern(FileSystem fs,
- Configuration conf, Path path) throws IOException {
-
- List<Pair<String, TopKStringPatterns>> ret = new ArrayList<Pair<String, TopKStringPatterns>>();
+
+ public static List<Pair<String,TopKStringPatterns>> readFrequentPattern(FileSystem fs,
+ Configuration conf,
+ Path path) throws IOException {
+
+ List<Pair<String,TopKStringPatterns>> ret = new ArrayList<Pair<String,TopKStringPatterns>>();
Text key = new Text();
TopKStringPatterns value = new TopKStringPatterns();
SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
// key is feature value is count
while (reader.next(key, value)) {
- ret.add(new Pair<String, TopKStringPatterns>(key.toString(), new TopKStringPatterns(value
- .getPatterns())));
+ ret.add(new Pair<String,TopKStringPatterns>(key.toString(),
+ new TopKStringPatterns(value.getPatterns())));
}
return ret;
}
-
+
/**
- * Generate the Feature Frequency list from the given transaction whose frequency > minSupport
+ * Generate the Feature Frequency list from the given transaction whose
+ * frequency > minSupport
*
- * @param transactions Iterator over the transaction database
- * @param minSupport minSupport of the feature to be included
+ * @param transactions
+ * Iterator over the transaction database
+ * @param minSupport
+ * minSupport of the feature to be included
* @return the List of features and their associated frequency as a Pair
*/
- public final List<Pair<A, Long>> generateFList(Iterator<Pair<List<A>, Long>> transactions, int minSupport) {
-
- Map<A, MutableLong> attributeSupport = new HashMap<A, MutableLong>();
+ public final List<Pair<A,Long>> generateFList(Iterator<Pair<List<A>,Long>> transactions,
+ int minSupport) {
+
+ Map<A,MutableLong> attributeSupport = new HashMap<A,MutableLong>();
// int count = 0;
while (transactions.hasNext()) {
- Pair<List<A>, Long> transaction = transactions.next();
+ Pair<List<A>,Long> transaction = transactions.next();
for (A attribute : transaction.getFirst()) {
if (attributeSupport.containsKey(attribute) == false) {
- attributeSupport.put(attribute, new MutableLong(transaction.getSecond()));
+ attributeSupport.put(attribute, new MutableLong(transaction
+ .getSecond()));
} else {
- attributeSupport.get(attribute).add(transaction.getSecond().longValue());
+ attributeSupport.get(attribute).add(
+ transaction.getSecond().longValue());
// count++;
}
}
}
- List<Pair<A, Long>> fList = new ArrayList<Pair<A, Long>>();
- for (Entry<A, MutableLong> e : attributeSupport.entrySet()) {
- fList.add(new Pair<A, Long>(e.getKey(), e.getValue().longValue()));
- }
-
- Collections.sort(fList, new Comparator<Pair<A, Long>>() {
-
+ List<Pair<A,Long>> fList = new ArrayList<Pair<A,Long>>();
+ for (Entry<A,MutableLong> e : attributeSupport.entrySet()) {
+ fList.add(new Pair<A,Long>(e.getKey(), e.getValue().longValue()));
+ }
+
+ Collections.sort(fList, new Comparator<Pair<A,Long>>() {
+
@Override
- public int compare(Pair<A, Long> o1, Pair<A, Long> o2) {
+ public int compare(Pair<A,Long> o1, Pair<A,Long> o2) {
int ret = o2.getSecond().compareTo(o1.getSecond());
if (ret != 0) {
return ret;
}
return o1.getFirst().compareTo(o2.getFirst());
}
-
+
});
-
+
return fList;
}
-
+
/**
- * Generate Top K Frequent Patterns for every feature in returnableFeatures given a stream of transactions
- * and the minimum support
+ * Generate Top K Frequent Patterns for every feature in returnableFeatures
+ * given a stream of transactions and the minimum support
*
- * @param transactionStream Iterator of transaction
- * @param frequencyList list of frequent features and their support value
- * @param minSupport minimum support of the transactions
- * @param k Number of top frequent patterns to keep
- * @param returnableFeatures set of features for which the frequent patterns are mined. If the set is null,
- * then top K patterns for every frequent item (an item whose support> minSupport) is generated
- * @param output The output collector to which the the generated patterns are written
+ * @param transactionStream
+ * Iterator of transaction
+ * @param frequencyList
+ * list of frequent features and their support value
+ * @param minSupport
+ * minimum support of the transactions
+ * @param k
+ * Number of top frequent patterns to keep
+ * @param returnableFeatures
+ * set of features for which the frequent patterns are mined. If the
+ * set is null, then top K patterns for every frequent item (an item
+ * whose support> minSupport) is generated
+ * @param output
+ * The output collector to which the the generated patterns are
+ * written
* @throws IOException
*/
- public final void generateTopKFrequentPatterns(Iterator<Pair<List<A>, Long>> transactionStream,
- List<Pair<A, Long>> frequencyList, long minSupport, int k, Set<A> returnableFeatures,
- OutputCollector<A, List<Pair<List<A>, Long>>> output, StatusUpdater updater) throws IOException {
-
- Map<Integer, A> reverseMapping = new HashMap<Integer, A>();
- Map<A, Integer> attributeIdMapping = new HashMap<A, Integer>();
-
+ public final void generateTopKFrequentPatterns(Iterator<Pair<List<A>,Long>> transactionStream,
+ List<Pair<A,Long>> frequencyList,
+ long minSupport,
+ int k,
+ Set<A> returnableFeatures,
+ OutputCollector<A,List<Pair<List<A>,Long>>> output,
+ StatusUpdater updater) throws IOException {
+
+ Map<Integer,A> reverseMapping = new HashMap<Integer,A>();
+ Map<A,Integer> attributeIdMapping = new HashMap<A,Integer>();
+
int id = 0;
- for (Pair<A, Long> feature : frequencyList) {
+ for (Pair<A,Long> feature : frequencyList) {
A attrib = feature.getFirst();
Long frequency = feature.getSecond();
if (frequency < minSupport) {
@@ -139,9 +163,9 @@
attributeIdMapping.put(attrib, id);
reverseMapping.put(id++, attrib);
}
-
+
long[] attributeFrequency = new long[attributeIdMapping.size()];
- for (Pair<A, Long> feature : frequencyList) {
+ for (Pair<A,Long> feature : frequencyList) {
A attrib = feature.getFirst();
Long frequency = feature.getSecond();
if (frequency < minSupport) {
@@ -149,15 +173,16 @@
}
attributeFrequency[attributeIdMapping.get(attrib)] = frequency;
}
-
+
log.info("Number of unique items {}", frequencyList.size());
-
+
Set<Integer> returnFeatures = new HashSet<Integer>();
if (returnableFeatures.isEmpty() == false) {
for (A attrib : returnableFeatures) {
if (attributeIdMapping.containsKey(attrib)) {
returnFeatures.add(attributeIdMapping.get(attrib));
- log.info("Adding Pattern {}=>{}", attrib, attributeIdMapping.get(attrib));
+ log.info("Adding Pattern {}=>{}", attrib, attributeIdMapping
+ .get(attrib));
}
}
} else {
@@ -165,31 +190,41 @@
returnFeatures.add(j);
}
}
-
+
log.info("Number of unique pruned items {}", attributeIdMapping.size());
- generateTopKFrequentPatterns(new TransactionIterator<A>(transactionStream, attributeIdMapping),
- attributeFrequency, minSupport, k, reverseMapping.size(), returnFeatures,
- new TopKPatternsOutputConvertor<A>(output, reverseMapping), updater);
-
+ generateTopKFrequentPatterns(new TransactionIterator<A>(transactionStream,
+ attributeIdMapping), attributeFrequency, minSupport, k, reverseMapping
+ .size(), returnFeatures, new TopKPatternsOutputConverter<A>(output,
+ reverseMapping), updater);
+
}
-
+
/**
* Top K FpGrowth Algorithm
*
- * @param tree to be mined
- * @param minSupportMutable minimum support of the pattern to keep
- * @param k Number of top frequent patterns to keep
- * @param requiredFeatures Set of integer id's of features to mine
- * @param outputCollector the Collector class which converts the given frequent pattern in integer to A
+ * @param tree
+ * to be mined
+ * @param minSupportMutable
+ * minimum support of the pattern to keep
+ * @param k
+ * Number of top frequent patterns to keep
+ * @param requiredFeatures
+ * Set of integer id's of features to mine
+ * @param outputCollector
+ * the Collector class which converts the given frequent pattern in
+ * integer to A
* @return Top K Frequent Patterns for each feature and their support
*/
- private Map<Integer, FrequentPatternMaxHeap> fpGrowth(FPTree tree, MutableLong minSupportMutable, int k,
- Set<Integer> requiredFeatures, TopKPatternsOutputConvertor<A> outputCollector, StatusUpdater updater)
- throws IOException {
-
+ private Map<Integer,FrequentPatternMaxHeap> fpGrowth(FPTree tree,
+ MutableLong minSupportMutable,
+ int k,
+ Set<Integer> requiredFeatures,
+ TopKPatternsOutputConverter<A> outputCollector,
+ StatusUpdater updater) throws IOException {
+
long minSupportValue = minSupportMutable.longValue();
-
- Map<Integer, FrequentPatternMaxHeap> patterns = new HashMap<Integer, FrequentPatternMaxHeap>();
+
+ Map<Integer,FrequentPatternMaxHeap> patterns = new HashMap<Integer,FrequentPatternMaxHeap>();
FPTreeDepthCache treeCache = new FPTreeDepthCache();
for (int i = tree.getHeaderTableCount() - 1; i >= 0; i--) {
int attribute = tree.getAttributeAtIndex(i);
@@ -198,29 +233,32 @@
}
log.info("Mining FTree Tree for all patterns with {}", attribute);
MutableLong minSupport = new MutableLong(minSupportValue);
- FrequentPatternMaxHeap frequentPatterns = growth(tree, minSupport, k, treeCache, 0, attribute,
- updater);
+ FrequentPatternMaxHeap frequentPatterns = growth(tree, minSupport, k,
+ treeCache, 0, attribute, updater);
patterns.put(attribute, frequentPatterns);
outputCollector.collect(attribute, frequentPatterns);
-
+
minSupportValue = Math.max(minSupportValue, minSupport.longValue() / 2);
- log.info("Found {} Patterns with Least Support {}", patterns.get(attribute).count(), patterns.get(
- attribute).leastSupport());
+ log.info("Found {} Patterns with Least Support {}", patterns.get(
+ attribute).count(), patterns.get(attribute).leastSupport());
}
- log.info("Tree Cache: First Level: Cache hits={} Cache Misses={}", treeCache.getHits(), treeCache
- .getMisses());
+ log.info("Tree Cache: First Level: Cache hits={} Cache Misses={}",
+ treeCache.getHits(), treeCache.getMisses());
return patterns;
}
-
- private static FrequentPatternMaxHeap generateSinglePathPatterns(FPTree tree, int k,
- MutableLong minSupportMutable) {
- FrequentPatternMaxHeap frequentPatterns = new FrequentPatternMaxHeap(k, false);
-
+
+ private static FrequentPatternMaxHeap generateSinglePathPatterns(FPTree tree,
+ int k,
+ MutableLong minSupportMutable) {
+ FrequentPatternMaxHeap frequentPatterns = new FrequentPatternMaxHeap(k,
+ false);
+
int tempNode = FPTree.ROOTNODEID;
Pattern frequentItem = new Pattern();
while (tree.childCount(tempNode) != 0) {
if (tree.childCount(tempNode) > 1) {
- log.info("This should not happen {} {}", tree.childCount(tempNode), tempNode);
+ log.info("This should not happen {} {}", tree.childCount(tempNode),
+ tempNode);
}
tempNode = tree.childAtIndex(tempNode, 0);
if (tree.count(tempNode) < minSupportMutable.intValue()) {
@@ -231,68 +269,87 @@
if (frequentItem.length() > 0) {
frequentPatterns.insert(frequentItem);
}
-
+
return frequentPatterns;
}
-
+
/**
- * Internal TopKFrequentPattern Generation algorithm, which represents the A's as integers and transforms
- * features to use only integers
+ * Internal TopKFrequentPattern Generation algorithm, which represents the A's
+ * as integers and transforms features to use only integers
*
- * @param transactions Transaction database Iterator
- * @param attributeFrequency array representing the Frequency of the corresponding attribute id
- * @param minSupport minimum support of the pattern to be mined
- * @param k Max value of the Size of the Max-Heap in which Patterns are held
- * @param featureSetSize number of features
- * @param returnFeatures the id's of the features for which Top K patterns have to be mined
- * @param topKPatternsOutputCollector the outputCollector which transforms the given Pattern in integer
- * format to the corresponding A Format
+ * @param transactions
+ * Transaction database Iterator
+ * @param attributeFrequency
+ * array representing the Frequency of the corresponding attribute id
+ * @param minSupport
+ * minimum support of the pattern to be mined
+ * @param k
+ * Max value of the Size of the Max-Heap in which Patterns are held
+ * @param featureSetSize
+ * number of features
+ * @param returnFeatures
+ * the id's of the features for which Top K patterns have to be mined
+ * @param topKPatternsOutputCollector
+ * the outputCollector which transforms the given Pattern in integer
+ * format to the corresponding A Format
* @return Top K frequent patterns for each attribute
*/
- private Map<Integer, FrequentPatternMaxHeap> generateTopKFrequentPatterns(
- Iterator<Pair<int[], Long>> transactions, long[] attributeFrequency, long minSupport, int k,
- int featureSetSize, Set<Integer> returnFeatures,
- TopKPatternsOutputConvertor<A> topKPatternsOutputCollector, StatusUpdater updater) throws IOException {
-
+ private Map<Integer,FrequentPatternMaxHeap> generateTopKFrequentPatterns(Iterator<Pair<int[],Long>> transactions,
+ long[] attributeFrequency,
+ long minSupport,
+ int k,
+ int featureSetSize,
+ Set<Integer> returnFeatures,
+ TopKPatternsOutputConverter<A> topKPatternsOutputCollector,
+ StatusUpdater updater) throws IOException {
+
FPTree tree = new FPTree(featureSetSize);
for (int i = 0; i < featureSetSize; i++) {
tree.addHeaderCount(i, attributeFrequency[i]);
}
-
+
// Constructing initial FPTree from the list of transactions
MutableLong minSupportMutable = new MutableLong(minSupport);
int nodecount = 0;
// int attribcount = 0;
int i = 0;
while (transactions.hasNext()) {
- Pair<int[], Long> transaction = transactions.next();
+ Pair<int[],Long> transaction = transactions.next();
Arrays.sort(transaction.getFirst());
// attribcount += transaction.length;
- nodecount += treeAddCount(tree, transaction.getFirst(), transaction.getSecond(),
- minSupportMutable, attributeFrequency);
+ nodecount += treeAddCount(tree, transaction.getFirst(), transaction
+ .getSecond(), minSupportMutable, attributeFrequency);
i++;
if (i % 10000 == 0) {
log.info("FPTree Building: Read {} Transactions", i);
}
}
-
+
log.info("Number of Nodes in the FP Tree: {}", nodecount);
-
- return fpGrowth(tree, minSupportMutable, k, returnFeatures, topKPatternsOutputCollector, updater);
- }
-
- private static FrequentPatternMaxHeap growth(FPTree tree, MutableLong minSupportMutable, int k,
- FPTreeDepthCache treeCache, int level, int currentAttribute, StatusUpdater updater) {
-
- FrequentPatternMaxHeap frequentPatterns = new FrequentPatternMaxHeap(k, true);
-
- int i = Arrays.binarySearch(tree.getHeaderTableAttributes(), currentAttribute);
+
+ return fpGrowth(tree, minSupportMutable, k, returnFeatures,
+ topKPatternsOutputCollector, updater);
+ }
+
+ private static FrequentPatternMaxHeap growth(FPTree tree,
+ MutableLong minSupportMutable,
+ int k,
+ FPTreeDepthCache treeCache,
+ int level,
+ int currentAttribute,
+ StatusUpdater updater) {
+
+ FrequentPatternMaxHeap frequentPatterns = new FrequentPatternMaxHeap(k,
+ true);
+
+ int i = Arrays.binarySearch(tree.getHeaderTableAttributes(),
+ currentAttribute);
if (i < 0) {
return frequentPatterns;
}
-
+
int headerTableCount = tree.getHeaderTableCount();
-
+
while (i < headerTableCount) {
int attribute = tree.getAttributeAtIndex(i);
long count = tree.getHeaderSupportCount(attribute);
@@ -303,23 +360,25 @@
updater.update("FPGrowth Algorithm for a given feature: " + attribute);
FPTree conditionalTree = treeCache.getFirstLevelTree(attribute);
if (conditionalTree.isEmpty()) {
- traverseAndBuildConditionalFPTreeData(tree.getHeaderNext(attribute), minSupportMutable,
- conditionalTree, tree);
+ traverseAndBuildConditionalFPTreeData(tree.getHeaderNext(attribute),
+ minSupportMutable, conditionalTree, tree);
// printTree(conditionalTree);
-
+
}
-
+
FrequentPatternMaxHeap returnedPatterns;
if (attribute == currentAttribute) {
-
- returnedPatterns = growthTopDown(conditionalTree, minSupportMutable, k, treeCache, level + 1, true,
- currentAttribute, updater);
-
- frequentPatterns = mergeHeap(frequentPatterns, returnedPatterns, attribute, count, true, true);
+
+ returnedPatterns = growthTopDown(conditionalTree, minSupportMutable, k,
+ treeCache, level + 1, true, currentAttribute, updater);
+
+ frequentPatterns = mergeHeap(frequentPatterns, returnedPatterns,
+ attribute, count, true, true);
} else {
- returnedPatterns = growthTopDown(conditionalTree, minSupportMutable, k, treeCache, level + 1,
- false, currentAttribute, updater);
- frequentPatterns = mergeHeap(frequentPatterns, returnedPatterns, attribute, count, false, true);
+ returnedPatterns = growthTopDown(conditionalTree, minSupportMutable, k,
+ treeCache, level + 1, false, currentAttribute, updater);
+ frequentPatterns = mergeHeap(frequentPatterns, returnedPatterns,
+ attribute, count, false, true);
}
if (frequentPatterns.isFull()) {
if (minSupportMutable.intValue() < frequentPatterns.leastSupport()) {
@@ -328,18 +387,25 @@
}
i++;
}
-
+
return frequentPatterns;
}
-
- private static FrequentPatternMaxHeap growthBottomUp(FPTree tree, MutableLong minSupportMutable, int k,
- FPTreeDepthCache treeCache, int level, boolean conditionalOfCurrentAttribute, int currentAttribute,
- StatusUpdater updater) {
-
- FrequentPatternMaxHeap frequentPatterns = new FrequentPatternMaxHeap(k, false);
-
+
+ private static FrequentPatternMaxHeap growthBottomUp(FPTree tree,
+ MutableLong minSupportMutable,
+ int k,
+ FPTreeDepthCache treeCache,
+ int level,
+ boolean conditionalOfCurrentAttribute,
+ int currentAttribute,
+ StatusUpdater updater) {
+
+ FrequentPatternMaxHeap frequentPatterns = new FrequentPatternMaxHeap(k,
+ false);
+
if (conditionalOfCurrentAttribute == false) {
- int index = Arrays.binarySearch(tree.getHeaderTableAttributes(), currentAttribute);
+ int index = Arrays.binarySearch(tree.getHeaderTableAttributes(),
+ currentAttribute);
if (index < 0) {
return frequentPatterns;
} else {
@@ -350,11 +416,11 @@
}
}
}
-
+
if (tree.singlePath()) {
return generateSinglePathPatterns(tree, k, minSupportMutable);
}
-
+
updater.update("Bottom Up FP Growth");
for (int i = tree.getHeaderTableCount() - 1; i >= 0; i--) {
int attribute = tree.getAttributeAtIndex(i);
@@ -363,50 +429,60 @@
continue;
}
FPTree conditionalTree = treeCache.getTree(level);
-
+
FrequentPatternMaxHeap returnedPatterns;
if (conditionalOfCurrentAttribute) {
- traverseAndBuildConditionalFPTreeData(tree.getHeaderNext(attribute), minSupportMutable,
- conditionalTree, tree);
- returnedPatterns = growthBottomUp(conditionalTree, minSupportMutable, k, treeCache, level + 1,
- true, currentAttribute, updater);
-
- frequentPatterns = mergeHeap(frequentPatterns, returnedPatterns, attribute, count, true, false);
+ traverseAndBuildConditionalFPTreeData(tree.getHeaderNext(attribute),
+ minSupportMutable, conditionalTree, tree);
+ returnedPatterns = growthBottomUp(conditionalTree, minSupportMutable,
+ k, treeCache, level + 1, true, currentAttribute, updater);
+
+ frequentPatterns = mergeHeap(frequentPatterns, returnedPatterns,
+ attribute, count, true, false);
} else {
if (attribute == currentAttribute) {
- traverseAndBuildConditionalFPTreeData(tree.getHeaderNext(attribute), minSupportMutable,
- conditionalTree, tree);
- returnedPatterns = growthBottomUp(conditionalTree, minSupportMutable, k, treeCache, level + 1,
- true, currentAttribute, updater);
-
- frequentPatterns = mergeHeap(frequentPatterns, returnedPatterns, attribute, count, true, false);
+ traverseAndBuildConditionalFPTreeData(tree.getHeaderNext(attribute),
+ minSupportMutable, conditionalTree, tree);
+ returnedPatterns = growthBottomUp(conditionalTree, minSupportMutable,
+ k, treeCache, level + 1, true, currentAttribute, updater);
+
+ frequentPatterns = mergeHeap(frequentPatterns, returnedPatterns,
+ attribute, count, true, false);
} else if (attribute > currentAttribute) {
- traverseAndBuildConditionalFPTreeData(tree.getHeaderNext(attribute), minSupportMutable,
- conditionalTree, tree);
- returnedPatterns = growthBottomUp(conditionalTree, minSupportMutable, k, treeCache, level + 1,
- false, currentAttribute, updater);
- frequentPatterns = mergeHeap(frequentPatterns, returnedPatterns, attribute, count, false, false);
+ traverseAndBuildConditionalFPTreeData(tree.getHeaderNext(attribute),
+ minSupportMutable, conditionalTree, tree);
+ returnedPatterns = growthBottomUp(conditionalTree, minSupportMutable,
+ k, treeCache, level + 1, false, currentAttribute, updater);
+ frequentPatterns = mergeHeap(frequentPatterns, returnedPatterns,
+ attribute, count, false, false);
}
}
-
+
if (frequentPatterns.isFull()) {
if (minSupportMutable.intValue() < frequentPatterns.leastSupport()) {
minSupportMutable.setValue(frequentPatterns.leastSupport());
}
}
}
-
+
return frequentPatterns;
}
-
- private static FrequentPatternMaxHeap growthTopDown(FPTree tree, MutableLong minSupportMutable, int k,
- FPTreeDepthCache treeCache, int level, boolean conditionalOfCurrentAttribute, int currentAttribute,
- StatusUpdater updater) {
-
- FrequentPatternMaxHeap frequentPatterns = new FrequentPatternMaxHeap(k, true);
-
+
+ private static FrequentPatternMaxHeap growthTopDown(FPTree tree,
+ MutableLong minSupportMutable,
+ int k,
+ FPTreeDepthCache treeCache,
+ int level,
+ boolean conditionalOfCurrentAttribute,
+ int currentAttribute,
+ StatusUpdater updater) {
+
+ FrequentPatternMaxHeap frequentPatterns = new FrequentPatternMaxHeap(k,
+ true);
+
if (conditionalOfCurrentAttribute == false) {
- int index = Arrays.binarySearch(tree.getHeaderTableAttributes(), currentAttribute);
+ int index = Arrays.binarySearch(tree.getHeaderTableAttributes(),
+ currentAttribute);
if (index < 0) {
return frequentPatterns;
} else {
@@ -417,46 +493,49 @@
}
}
}
-
+
if (tree.singlePath()) {
return generateSinglePathPatterns(tree, k, minSupportMutable);
}
-
+
updater.update("Top Down Growth:");
-
+
for (int i = 0; i < tree.getHeaderTableCount(); i++) {
int attribute = tree.getAttributeAtIndex(i);
long count = tree.getHeaderSupportCount(attribute);
if (count < minSupportMutable.longValue()) {
continue;
}
-
+
FPTree conditionalTree = treeCache.getTree(level);
-
+
FrequentPatternMaxHeap returnedPatterns;
if (conditionalOfCurrentAttribute) {
- traverseAndBuildConditionalFPTreeData(tree.getHeaderNext(attribute), minSupportMutable,
- conditionalTree, tree);
-
- returnedPatterns = growthBottomUp(conditionalTree, minSupportMutable, k, treeCache, level + 1,
- true, currentAttribute, updater);
- frequentPatterns = mergeHeap(frequentPatterns, returnedPatterns, attribute, count, true, true);
-
+ traverseAndBuildConditionalFPTreeData(tree.getHeaderNext(attribute),
+ minSupportMutable, conditionalTree, tree);
+
+ returnedPatterns = growthBottomUp(conditionalTree, minSupportMutable,
+ k, treeCache, level + 1, true, currentAttribute, updater);
+ frequentPatterns = mergeHeap(frequentPatterns, returnedPatterns,
+ attribute, count, true, true);
+
} else {
if (attribute == currentAttribute) {
- traverseAndBuildConditionalFPTreeData(tree.getHeaderNext(attribute), minSupportMutable,
- conditionalTree, tree);
- returnedPatterns = growthBottomUp(conditionalTree, minSupportMutable, k, treeCache, level + 1,
- true, currentAttribute, updater);
- frequentPatterns = mergeHeap(frequentPatterns, returnedPatterns, attribute, count, true, false);
-
+ traverseAndBuildConditionalFPTreeData(tree.getHeaderNext(attribute),
+ minSupportMutable, conditionalTree, tree);
+ returnedPatterns = growthBottomUp(conditionalTree, minSupportMutable,
+ k, treeCache, level + 1, true, currentAttribute, updater);
+ frequentPatterns = mergeHeap(frequentPatterns, returnedPatterns,
+ attribute, count, true, false);
+
} else if (attribute > currentAttribute) {
- traverseAndBuildConditionalFPTreeData(tree.getHeaderNext(attribute), minSupportMutable,
- conditionalTree, tree);
- returnedPatterns = growthBottomUp(conditionalTree, minSupportMutable, k, treeCache, level + 1,
- false, currentAttribute, updater);
- frequentPatterns = mergeHeap(frequentPatterns, returnedPatterns, attribute, count, false, true);
-
+ traverseAndBuildConditionalFPTreeData(tree.getHeaderNext(attribute),
+ minSupportMutable, conditionalTree, tree);
+ returnedPatterns = growthBottomUp(conditionalTree, minSupportMutable,
+ k, treeCache, level + 1, false, currentAttribute, updater);
+ frequentPatterns = mergeHeap(frequentPatterns, returnedPatterns,
+ attribute, count, false, true);
+
}
}
if (frequentPatterns.isFull()) {
@@ -465,118 +544,128 @@
}
}
}
-
+
return frequentPatterns;
}
-
+
private static FrequentPatternMaxHeap mergeHeap(FrequentPatternMaxHeap frequentPatterns,
- FrequentPatternMaxHeap returnedPatterns, int attribute, long count, boolean addAttribute,
- boolean subPatternCheck) {
+ FrequentPatternMaxHeap returnedPatterns,
+ int attribute,
+ long count,
+ boolean addAttribute,
+ boolean subPatternCheck) {
frequentPatterns.addAll(returnedPatterns, attribute, count);
if (frequentPatterns.addable(count) && addAttribute) {
Pattern p = new Pattern();
p.add(attribute, count);
frequentPatterns.insert(p);
}
-
+
return frequentPatterns;
}
-
+
private static void traverseAndBuildConditionalFPTreeData(int firstConditionalNode,
- MutableLong minSupportMutable, FPTree conditionalTree, FPTree tree) {
-
+ MutableLong minSupportMutable,
+ FPTree conditionalTree,
+ FPTree tree) {
+
// Build Subtable
int conditionalNode = firstConditionalNode;
-
+
while (conditionalNode != -1) {
long nextNodeCount = tree.count(conditionalNode);
int pathNode = tree.parent(conditionalNode);
int prevConditional = -1;
-
+
while (pathNode != 0) { // dummy root node
int attribute = tree.attribute(pathNode);
- if (tree.getHeaderSupportCount(attribute) < minSupportMutable.intValue()) {
+ if (tree.getHeaderSupportCount(attribute) < minSupportMutable
+ .intValue()) {
pathNode = tree.parent(pathNode);
continue;
}
// update and increment the headerTable Counts
conditionalTree.addHeaderCount(attribute, nextNodeCount);
-
+
int conditional = tree.conditional(pathNode);
// if its a new conditional tree node
-
+
if (conditional == 0) {
- tree.setConditional(pathNode, conditionalTree.createConditionalNode(attribute, 0));
+ tree.setConditional(pathNode, conditionalTree.createConditionalNode(
+ attribute, 0));
conditional = tree.conditional(pathNode);
conditionalTree.addHeaderNext(attribute, conditional);
} else {
conditionalTree.setSinglePath(false);
}
-
+
if (prevConditional != -1) { // if there is a child element
conditionalTree.setParent(prevConditional, conditional);
}
-
+
conditionalTree.addCount(conditional, nextNodeCount);
prevConditional = conditional;
-
+
pathNode = tree.parent(pathNode);
-
+
}
if (prevConditional != -1) {
conditionalTree.setParent(prevConditional, FPTree.ROOTNODEID);
- if (conditionalTree.childCount(FPTree.ROOTNODEID) > 1 && conditionalTree.singlePath()) {
+ if (conditionalTree.childCount(FPTree.ROOTNODEID) > 1
+ && conditionalTree.singlePath()) {
conditionalTree.setSinglePath(false);
-
+
}
}
conditionalNode = tree.next(conditionalNode);
}
-
+
tree.clearConditional();
conditionalTree.reorderHeaderTable();
pruneFPTree(minSupportMutable, conditionalTree);
// prune Conditional Tree
-
+
}
-
+
private static void pruneFPTree(MutableLong minSupportMutable, FPTree tree) {
for (int i = 0; i < tree.getHeaderTableCount(); i++) {
int currentAttribute = tree.getAttributeAtIndex(i);
- if (tree.getHeaderSupportCount(currentAttribute) < minSupportMutable.intValue()) {
+ if (tree.getHeaderSupportCount(currentAttribute) < minSupportMutable
+ .intValue()) {
int nextNode = tree.getHeaderNext(currentAttribute);
tree.removeHeaderNext(currentAttribute);
while (nextNode != -1) {
-
+
int mychildCount = tree.childCount(nextNode);
-
+
int parentNode = tree.parent(nextNode);
-
+
for (int j = 0; j < mychildCount; j++) {
Integer myChildId = tree.childAtIndex(nextNode, j);
tree.replaceChild(parentNode, nextNode, myChildId);
}
nextNode = tree.next(nextNode);
}
-
+
}
}
-
+
for (int i = 0; i < tree.getHeaderTableCount(); i++) {
int currentAttribute = tree.getAttributeAtIndex(i);
int nextNode = tree.getHeaderNext(currentAttribute);
-
+
FastByIDMap<Integer> prevNode = new FastByIDMap<Integer>();
int justPrevNode = -1;
while (nextNode != -1) {
-
+
int parent = tree.parent(nextNode);
-
+
if (prevNode.containsKey(parent) == false) {
prevNode.put(parent, nextNode);
} else {
int prevNodeId = prevNode.get(parent);
- if (1 >= tree.childCount(prevNodeId) && 1 >= tree.childCount(nextNode)) {
+ if (1 >= tree.childCount(prevNodeId)
+ && 1 >= tree.childCount(nextNode)) {
tree.addCount(prevNodeId, tree.count(nextNode));
if (tree.childCount(nextNode) == 1) {
tree.addChild(prevNodeId, tree.childAtIndex(nextNode, 0));
@@ -589,29 +678,38 @@
nextNode = tree.next(nextNode);
}
}
-
+
// prune Conditional Tree
-
+
}
-
+
/**
- * Create FPTree with node counts incremented by addCount variable given the root node and the List of
- * Attributes in transaction sorted by support
+ * Create FPTree with node counts incremented by addCount variable given the
+ * root node and the List of Attributes in transaction sorted by support
*
- * @param tree object to which the transaction has to be added to
- * @param myList List of transactions sorted by support
- * @param addCount amount by which the Node count has to be incremented
- * @param minSupport the MutableLong value which contains the current value(dynamic) of support
- * @param attributeFrequency the list of attributes and their frequency
+ * @param tree
+ * object to which the transaction has to be added to
+ * @param myList
+ * List of transactions sorted by support
+ * @param addCount
+ * amount by which the Node count has to be incremented
+ * @param minSupport
+ * the MutableLong value which contains the current value(dynamic) of
+ * support
+ * @param attributeFrequency
+ * the list of attributes and their frequency
* @return the number of new nodes added
*/
- private static int treeAddCount(FPTree tree, int[] myList, long addCount, MutableLong minSupport,
- long[] attributeFrequency) {
-
+ private static int treeAddCount(FPTree tree,
+ int[] myList,
+ long addCount,
+ MutableLong minSupport,
+ long[] attributeFrequency) {
+
int temp = FPTree.ROOTNODEID;
int ret = 0;
boolean addCountMode = true;
-
+
for (int attribute : myList) {
if (attributeFrequency[attribute] < minSupport.intValue()) {
return ret;
@@ -632,8 +730,8 @@
ret++;
}
}
-
+
return ret;
-
+
}
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPTree.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPTree.java?rev=906896&r1=906895&r2=906896&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPTree.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPTree.java Fri Feb 5 11:00:25 2010
@@ -21,86 +21,91 @@
import java.util.Set;
import java.util.TreeSet;
+/**
+ * The Frequent Pattern Tree datastructure used for mining patterns using
+ * {@link FPGrowth} algorithm
+ *
+ */
public class FPTree {
-
+
public static final int ROOTNODEID = 0;
-
+
private static final int DEFAULT_CHILDREN_INITIAL_SIZE = 2;
-
+
private static final int DEFAULT_HEADER_TABLE_INITIAL_SIZE = 4;
-
+
private static final int DEFAULT_INITIAL_SIZE = 8;
-
+
private static final float GROWTH_RATE = 1.5f;
-
+
private static final int HEADERTABLEBLOCKSIZE = 2;
-
+
private static final int HT_LAST = 1;
-
+
private static final int HT_NEXT = 0;
-
+
private int[] attribute;
-
+
private int[] childCount;
-
+
private int[] conditional;
-
+
private long[] headerTableAttributeCount;
-
+
private int[] headerTableAttributes;
-
- private int headerTableCount = 0;
-
+
+ private int headerTableCount;
+
private int[] headerTableLookup;
-
+
private int[][] headerTableProperties;
-
+
private int[] next;
-
+
private int[][] nodeChildren;
-
+
private long[] nodeCount;
-
- private int nodes = 0;
-
+
+ private int nodes;
+
private int[] parent;
-
+
private boolean singlePath;
-
+
private final Set<Integer> sortedSet = new TreeSet<Integer>();
-
+
public FPTree() {
this(DEFAULT_INITIAL_SIZE, DEFAULT_HEADER_TABLE_INITIAL_SIZE);
}
-
+
public FPTree(int size) {
this(size, DEFAULT_HEADER_TABLE_INITIAL_SIZE);
}
-
+
private FPTree(int size, int headersize) {
if (size < DEFAULT_INITIAL_SIZE) {
size = DEFAULT_INITIAL_SIZE;
}
-
+
parent = new int[size];
next = new int[size];
childCount = new int[size];
attribute = new int[size];
nodeCount = new long[size];
-
+
nodeChildren = new int[size][];
conditional = new int[size];
-
+
headerTableAttributes = new int[DEFAULT_HEADER_TABLE_INITIAL_SIZE];
headerTableAttributeCount = new long[DEFAULT_HEADER_TABLE_INITIAL_SIZE];
headerTableLookup = new int[DEFAULT_HEADER_TABLE_INITIAL_SIZE];
Arrays.fill(headerTableLookup, -1);
headerTableProperties = new int[DEFAULT_HEADER_TABLE_INITIAL_SIZE][];
-
+
singlePath = true;
createRootNode();
}
-
+
public final void addChild(int parentNodeId, int childnodeId) {
int length = childCount[parentNodeId];
if (length >= nodeChildren[parentNodeId].length) {
@@ -108,12 +113,12 @@
}
nodeChildren[parentNodeId][length++] = childnodeId;
childCount[parentNodeId] = length;
-
+
if (length > 1 && singlePath) {
singlePath = false;
}
}
-
+
public final boolean addCount(int nodeId, long count) {
if (nodeId < nodes) {
this.nodeCount[nodeId] += count;
@@ -121,12 +126,12 @@
}
return false;
}
-
+
public final void addHeaderCount(int attributeValue, long count) {
int index = getHeaderIndex(attributeValue);
headerTableAttributeCount[index] += count;
}
-
+
public final void addHeaderNext(int attributeValue, int nodeId) {
int index = getHeaderIndex(attributeValue);
if (headerTableProperties[index][HT_NEXT] == -1) {
@@ -137,22 +142,22 @@
headerTableProperties[index][HT_LAST] = nodeId;
}
}
-
+
public final int attribute(int nodeId) {
return this.attribute[nodeId];
}
-
+
public final int childAtIndex(int nodeId, int index) {
if (childCount[nodeId] < index) {
return -1;
}
return nodeChildren[nodeId][index];
}
-
+
public final int childCount(int nodeId) {
return childCount[nodeId];
}
-
+
public final int childWithAttribute(int nodeId, int childAttribute) {
int length = childCount[nodeId];
for (int i = 0; i < length; i++) {
@@ -162,7 +167,7 @@
}
return -1;
}
-
+
public final void clear() {
nodes = 0;
headerTableCount = 0;
@@ -171,21 +176,21 @@
sortedSet.clear();
createRootNode();
}
-
+
public final void clearConditional() {
for (int i = nodes - 1; i >= 0; i--) {
conditional[i] = 0;
}
}
-
+
public final int conditional(int nodeId) {
return this.conditional[nodeId];
}
-
+
public final long count(int nodeId) {
return nodeCount[nodeId];
}
-
+
public final int createConditionalNode(int attributeValue, long count) {
if (nodes >= this.attribute.length) {
resize();
@@ -196,36 +201,36 @@
conditional[nodes] = 0;
this.attribute[nodes] = attributeValue;
nodeCount[nodes] = count;
-
+
if (nodeChildren[nodes] == null) {
nodeChildren[nodes] = new int[DEFAULT_CHILDREN_INITIAL_SIZE];
}
-
+
return nodes++;
}
-
+
public final int createNode(int parentNodeId, int attributeValue, long count) {
if (nodes >= this.attribute.length) {
resize();
}
-
+
childCount[nodes] = 0;
next[nodes] = -1;
parent[nodes] = parentNodeId;
this.attribute[nodes] = attributeValue;
nodeCount[nodes] = count;
-
+
conditional[nodes] = 0;
if (nodeChildren[nodes] == null) {
nodeChildren[nodes] = new int[DEFAULT_CHILDREN_INITIAL_SIZE];
}
-
+
int childNodeId = nodes++;
addChild(parentNodeId, childNodeId);
addHeaderNext(attributeValue, childNodeId);
return childNodeId;
}
-
+
public final int createRootNode() {
childCount[nodes] = 0;
next[nodes] = -1;
@@ -237,48 +242,48 @@
}
return nodes++;
}
-
+
public final int getAttributeAtIndex(int index) {
return headerTableAttributes[index];
}
-
+
public final int getHeaderNext(int attributeValue) {
int index = getHeaderIndex(attributeValue);
return headerTableProperties[index][HT_NEXT];
}
-
+
public final long getHeaderSupportCount(int attributeValue) {
int index = getHeaderIndex(attributeValue);
return headerTableAttributeCount[index];
}
-
+
public final int[] getHeaderTableAttributes() {
int[] attributes = new int[headerTableCount];
System.arraycopy(headerTableAttributes, 0, attributes, 0, headerTableCount);
return attributes;
}
-
+
public final int getHeaderTableCount() {
return headerTableCount;
}
-
+
public final boolean isEmpty() {
return nodes <= 1;
}
-
+
public final int next(int nodeId) {
return next[nodeId];
}
-
+
public final int parent(int nodeId) {
return parent[nodeId];
}
-
+
public final void removeHeaderNext(int attributeValue) {
int index = getHeaderIndex(attributeValue);
headerTableProperties[index][HT_NEXT] = -1;
}
-
+
public final void reorderHeaderTable() {
// Arrays.sort(headerTableAttributes, 0, headerTableCount);
int i = 0;
@@ -286,7 +291,7 @@
headerTableAttributes[i++] = attr;
}
}
-
+
public void replaceChild(int parentNodeId, int replacableNode, int childnodeId) {
for (int i = 0, j = childCount[parentNodeId]; i < j; i++) {
if (nodeChildren[parentNodeId][i] == replacableNode) {
@@ -295,7 +300,7 @@
}
}
}
-
+
public final boolean setConditional(int nodeId, int conditionalNode) {
if (nodeId < nodes) {
this.conditional[nodeId] = conditionalNode;
@@ -303,7 +308,7 @@
}
return false;
}
-
+
public final boolean setNext(int nodeId, int nextNode) {
if (nodeId < nodes) {
this.next[nodeId] = nextNode;
@@ -311,11 +316,11 @@
}
return false;
}
-
+
public final boolean setParent(int nodeId, int parentNode) {
if (nodeId < nodes) {
this.parent[nodeId] = parentNode;
-
+
int length = childCount[parentNode];
if (length >= nodeChildren[parentNode].length) {
resizeChildren(parentNode);
@@ -326,15 +331,15 @@
}
return false;
}
-
+
public final void setSinglePath(boolean bit) {
singlePath = bit;
}
-
+
public final boolean singlePath() {
return singlePath;
}
-
+
private int getHeaderIndex(int attributeValue) {
if (attributeValue >= headerTableLookup.length) {
resizeHeaderLookup(attributeValue);
@@ -357,13 +362,13 @@
}
return index;
}
-
+
private void resize() {
int size = (int) (GROWTH_RATE * nodes);
if (size < DEFAULT_INITIAL_SIZE) {
size = DEFAULT_INITIAL_SIZE;
}
-
+
int[] oldChildCount = childCount;
int[] oldAttribute = attribute;
long[] oldnodeCount = nodeCount;
@@ -371,16 +376,16 @@
int[] oldNext = next;
int[][] oldNodeChildren = nodeChildren;
int[] oldConditional = conditional;
-
+
childCount = new int[size];
attribute = new int[size];
nodeCount = new long[size];
parent = new int[size];
next = new int[size];
-
+
nodeChildren = new int[size][];
conditional = new int[size];
-
+
System.arraycopy(oldChildCount, 0, this.childCount, 0, nodes);
System.arraycopy(oldAttribute, 0, this.attribute, 0, nodes);
System.arraycopy(oldnodeCount, 0, this.nodeCount, 0, nodes);
@@ -389,7 +394,7 @@
System.arraycopy(oldNodeChildren, 0, this.nodeChildren, 0, nodes);
System.arraycopy(oldConditional, 0, this.conditional, 0, nodes);
}
-
+
private void resizeChildren(int nodeId) {
int length = childCount[nodeId];
int size = (int) (GROWTH_RATE * length);
@@ -400,7 +405,7 @@
nodeChildren[nodeId] = new int[size];
System.arraycopy(oldNodeChildren, 0, this.nodeChildren[nodeId], 0, length);
}
-
+
private void resizeHeaderLookup(int attributeValue) {
int size = (int) (attributeValue * GROWTH_RATE);
int[] oldLookup = headerTableLookup;
@@ -408,13 +413,13 @@
Arrays.fill(headerTableLookup, oldLookup.length, size, -1);
System.arraycopy(oldLookup, 0, this.headerTableLookup, 0, oldLookup.length);
}
-
+
private void resizeHeaderTable() {
int size = (int) (GROWTH_RATE * headerTableCount);
if (size < DEFAULT_HEADER_TABLE_INITIAL_SIZE) {
size = DEFAULT_HEADER_TABLE_INITIAL_SIZE;
}
-
+
int[] oldAttributes = headerTableAttributes;
long[] oldAttributeCount = headerTableAttributeCount;
int[][] oldProperties = headerTableProperties;
@@ -422,10 +427,10 @@
headerTableAttributeCount = new long[size];
headerTableProperties = new int[size][];
System.arraycopy(oldAttributes, 0, this.headerTableAttributes, 0,
- headerTableCount);
+ headerTableCount);
System.arraycopy(oldAttributeCount, 0, this.headerTableAttributeCount, 0,
- headerTableCount);
+ headerTableCount);
System.arraycopy(oldProperties, 0, this.headerTableProperties, 0,
- headerTableCount);
+ headerTableCount);
}
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPTreeDepthCache.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPTreeDepthCache.java?rev=906896&r1=906895&r2=906896&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPTreeDepthCache.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPTreeDepthCache.java Fri Feb 5 11:00:25 2010
@@ -17,40 +17,47 @@
package org.apache.mahout.fpm.pfpgrowth.fpgrowth;
-import org.apache.mahout.common.cache.LeastKCache;
-
import java.util.ArrayList;
+import java.util.List;
+import org.apache.mahout.common.cache.LeastKCache;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+/**
+ * Caches large FPTree {@link Object} for each level of the recursive
+ * {@link FPGrowth} algorithm to reduce allocation overhead.
+ *
+ */
public class FPTreeDepthCache {
-
+
private static int firstLevelCacheSize = 5;
-
- private static final Logger log = LoggerFactory.getLogger(FPTreeDepthCache.class);
-
- private final LeastKCache<Integer, FPTree> firstLevelCache = new LeastKCache<Integer, FPTree>(
- firstLevelCacheSize);
-
- private int hits = 0;
-
- private int misses = 0;
-
- private final ArrayList<FPTree> treeCache = new ArrayList<FPTree>();
-
+
+ private static final Logger log = LoggerFactory.getLogger(
+ FPTreeDepthCache.class);
+
+ private final LeastKCache<Integer,FPTree> firstLevelCache
+ = new LeastKCache<Integer,FPTree>(firstLevelCacheSize);
+
+ private int hits;
+
+ private int misses;
+
+ private final List<FPTree> treeCache = new ArrayList<FPTree>();
+
public FPTreeDepthCache() {
- log.info("Initializing FPTreeCache with firstLevelCacheSize: {}", firstLevelCacheSize);
+ log.info("Initializing FPTreeCache with firstLevelCacheSize: {}",
+ firstLevelCacheSize);
}
public static int getFirstLevelCacheSize() {
return firstLevelCacheSize;
}
-
+
public static void setFirstLevelCacheSize(int firstLevelCacheSize) {
FPTreeDepthCache.firstLevelCacheSize = firstLevelCacheSize;
}
-
+
public final FPTree getFirstLevelTree(int attr) {
Integer attribute = attr;
if (firstLevelCache.contains(attribute)) {
@@ -63,15 +70,15 @@
return conditionalTree;
}
}
-
+
public final int getHits() {
return hits;
}
-
+
public final int getMisses() {
return misses;
}
-
+
public final FPTree getTree(int level) {
while (treeCache.size() < level + 1) {
FPTree cTree = new FPTree();
@@ -81,5 +88,5 @@
conditionalTree.clear();
return conditionalTree;
}
-
+
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FrequentPatternMaxHeap.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FrequentPatternMaxHeap.java?rev=906896&r1=906895&r2=906896&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FrequentPatternMaxHeap.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FrequentPatternMaxHeap.java Fri Feb 5 11:00:25 2010
@@ -25,24 +25,24 @@
/** {@link FrequentPatternMaxHeap} keeps top K Attributes in a TreeSet */
public final class FrequentPatternMaxHeap {
-
- private int count = 0;
-
- private Pattern least = null;
-
- private int maxSize = 0;
-
- private boolean subPatternCheck = false;
-
- private Map<Long, Set<Pattern>> patternIndex = null;
-
- private PriorityQueue<Pattern> queue = null;
-
+
+ private int count;
+
+ private Pattern least;
+
+ private int maxSize;
+
+ private boolean subPatternCheck;
+
+ private Map<Long,Set<Pattern>> patternIndex;
+
+ private PriorityQueue<Pattern> queue;
+
public FrequentPatternMaxHeap(int numResults, boolean subPatternCheck) {
maxSize = numResults;
queue = new PriorityQueue<Pattern>(maxSize);
this.subPatternCheck = subPatternCheck;
- patternIndex = new HashMap<Long, Set<Pattern>>();
+ patternIndex = new HashMap<Long,Set<Pattern>>();
for (Pattern p : queue) {
Long index = p.support();
Set<Pattern> patternList;
@@ -52,22 +52,22 @@
}
patternList = patternIndex.get(index);
patternList.add(p);
-
+
}
}
-
+
public boolean addable(long support) {
if (count < maxSize) {
return true;
}
return least.support() <= support;
}
-
+
public PriorityQueue<Pattern> getHeap() {
if (subPatternCheck) {
PriorityQueue<Pattern> ret = new PriorityQueue<Pattern>(maxSize);
for (Pattern p : queue) {
-
+
if (patternIndex.get(p.support()).contains(p)) {
ret.add(p);
}
@@ -77,8 +77,10 @@
return queue;
}
}
-
- public void addAll(FrequentPatternMaxHeap patterns, int attribute, long attributeSupport) {
+
+ public void addAll(FrequentPatternMaxHeap patterns,
+ int attribute,
+ long attributeSupport) {
for (Pattern pattern : patterns.getHeap()) {
long support = Math.min(attributeSupport, pattern.support());
if (this.addable(support)) {
@@ -87,11 +89,11 @@
}
}
}
-
+
public void insert(Pattern frequentPattern) {
if (frequentPattern.length() == 0) {
return;
- }
+ }
if (count == maxSize) {
if (frequentPattern.compareTo(least) > 0) {
@@ -101,7 +103,7 @@
if (subPatternCheck) {
patternIndex.get(evictedItem.support()).remove(evictedItem);
}
-
+
}
}
} else {
@@ -117,22 +119,22 @@
}
}
}
-
+
public int count() {
return count;
}
-
+
public boolean isFull() {
return count == maxSize;
}
-
+
public long leastSupport() {
if (least == null) {
return 0;
}
return least.support();
}
-
+
private boolean addPattern(Pattern frequentPattern) {
if (subPatternCheck == false) {
queue.add(frequentPattern);
@@ -154,7 +156,8 @@
}
if (replace) {
indexSet.remove(replacablePattern);
- if (indexSet.contains(frequentPattern) == false && queue.add(frequentPattern)) {
+ if (indexSet.contains(frequentPattern) == false
+ && queue.add(frequentPattern)) {
indexSet.add(frequentPattern);
}
@@ -172,7 +175,7 @@
}
patternList = patternIndex.get(index);
patternList.add(frequentPattern);
-
+
return true;
}
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/Pattern.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/Pattern.java?rev=906896&r1=906895&r2=906896&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/Pattern.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/Pattern.java Fri Feb 5 11:00:25 2010
@@ -19,28 +19,33 @@
import java.util.Arrays;
+/**
+ * A {@link Pattern} in FPGrowth is a list of items (here int) and the
+ * support(the number of times the pattern is seen in the dataset)
+ *
+ */
public class Pattern implements Comparable<Pattern> {
-
+
private static final int DEFAULT_INITIAL_SIZE = 2;
-
+
private static final float GROWTH_RATE = 1.5f;
-
+
private boolean dirty = true;
-
+
private int hashCode;
-
- private int length = 0;
-
+
+ private int length;
+
private int[] pattern;
-
+
private long support = Long.MAX_VALUE;
-
+
private long[] supportValues;
-
+
public Pattern() {
this(DEFAULT_INITIAL_SIZE);
}
-
+
private Pattern(int size) {
if (size < DEFAULT_INITIAL_SIZE) {
size = DEFAULT_INITIAL_SIZE;
@@ -49,7 +54,7 @@
this.supportValues = new long[size];
dirty = true;
}
-
+
public final void add(int id, long supportCount) {
if (length >= pattern.length) {
resize();
@@ -59,7 +64,7 @@
this.support = (supportCount > this.support) ? this.support : supportCount;
dirty = true;
}
-
+
@Override
public boolean equals(Object obj) {
if (this == obj) {
@@ -80,15 +85,15 @@
}
return Arrays.equals(pattern, other.pattern);
}
-
+
public final int[] getPattern() {
return this.pattern;
}
-
+
public final Object[] getPatternWithSupport() {
return new Object[] {this.pattern, this.supportValues};
}
-
+
@Override
public int hashCode() {
if (dirty == false) {
@@ -101,7 +106,7 @@
hashCode = result;
return result;
}
-
+
public final boolean isSubPatternOf(Pattern frequentPattern) {
int[] otherPattern = frequentPattern.getPattern();
int otherLength = frequentPattern.length();
@@ -122,22 +127,22 @@
}
return otherI != otherLength || i == length;
}
-
+
public final int length() {
return this.length;
}
-
+
public final long support() {
return this.support;
}
-
+
@Override
public final String toString() {
int[] arr = new int[length];
System.arraycopy(pattern, 0, arr, 0, length);
return Arrays.toString(arr) + '-' + support;
}
-
+
private void resize() {
int size = (int) (GROWTH_RATE * length);
if (size < DEFAULT_INITIAL_SIZE) {
@@ -150,7 +155,7 @@
System.arraycopy(oldpattern, 0, this.pattern, 0, length);
System.arraycopy(oldSupport, 0, this.supportValues, 0, length);
}
-
+
@Override
public int compareTo(Pattern cr2) {
long support2 = cr2.support();
@@ -170,5 +175,5 @@
}
}
}
-
+
}
Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthTest.java?rev=906896&r1=906895&r2=906896&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthTest.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthTest.java Fri Feb 5 11:00:25 2010
@@ -34,7 +34,7 @@
import org.apache.mahout.common.Pair;
import org.apache.mahout.fpm.pfpgrowth.convertors.ContextStatusUpdater;
import org.apache.mahout.fpm.pfpgrowth.convertors.SequenceFileOutputCollector;
-import org.apache.mahout.fpm.pfpgrowth.convertors.string.StringOutputConvertor;
+import org.apache.mahout.fpm.pfpgrowth.convertors.string.StringOutputConverter;
import org.apache.mahout.fpm.pfpgrowth.convertors.string.TopKStringPatterns;
import org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPGrowth;
@@ -64,7 +64,7 @@
SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, Text.class,
TopKStringPatterns.class);
fp.generateTopKFrequentPatterns(transactions.iterator(), fp.generateFList(transactions.iterator(), 3),
- 3, 100, new HashSet<String>(), new StringOutputConvertor(
+ 3, 100, new HashSet<String>(), new StringOutputConverter(
new SequenceFileOutputCollector<Text, TopKStringPatterns>(writer)),
new ContextStatusUpdater(null));
writer.close();
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthJob.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthJob.java?rev=906896&r1=906895&r2=906896&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthJob.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthJob.java Fri Feb 5 11:00:25 2010
@@ -45,7 +45,7 @@
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.fpm.pfpgrowth.convertors.ContextStatusUpdater;
import org.apache.mahout.fpm.pfpgrowth.convertors.SequenceFileOutputCollector;
-import org.apache.mahout.fpm.pfpgrowth.convertors.string.StringOutputConvertor;
+import org.apache.mahout.fpm.pfpgrowth.convertors.string.StringOutputConverter;
import org.apache.mahout.fpm.pfpgrowth.convertors.string.TopKStringPatterns;
import org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPGrowth;
import org.slf4j.Logger;
@@ -201,7 +201,7 @@
fp.generateTopKFrequentPatterns(new StringRecordIterator(new FileLineIterable(new File(input),
encoding, false), pattern), fp.generateFList(new StringRecordIterator(new FileLineIterable(
new File(input), encoding, false), pattern), minSupport), minSupport, maxHeapSize, features,
- new StringOutputConvertor(new SequenceFileOutputCollector<Text, TopKStringPatterns>(writer)),
+ new StringOutputConverter(new SequenceFileOutputCollector<Text, TopKStringPatterns>(writer)),
new ContextStatusUpdater(null));
writer.close();
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/example/dataset/KeyBasedStringTupleMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/example/dataset/KeyBasedStringTupleMapper.java?rev=906896&r1=906895&r2=906896&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/example/dataset/KeyBasedStringTupleMapper.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/example/dataset/KeyBasedStringTupleMapper.java Fri Feb 5 11:00:25 2010
@@ -30,19 +30,28 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-public class KeyBasedStringTupleMapper extends Mapper<LongWritable, Text, Text, StringTuple> {
-
- private static final Logger log = LoggerFactory.getLogger(KeyBasedStringTupleMapper.class);
-
- private Pattern splitter = null;
-
- private int[] selectedFields = null;
-
- private int[] groupingFields = null;
-
+/**
+ * Splits the line using a {@link Pattern} and outputs key as given by the
+ * groupingFields
+ *
+ */
+public class KeyBasedStringTupleMapper extends
+ Mapper<LongWritable,Text,Text,StringTuple> {
+
+ private static final Logger log = LoggerFactory
+ .getLogger(KeyBasedStringTupleMapper.class);
+
+ private Pattern splitter;
+
+ private int[] selectedFields;
+
+ private int[] groupingFields;
+
@Override
- protected void map(LongWritable key, Text value, Context context) throws IOException,
- InterruptedException {
+ protected void map(LongWritable key,
+ Text value,
+ Context context) throws IOException,
+ InterruptedException {
String[] fields = splitter.split(value.toString());
if (fields.length != 4) {
log.info("{} {}", fields.length, value.toString());
@@ -50,37 +59,43 @@
return;
}
List<String> oKey = new ArrayList<String>();
- for (int i = 0, groupingFieldCount = groupingFields.length; i < groupingFieldCount; i++) {
+ for (int i = 0, groupingFieldCount = groupingFields.length;
+ i < groupingFieldCount; i++) {
oKey.add(fields[groupingFields[i]]);
context.setStatus(fields[groupingFields[i]]);
}
-
+
List<String> oValue = new ArrayList<String>();
- for (int i = 0, selectedFieldCount = selectedFields.length; i < selectedFieldCount; i++) {
+ for (int i = 0, selectedFieldCount = selectedFields.length;
+ i < selectedFieldCount; i++) {
oValue.add(fields[selectedFields[i]]);
}
-
+
context.write(new Text(oKey.toString()), new StringTuple(oValue));
-
+
}
-
+
@Override
- protected void setup(Context context) throws IOException, InterruptedException {
+ protected void setup(Context context) throws IOException,
+ InterruptedException {
super.setup(context);
- Parameters params = Parameters.fromString(context.getConfiguration().get("job.parameters", ""));
+ Parameters params = Parameters.fromString(
+ context.getConfiguration().get("job.parameters", ""));
splitter = Pattern.compile(params.get("splitPattern", "[ \t]*\t[ \t]*"));
-
- int selectedFieldCount = Integer.valueOf(params.get("selectedFieldCount", "0"));
+
+ int selectedFieldCount = Integer.valueOf(
+ params.get("selectedFieldCount", "0"));
selectedFields = new int[selectedFieldCount];
for (int i = 0; i < selectedFieldCount; i++) {
selectedFields[i] = Integer.valueOf(params.get("field" + i, "0"));
}
-
- int groupingFieldCount = Integer.valueOf(params.get("groupingFieldCount", "0"));
+
+ int groupingFieldCount = Integer.valueOf(
+ params.get("groupingFieldCount", "0"));
groupingFields = new int[groupingFieldCount];
for (int i = 0; i < groupingFieldCount; i++) {
groupingFields[i] = Integer.valueOf(params.get("gfield" + i, "0"));
}
-
+
}
}
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/example/dataset/KeyBasedStringTupleReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/example/dataset/KeyBasedStringTupleReducer.java?rev=906896&r1=906895&r2=906896&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/example/dataset/KeyBasedStringTupleReducer.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/example/dataset/KeyBasedStringTupleReducer.java Fri Feb 5 11:00:25 2010
@@ -26,15 +26,18 @@
import org.apache.mahout.common.Parameters;
import org.apache.mahout.common.StringTuple;
-public class KeyBasedStringTupleReducer extends Reducer<Text, StringTuple, Text, Text> {
-
+public class KeyBasedStringTupleReducer extends
+ Reducer<Text,StringTuple,Text,Text> {
+
private int maxTransactionLength = 100;
-
+
@Override
- protected void reduce(Text key, Iterable<StringTuple> values, Context context) throws IOException,
- InterruptedException {
+ protected void reduce(Text key,
+ Iterable<StringTuple> values,
+ Context context) throws IOException,
+ InterruptedException {
Set<String> items = new HashSet<String>();
-
+
for (StringTuple value : values) {
for (String field : value.getEntries()) {
items.add(field);
@@ -52,23 +55,26 @@
sb.replace(0, sb.length(), "");
sep = "";
}
-
+
sb.append(sep).append(field);
sep = "\t";
-
+
i++;
-
+
}
if (sb.length() > 0) {
context.write(null, new Text(sb.toString()));
}
}
}
-
+
@Override
- protected void setup(Context context) throws IOException, InterruptedException {
+ protected void setup(Context context) throws IOException,
+ InterruptedException {
super.setup(context);
- Parameters params = Parameters.fromString(context.getConfiguration().get("job.parameters", ""));
- maxTransactionLength = Integer.valueOf(params.get("maxTransactionLength", "100"));
+ Parameters params = Parameters.fromString(context.getConfiguration().get(
+ "job.parameters", ""));
+ maxTransactionLength = Integer.valueOf(params.get("maxTransactionLength",
+ "100"));
}
}