You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ro...@apache.org on 2010/02/13 18:56:01 UTC
svn commit: r909861 [1/4] - in /lucene/mahout/trunk/utils/src:
main/java/org/apache/mahout/clustering/lda/
main/java/org/apache/mahout/text/ main/java/org/apache/mahout/utils/
main/java/org/apache/mahout/utils/clustering/
main/java/org/apache/mahout/ut...
Author: robinanil
Date: Sat Feb 13 17:55:56 2010
New Revision: 909861
URL: http://svn.apache.org/viewvc?rev=909861&view=rev
Log:
MAHOUT-291
Code Cleanup in Utils
Modified:
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocCombiner.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/Gram.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducer.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollector.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/SequenceFileVectorIterable.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TF.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TFIDF.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Weight.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMergeReducer.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterTermInfoWriter.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/TermInfoWriter.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentProcessor.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountMapper.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountReducer.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermDocumentCountMapper.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermDocumentCountReducer.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFPartialVectorReducer.java
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/dirichlet/TestL1ModelClustering.java
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapperTest.java
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducerTest.java
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/GramTest.java
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducerTest.java
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollectorTest.java
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/RandomVectorIterable.java
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/SequenceFileVectorIterableTest.java
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java Sat Feb 13 17:55:56 2010
@@ -27,7 +27,6 @@
import java.util.List;
import java.util.PriorityQueue;
import java.util.Queue;
-import java.util.regex.Pattern;
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
@@ -50,9 +49,8 @@
* Class to print out the top K words for each topic.
*/
public class LDAPrintTopics {
- private static final Pattern TAB_PATTERN = Pattern.compile("\t");
- private LDAPrintTopics() {}
+ private LDAPrintTopics() { }
private static class StringDoublePair implements Comparable<StringDoublePair> {
private final double score;
@@ -85,9 +83,9 @@
}
public static List<List<String>> topWordsForTopics(String dir,
- Configuration job,
- List<String> wordList,
- int numWordsToPrint) throws IOException {
+ Configuration job,
+ List<String> wordList,
+ int numWordsToPrint) throws IOException {
FileSystem fs = new Path(dir).getFileSystem(job);
List<PriorityQueue<StringDoublePair>> queues = new ArrayList<PriorityQueue<StringDoublePair>>();
@@ -101,11 +99,11 @@
int topic = key.getX();
int word = key.getY();
- ensureQueueSize(queues, topic);
+ LDAPrintTopics.ensureQueueSize(queues, topic);
if (word >= 0 && topic >= 0) {
double score = value.get();
String realWord = wordList.get(word);
- maybeEnqueue(queues.get(topic), realWord, score, numWordsToPrint);
+ LDAPrintTopics.maybeEnqueue(queues.get(topic), realWord, score, numWordsToPrint);
}
}
reader.close();
@@ -149,42 +147,42 @@
GroupBuilder gbuilder = new GroupBuilder();
Option inputOpt = obuilder.withLongName("input").withRequired(true)
- .withArgument(
- abuilder.withName("input").withMinimum(1).withMaximum(1).create())
- .withDescription("Path to an LDA output (a state)").withShortName("i")
- .create();
+ .withArgument(
+ abuilder.withName("input").withMinimum(1).withMaximum(1).create())
+ .withDescription("Path to an LDA output (a state)").withShortName("i")
+ .create();
Option dictOpt = obuilder.withLongName("dict").withRequired(true)
- .withArgument(
- abuilder.withName("dict").withMinimum(1).withMaximum(1).create())
- .withDescription(
- "Dictionary to read in, in the same format as one created by "
- + "org.apache.mahout.utils.vectors.lucene.Driver").withShortName(
- "d").create();
+ .withArgument(
+ abuilder.withName("dict").withMinimum(1).withMaximum(1).create())
+ .withDescription(
+ "Dictionary to read in, in the same format as one created by "
+ + "org.apache.mahout.utils.vectors.lucene.Driver").withShortName(
+ "d").create();
Option outOpt = obuilder.withLongName("output").withRequired(true)
- .withArgument(
- abuilder.withName("output").withMinimum(1).withMaximum(1).create())
- .withDescription("Output directory to write top words").withShortName(
- "o").create();
+ .withArgument(
+ abuilder.withName("output").withMinimum(1).withMaximum(1).create())
+ .withDescription("Output directory to write top words").withShortName(
+ "o").create();
Option wordOpt = obuilder.withLongName("words").withRequired(false)
- .withArgument(
- abuilder.withName("words").withMinimum(0).withMaximum(1).withDefault(
- "20").create()).withDescription("Number of words to print")
- .withShortName("w").create();
+ .withArgument(
+ abuilder.withName("words").withMinimum(0).withMaximum(1).withDefault(
+ "20").create()).withDescription("Number of words to print")
+ .withShortName("w").create();
Option dictTypeOpt = obuilder.withLongName("dictionaryType").withRequired(
false).withArgument(
- abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1)
- .create()).withDescription(
- "The dictionary file type (text|sequencefile)").withShortName("dt")
+ abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1)
+ .create()).withDescription(
+ "The dictionary file type (text|sequencefile)").withShortName("dt")
.create();
Option helpOpt = obuilder.withLongName("help").withDescription(
- "Print out help").withShortName("h").create();
+ "Print out help").withShortName("h").create();
Group group = gbuilder.withName("Options").withOption(dictOpt).withOption(
outOpt).withOption(wordOpt).withOption(inputOpt).withOption(dictTypeOpt)
- .create();
+ .create();
try {
Parser parser = new Parser();
parser.setGroup(group);
@@ -212,7 +210,7 @@
List<String> wordList;
if (dictionaryType.equals("text")) {
wordList = Arrays.asList(VectorHelper.loadTermDictionary(new File(
- dictFile)));
+ dictFile)));
} else if (dictionaryType.equals("sequencefile")) {
FileSystem fs = FileSystem.get(new Path(dictFile).toUri(), config);
wordList = Arrays.asList(VectorHelper.loadTermDictionary(config, fs,
@@ -221,7 +219,7 @@
throw new IllegalArgumentException("Invalid dictionary format");
}
- List<List<String>> topWords = topWordsForTopics(input, config, wordList,
+ List<List<String>> topWords = LDAPrintTopics.topWordsForTopics(input, config, wordList,
numWords);
if (!output.exists()) {
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java Sat Feb 13 17:55:56 2010
@@ -43,8 +43,7 @@
*/
public final class SparseVectorsFromSequenceFiles {
- private static final Logger log = LoggerFactory
- .getLogger(SparseVectorsFromSequenceFiles.class);
+ private static final Logger log = LoggerFactory.getLogger(SparseVectorsFromSequenceFiles.class);
private SparseVectorsFromSequenceFiles() {}
@@ -53,108 +52,74 @@
ArgumentBuilder abuilder = new ArgumentBuilder();
GroupBuilder gbuilder = new GroupBuilder();
- Option inputDirOpt = obuilder.withLongName("input").withRequired(true)
- .withArgument(
- abuilder.withName("input").withMinimum(1).withMaximum(1).create())
- .withDescription(
- "input dir containing the documents in sequence file format")
- .withShortName("i").create();
-
- Option outputDirOpt = obuilder
- .withLongName("output")
- .withRequired(true)
- .withArgument(
- abuilder.withName("output").withMinimum(1).withMaximum(1).create())
- .withDescription("The output directory").withShortName("o").create();
+ Option inputDirOpt = obuilder.withLongName("input").withRequired(true).withArgument(
+ abuilder.withName("input").withMinimum(1).withMaximum(1).create()).withDescription(
+ "input dir containing the documents in sequence file format").withShortName("i").create();
+
+ Option outputDirOpt = obuilder.withLongName("output").withRequired(true).withArgument(
+ abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The output directory").withShortName("o").create();
Option minSupportOpt = obuilder.withLongName("minSupport").withArgument(
- abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
- .withDescription("(Optional) Minimum Support. Default Value: 2")
- .withShortName("s").create();
-
- Option analyzerNameOpt = obuilder.withLongName("analyzerName")
- .withArgument(
- abuilder.withName("analyzerName").withMinimum(1).withMaximum(1)
- .create()).withDescription("The class name of the analyzer")
- .withShortName("a").create();
+ abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create()).withDescription(
+ "(Optional) Minimum Support. Default Value: 2").withShortName("s").create();
+
+ Option analyzerNameOpt = obuilder.withLongName("analyzerName").withArgument(
+ abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The class name of the analyzer").withShortName("a").create();
Option chunkSizeOpt = obuilder.withLongName("chunkSize").withArgument(
- abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
- .withDescription("The chunkSize in MegaBytes. 100-10000 MB")
- .withShortName("chunk").create();
-
- Option weightOpt = obuilder.withLongName("weight").withRequired(false)
- .withArgument(
- abuilder.withName("weight").withMinimum(1).withMaximum(1).create())
- .withDescription("The kind of weight to use. Currently TF or TFIDF")
- .withShortName("wt").create();
-
- Option minDFOpt = obuilder.withLongName("minDF").withRequired(false)
- .withArgument(
- abuilder.withName("minDF").withMinimum(1).withMaximum(1).create())
- .withDescription("The minimum document frequency. Default is 1")
- .withShortName("md").create();
+ abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The chunkSize in MegaBytes. 100-10000 MB").withShortName("chunk").create();
+
+ Option weightOpt = obuilder.withLongName("weight").withRequired(false).withArgument(
+ abuilder.withName("weight").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The kind of weight to use. Currently TF or TFIDF").withShortName("wt").create();
+
+ Option minDFOpt = obuilder.withLongName("minDF").withRequired(false).withArgument(
+ abuilder.withName("minDF").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The minimum document frequency. Default is 1").withShortName("md").create();
Option maxDFPercentOpt = obuilder
.withLongName("maxDFPercent")
.withRequired(false)
- .withArgument(
- abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1)
- .create())
+ .withArgument(abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create())
.withDescription(
"The max percentage of docs for the DF. Can be used to remove really high frequency terms. Expressed as an integer between 0 and 100. Default is 99.")
.withShortName("x").create();
- Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false)
- .withArgument(
- abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create())
- .withDescription(
- "(Optional)The minimum Log Likelihood Ratio(Float) Default is "
- + LLRReducer.DEFAULT_MIN_LLR).withShortName("ml").create();
-
- Option numReduceTasksOpt = obuilder.withLongName("numReducers")
- .withArgument(
- abuilder.withName("numReducers").withMinimum(1).withMaximum(1)
- .create()).withDescription(
- "(Optional) Number of reduce tasks. Default Value: 1").withShortName(
- "nr").create();
-
- Option powerOpt = obuilder
- .withLongName("norm")
- .withRequired(false)
- .withArgument(
- abuilder.withName("norm").withMinimum(1).withMaximum(1).create())
- .withDescription(
- "The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm. "
- + "Must be greater or equal to 0. The default is not to normalize")
- .withShortName("n").create();
- Option maxNGramSizeOpt = obuilder
- .withLongName("maxNGramSize")
- .withRequired(false)
- .withArgument(
- abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
+ Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false).withArgument(
+ abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create()).withDescription(
+ "(Optional)The minimum Log Likelihood Ratio(Float) Default is " + LLRReducer.DEFAULT_MIN_LLR)
+ .withShortName("ml").create();
+
+ Option numReduceTasksOpt = obuilder.withLongName("numReducers").withArgument(
+ abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create()).withDescription(
+ "(Optional) Number of reduce tasks. Default Value: 1").withShortName("nr").create();
+
+ Option powerOpt = obuilder.withLongName("norm").withRequired(false).withArgument(
+ abuilder.withName("norm").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm. "
+ + "Must be greater or equal to 0. The default is not to normalize").withShortName("n").create();
+ Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize").withRequired(false).withArgument(
+ abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
.withDescription(
"(Optional) The maximum size of ngrams to create"
- + " (2 = bigrams, 3 = trigrams, etc) Default Value:2")
- .withShortName("ng").create();
- Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector")
- .withRequired(false)
- .withDescription("(Optional) Whether output vectors should be SequentialAccessVectors If set true else false")
+ + " (2 = bigrams, 3 = trigrams, etc) Default Value:2").withShortName("ng").create();
+ Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector").withRequired(false)
+ .withDescription(
+ "(Optional) Whether output vectors should be SequentialAccessVectors If set true else false")
.withShortName("seq").create();
- Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(
- false).withDescription("If set, overwrite the output directory")
- .withShortName("w").create();
- Option helpOpt = obuilder.withLongName("help").withDescription(
- "Print out help").withShortName("h").create();
-
- Group group = gbuilder.withName("Options").withOption(minSupportOpt)
- .withOption(analyzerNameOpt).withOption(chunkSizeOpt).withOption(
- outputDirOpt).withOption(inputDirOpt).withOption(minDFOpt)
- .withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt)
- .withOption(minLLROpt).withOption(numReduceTasksOpt).withOption(
- maxNGramSizeOpt).withOption(overwriteOutput).withOption(helpOpt)
- .withOption(sequentialAccessVectorOpt)
+ Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false).withDescription(
+ "If set, overwrite the output directory").withShortName("w").create();
+ Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
.create();
+
+ Group group = gbuilder.withName("Options").withOption(minSupportOpt).withOption(analyzerNameOpt)
+ .withOption(chunkSizeOpt).withOption(outputDirOpt).withOption(inputDirOpt).withOption(minDFOpt)
+ .withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt).withOption(minLLROpt)
+ .withOption(numReduceTasksOpt).withOption(maxNGramSizeOpt).withOption(overwriteOutput).withOption(
+ helpOpt).withOption(sequentialAccessVectorOpt).create();
try {
Parser parser = new Parser();
parser.setGroup(group);
@@ -182,8 +147,7 @@
if (cmdLine.hasOption(maxNGramSizeOpt) == true) {
try {
- maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt)
- .toString());
+ maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
} catch (NumberFormatException ex) {
log.warn("Could not parse ngram size option");
}
@@ -202,8 +166,7 @@
int reduceTasks = 1;
if (cmdLine.hasOption(numReduceTasksOpt)) {
- reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt)
- .toString());
+ reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
}
log.info("Pass1 reduce tasks: {}", reduceTasks);
@@ -237,8 +200,7 @@
}
int maxDFPercent = 99;
if (cmdLine.hasOption(maxDFPercentOpt)) {
- maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt)
- .toString());
+ maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
}
float norm = PartialVectorMerger.NO_NORMALIZING;
@@ -251,23 +213,20 @@
}
}
HadoopUtil.overwriteOutput(outputDir);
- String tokenizedPath = outputDir
- + DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER;
- DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass,
- tokenizedPath);
-
+ String tokenizedPath = outputDir + DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER;
+ DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath);
+
boolean sequentialAccessOutput = false;
if (cmdLine.hasOption(sequentialAccessVectorOpt)) {
sequentialAccessOutput = true;
}
- DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir,
- minSupport, maxNGramSize, minLLRValue, reduceTasks, chunkSize, sequentialAccessOutput);
+ DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, minSupport, maxNGramSize,
+ minLLRValue, reduceTasks, chunkSize, sequentialAccessOutput);
if (processIdf) {
- TFIDFConverter.processTfIdf(
- outputDir + DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER,
- outputDir + TFIDFConverter.TFIDF_OUTPUT_FOLDER, chunkSize, minDf,
- maxDFPercent, norm, sequentialAccessOutput);
+ TFIDFConverter.processTfIdf(outputDir + DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER,
+ outputDir + TFIDFConverter.TFIDF_OUTPUT_FOLDER, chunkSize, minDf, maxDFPercent, norm,
+ sequentialAccessOutput);
}
} catch (OptionException e) {
log.error("Exception", e);
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java Sat Feb 13 17:55:56 2010
@@ -17,6 +17,11 @@
package org.apache.mahout.utils;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
@@ -36,51 +41,46 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-
public class SequenceFileDumper {
-
+
private static final Logger log = LoggerFactory.getLogger(SequenceFileDumper.class);
-
+
private SequenceFileDumper() {
}
-
+
public static void main(String[] args) throws IOException, IllegalAccessException, InstantiationException {
DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
ArgumentBuilder abuilder = new ArgumentBuilder();
GroupBuilder gbuilder = new GroupBuilder();
-
+
Option seqOpt = obuilder.withLongName("seqFile").withRequired(false).withArgument(
- abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).
- withDescription("The Sequence File containing the Clusters").withShortName("s").create();
+ abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).
+ withDescription("The Sequence File containing the Clusters").withShortName("s").create();
Option outputOpt = obuilder.withLongName("output").withRequired(false).withArgument(
- abuilder.withName("output").withMinimum(1).withMaximum(1).create()).
- withDescription("The output file. If not specified, dumps to the console").withShortName("o").create();
+ abuilder.withName("output").withMinimum(1).withMaximum(1).create()).
+ withDescription("The output file. If not specified, dumps to the console").withShortName("o").create();
Option substringOpt = obuilder.withLongName("substring").withRequired(false).withArgument(
- abuilder.withName("substring").withMinimum(1).withMaximum(1).create()).
- withDescription("The number of chars of the asFormatString() to print").withShortName("b").create();
+ abuilder.withName("substring").withMinimum(1).withMaximum(1).create()).
+ withDescription("The number of chars of the asFormatString() to print").withShortName("b").create();
Option countOpt = obuilder.withLongName("count").withRequired(false).
- withDescription("Report the count only").withShortName("c").create();
+ withDescription("Report the count only").withShortName("c").create();
Option helpOpt = obuilder.withLongName("help").
- withDescription("Print out help").withShortName("h").create();
-
+ withDescription("Print out help").withShortName("h").create();
+
Group group = gbuilder.withName("Options").withOption(seqOpt).withOption(outputOpt)
.withOption(substringOpt).withOption(countOpt).withOption(helpOpt).create();
-
+
try {
Parser parser = new Parser();
parser.setGroup(group);
CommandLine cmdLine = parser.parse(args);
-
+
if (cmdLine.hasOption(helpOpt)) {
-
- printHelp(group);
+
+ SequenceFileDumper.printHelp(group);
return;
}
-
+
if (cmdLine.hasOption(seqOpt)) {
Path path = new Path(cmdLine.getValue(seqOpt).toString());
JobClient client = new JobClient();
@@ -88,7 +88,7 @@
client.setConf(conf);
FileSystem fs = FileSystem.get(path.toUri(), conf);
SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
-
+
Writer writer;
if (cmdLine.hasOption(outputOpt)) {
writer = new FileWriter(cmdLine.getValue(outputOpt).toString());
@@ -96,7 +96,7 @@
writer = new OutputStreamWriter(System.out);
}
writer.append("Input Path: ").append(String.valueOf(path)).append('\n');
-
+
int sub = Integer.MAX_VALUE;
if (cmdLine.hasOption(substringOpt)) {
sub = Integer.parseInt(cmdLine.getValue(substringOpt).toString());
@@ -105,7 +105,7 @@
Writable key = (Writable) reader.getKeyClass().newInstance();
Writable value = (Writable) reader.getValueClass().newInstance();
writer.append("Key class: ").append(String.valueOf(reader.getKeyClass())).append(" Value Class: ")
- .append(String.valueOf(value.getClass())).append('\n');
+ .append(String.valueOf(value.getClass())).append('\n');
writer.flush();
long count = 0;
if (countOnly == false) {
@@ -129,14 +129,14 @@
writer.close();
}
}
-
+
} catch (OptionException e) {
- log.error("Exception", e);
- printHelp(group);
+ SequenceFileDumper.log.error("Exception", e);
+ SequenceFileDumper.printHelp(group);
}
-
+
}
-
+
private static void printHelp(Group group) {
HelpFormatter formatter = new HelpFormatter();
formatter.setGroup(group);
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java Sat Feb 13 17:55:56 2010
@@ -58,8 +58,7 @@
public final class ClusterDumper {
- private static final Logger log = LoggerFactory
- .getLogger(ClusterDumper.class);
+ private static final Logger log = LoggerFactory.getLogger(ClusterDumper.class);
private final String seqFileDir;
private final String pointsDir;
@@ -80,15 +79,13 @@
if (this.pointsDir != null) {
JobConf conf = new JobConf(Job.class);
// read in the points
- clusterIdToPoints = readPoints(this.pointsDir, conf);
+ clusterIdToPoints = ClusterDumper.readPoints(this.pointsDir, conf);
} else {
clusterIdToPoints = Collections.emptyMap();
}
}
- public void printClusters() throws IOException,
- InstantiationException,
- IllegalAccessException {
+ public void printClusters() throws IOException, InstantiationException, IllegalAccessException {
JobClient client = new JobClient();
JobConf conf = new JobConf(Job.class);
client.setConf(conf);
@@ -96,13 +93,10 @@
String[] dictionary = null;
if (this.termDictionary != null) {
if (dictionaryFormat.equals("text")) {
- dictionary = VectorHelper.loadTermDictionary(new File(
- this.termDictionary));
+ dictionary = VectorHelper.loadTermDictionary(new File(this.termDictionary));
} else if (dictionaryFormat.equals("sequencefile")) {
- FileSystem fs = FileSystem.get(new Path(this.termDictionary).toUri(),
- conf);
- dictionary = VectorHelper.loadTermDictionary(conf, fs,
- this.termDictionary);
+ FileSystem fs = FileSystem.get(new Path(this.termDictionary).toUri(), conf);
+ dictionary = VectorHelper.loadTermDictionary(conf, fs, this.termDictionary);
} else {
throw new IllegalArgumentException("Invalid dictionary format");
}
@@ -115,13 +109,12 @@
writer = new OutputStreamWriter(System.out);
}
- File[] seqFileList = new File(this.seqFileDir)
- .listFiles(new FilenameFilter() {
- @Override
- public boolean accept(File file, String name) {
- return name.endsWith(".crc") == false;
- }
- });
+ File[] seqFileList = new File(this.seqFileDir).listFiles(new FilenameFilter() {
+ @Override
+ public boolean accept(File file, String name) {
+ return name.endsWith(".crc") == false;
+ }
+ });
for (File seqFile : seqFileList) {
if (!seqFile.isFile()) {
continue;
@@ -134,27 +127,25 @@
ClusterBase value = (ClusterBase) reader.getValueClass().newInstance();
while (reader.next(key, value)) {
Vector center = value.getCenter();
- String fmtStr = useJSON ? center.asFormatString() : VectorHelper
- .vectorToString(center, dictionary);
+ String fmtStr = useJSON ? center.asFormatString() : VectorHelper.vectorToString(center, dictionary);
writer.append("Id: ").append(String.valueOf(value.getId())).append(":");
writer.append("name:").append(center.getName());
- if (subString > 0) writer.append(":").append(
- fmtStr.substring(0, Math.min(subString, fmtStr.length())));
+ if (subString > 0) {
+ writer.append(":").append(fmtStr.substring(0, Math.min(subString, fmtStr.length())));
+ }
writer.append('\n');
-
+
if (dictionary != null) {
- String topTerms = getTopFeatures(center, dictionary, 10);
+ String topTerms = ClusterDumper.getTopFeatures(center, dictionary, 10);
writer.write("\tTop Terms: ");
writer.write(topTerms);
writer.write('\n');
}
- List<String> points = clusterIdToPoints.get(String.valueOf(value
- .getId()));
+ List<String> points = clusterIdToPoints.get(String.valueOf(value.getId()));
if (points != null) {
writer.write("\tPoints: ");
- for (Iterator<String> iterator = points.iterator(); iterator
- .hasNext();) {
+ for (Iterator<String> iterator = points.iterator(); iterator.hasNext();) {
String point = iterator.next();
writer.append(point);
if (iterator.hasNext()) {
@@ -202,65 +193,40 @@
this.dictionaryFormat = dictionaryType;
}
- public static void main(String[] args) throws IOException,
- IllegalAccessException,
- InstantiationException {
+ public static void main(String[] args) throws IOException, IllegalAccessException, InstantiationException {
DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
ArgumentBuilder abuilder = new ArgumentBuilder();
GroupBuilder gbuilder = new GroupBuilder();
- Option seqOpt = obuilder.withLongName("seqFileDir").withRequired(false)
- .withArgument(
- abuilder.withName("seqFileDir").withMinimum(1).withMaximum(1)
- .create()).withDescription(
- "The directory containing Sequence Files for the Clusters")
- .withShortName("s").create();
- Option outputOpt = obuilder.withLongName("output").withRequired(false)
- .withArgument(
- abuilder.withName("output").withMinimum(1).withMaximum(1).create())
- .withDescription(
- "The output file. If not specified, dumps to the console")
- .withShortName("o").create();
- Option substringOpt = obuilder
- .withLongName("substring")
- .withRequired(false)
- .withArgument(
- abuilder.withName("substring").withMinimum(1).withMaximum(1).create())
- .withDescription("The number of chars of the asFormatString() to print")
- .withShortName("b").create();
- Option centroidJSonOpt = obuilder
- .withLongName("json")
- .withRequired(false)
- .withDescription(
- "Output the centroid as JSON. Otherwise it substitues in the terms for vector cell entries")
+ Option seqOpt = obuilder.withLongName("seqFileDir").withRequired(false).withArgument(
+ abuilder.withName("seqFileDir").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The directory containing Sequence Files for the Clusters").withShortName("s").create();
+ Option outputOpt = obuilder.withLongName("output").withRequired(false).withArgument(
+ abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The output file. If not specified, dumps to the console").withShortName("o").create();
+ Option substringOpt = obuilder.withLongName("substring").withRequired(false).withArgument(
+ abuilder.withName("substring").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The number of chars of the asFormatString() to print").withShortName("b").create();
+ Option centroidJSonOpt = obuilder.withLongName("json").withRequired(false).withDescription(
+ "Output the centroid as JSON. Otherwise it substitues in the terms for vector cell entries")
.withShortName("j").create();
- Option pointsOpt = obuilder
- .withLongName("pointsDir")
- .withRequired(false)
- .withArgument(
- abuilder.withName("pointsDir").withMinimum(1).withMaximum(1).create())
- .withDescription(
- "The directory containing points sequence files mapping input vectors to their cluster. "
- + "If specified, then the program will output the points associated with a cluster")
- .withShortName("p").create();
- Option dictOpt = obuilder.withLongName("dictionary").withRequired(false)
- .withArgument(
- abuilder.withName("dictionary").withMinimum(1).withMaximum(1)
- .create()).withDescription("The dictionary file. ")
- .withShortName("d").create();
- Option dictTypeOpt = obuilder.withLongName("dictionaryType").withRequired(
- false).withArgument(
- abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1)
- .create()).withDescription(
- "The dictionary file type (text|sequencefile)").withShortName("dt")
+ Option pointsOpt = obuilder.withLongName("pointsDir").withRequired(false).withArgument(
+ abuilder.withName("pointsDir").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The directory containing points sequence files mapping input vectors to their cluster. "
+ + "If specified, then the program will output the points associated with a cluster").withShortName(
+ "p").create();
+ Option dictOpt = obuilder.withLongName("dictionary").withRequired(false).withArgument(
+ abuilder.withName("dictionary").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The dictionary file. ").withShortName("d").create();
+ Option dictTypeOpt = obuilder.withLongName("dictionaryType").withRequired(false).withArgument(
+ abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The dictionary file type (text|sequencefile)").withShortName("dt").create();
+ Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
.create();
- Option helpOpt = obuilder.withLongName("help").withDescription(
- "Print out help").withShortName("h").create();
- Group group = gbuilder.withName("Options").withOption(helpOpt).withOption(
- seqOpt).withOption(outputOpt).withOption(substringOpt).withOption(
- pointsOpt).withOption(centroidJSonOpt).withOption(dictOpt).withOption(
- dictTypeOpt).create();
+ Group group = gbuilder.withName("Options").withOption(helpOpt).withOption(seqOpt).withOption(outputOpt)
+ .withOption(substringOpt).withOption(pointsOpt).withOption(centroidJSonOpt).withOption(dictOpt)
+ .withOption(dictTypeOpt).create();
try {
Parser parser = new Parser();
@@ -314,7 +280,7 @@
}
clusterDumper.printClusters();
} catch (OptionException e) {
- log.error("Exception", e);
+ ClusterDumper.log.error("Exception", e);
CommandLineUtil.printHelp(group);
}
}
@@ -323,8 +289,7 @@
this.useJSON = json;
}
- private static Map<String,List<String>> readPoints(String pointsPathDir,
- JobConf conf) throws IOException {
+ private static Map<String,List<String>> readPoints(String pointsPathDir, JobConf conf) throws IOException {
SortedMap<String,List<String>> result = new TreeMap<String,List<String>>();
File[] children = new File(pointsPathDir).listFiles(new FilenameFilter() {
@@ -358,9 +323,9 @@
pointList.add(key.toString());
}
} catch (InstantiationException e) {
- log.error("Exception", e);
+ ClusterDumper.log.error("Exception", e);
} catch (IllegalAccessException e) {
- log.error("Exception", e);
+ ClusterDumper.log.error("Exception", e);
}
}
@@ -377,9 +342,7 @@
}
}
- private static String getTopFeatures(Vector vector,
- String[] dictionary,
- int numTerms) {
+ private static String getTopFeatures(Vector vector, String[] dictionary, int numTerms) {
List<TermIndexWeight> vectorTerms = new ArrayList<TermIndexWeight>();
@@ -403,7 +366,7 @@
int index = vectorTerms.get(i).index;
String dictTerm = dictionary[index];
if (dictTerm == null) {
- log.error("Dictionary entry missing for {}", index);
+ ClusterDumper.log.error("Dictionary entry missing for {}", index);
continue;
}
topTerms.add(dictTerm);
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocCombiner.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocCombiner.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocCombiner.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocCombiner.java Sat Feb 13 17:55:56 2010
@@ -15,7 +15,6 @@
* limitations under the License.
*/
-
package org.apache.mahout.utils.nlp.collocations.llr;
import java.io.IOException;
@@ -29,56 +28,54 @@
import org.apache.mahout.utils.nlp.collocations.llr.Gram.Type;
/** Combiner for pass1 of the CollocationDriver */
-public class CollocCombiner extends MapReduceBase implements
- Reducer<Gram, Gram, Gram, Gram> {
-
- /** collocation finder: pass 1 colloc phase:
- *
- * given input from the mapper,
- * k:h_subgram:1 v:ngram:1
- * k:t_subgram:1 v:ngram:1
- *
- * count ngrams and subgrams.
- *
- * output is:
- *
- * k:h_subgram:subgramfreq v:ngram:ngramfreq
- * k:t_subgram:subgramfreq v:ngram:ngramfreq
- *
- * Each ngram's frequency is essentially counted twice, frequency should
- * be the same for the head and tail. Fix this to count only for the head
- * and move the count into the value?
+public class CollocCombiner extends MapReduceBase implements Reducer<Gram,Gram,Gram,Gram> {
+
+ /**
+ * collocation finder: pass 1 colloc phase:
+ *
+ * given input from the mapper, k:h_subgram:1 v:ngram:1 k:t_subgram:1 v:ngram:1
+ *
+ * count ngrams and subgrams.
+ *
+ * output is:
+ *
+ * k:h_subgram:subgramfreq v:ngram:ngramfreq k:t_subgram:subgramfreq v:ngram:ngramfreq
+ *
+ * Each ngram's frequency is essentially counted twice, frequency should be the same for the head and tail.
+ * Fix this to count only for the head and move the count into the value?
*/
@Override
- public void reduce(Gram subgramKey, Iterator<Gram> ngramValues,
- OutputCollector<Gram, Gram> output, Reporter reporter) throws IOException {
-
+ public void reduce(Gram subgramKey,
+ Iterator<Gram> ngramValues,
+ OutputCollector<Gram,Gram> output,
+ Reporter reporter) throws IOException {
+
HashMap<Gram,Gram> ngramSet = new HashMap<Gram,Gram>();
int subgramFrequency = 0;
-
+
while (ngramValues.hasNext()) {
Gram ngram = ngramValues.next();
subgramFrequency += ngram.getFrequency();
-
+
Gram ngramCanon = ngramSet.get(ngram);
if (ngramCanon == null) {
// t is potentially reused, so create a new object to populate the HashMap
Gram ngramEntry = new Gram(ngram);
- ngramSet.put(ngramEntry,ngramEntry);
- }
- else {
+ ngramSet.put(ngramEntry, ngramEntry);
+ } else {
ngramCanon.incrementFrequency(ngram.getFrequency());
}
}
-
+
// emit subgram:subgramFreq ngram:ngramFreq pairs
subgramKey.setFrequency(subgramFrequency);
-
- for (Gram ngram: ngramSet.keySet()) {
- if(subgramKey.getType() == Type.UNIGRAM)
+
+ for (Gram ngram : ngramSet.keySet()) {
+ if (subgramKey.getType() == Type.UNIGRAM) {
ngram.setType(subgramKey.getType());
+ }
output.collect(subgramKey, ngram);
}
}
-
+
}
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java Sat Feb 13 17:55:56 2010
@@ -51,7 +51,7 @@
public class CollocDriver {
public static final String DEFAULT_OUTPUT_DIRECTORY = "output";
public static final String SUBGRAM_OUTPUT_DIRECTORY = "subgrams";
- public static final String NGRAM_OUTPUT_DIRECTORY = "ngrams";
+ public static final String NGRAM_OUTPUT_DIRECTORY = "ngrams";
public static final String EMIT_UNIGRAMS = "emit-unigrams";
public static final boolean DEFAULT_EMIT_UNIGRAMS = false;
@@ -69,78 +69,57 @@
ArgumentBuilder abuilder = new ArgumentBuilder();
GroupBuilder gbuilder = new GroupBuilder();
- Option inputOpt = obuilder.withLongName("input").withRequired(true)
- .withArgument(
- abuilder.withName("input").withMinimum(1).withMaximum(1).create())
- .withDescription("The Path for input files.").withShortName("i")
- .create();
-
- Option outputOpt = obuilder.withLongName("output").withRequired(true)
- .withArgument(
- abuilder.withName("output").withMinimum(1).withMaximum(1).create())
- .withDescription("The Path write output to").withShortName("o")
- .create();
+ Option inputOpt = obuilder.withLongName("input").withRequired(true).withArgument(
+ abuilder.withName("input").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The Path for input files.").withShortName("i").create();
+
+ Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
+ abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The Path write output to").withShortName("o").create();
- Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize")
- .withRequired(false).withArgument(
- abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
+ Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize").withRequired(false).withArgument(
+ abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
.withDescription(
"(Optional) The maximum size of ngrams to create"
- + " (2 = bigrams, 3 = trigrams, etc) Default Value:2")
- .withShortName("ng").create();
+ + " (2 = bigrams, 3 = trigrams, etc) Default Value:2").withShortName("ng").create();
- Option minSupportOpt = obuilder.withLongName("minSupport")
- .withRequired(false).withArgument(
- abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
- .withDescription(
- "(Optional) Minimum Support. Default Value: "
- + CollocReducer.DEFAULT_MIN_SUPPORT).withShortName("s").create();
-
- Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false)
- .withArgument(
- abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create())
- .withDescription(
- "(Optional)The minimum Log Likelihood Ratio(Float) Default is "
- + LLRReducer.DEFAULT_MIN_LLR).withShortName("ml").create();
+ Option minSupportOpt = obuilder.withLongName("minSupport").withRequired(false).withArgument(
+ abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create()).withDescription(
+ "(Optional) Minimum Support. Default Value: " + CollocReducer.DEFAULT_MIN_SUPPORT).withShortName("s")
+ .create();
- Option numReduceTasksOpt = obuilder.withLongName("numReducers")
- .withRequired(false).withArgument(
- abuilder.withName("numReducers").withMinimum(1).withMaximum(1)
- .create()).withDescription(
- "(Optional) Number of reduce tasks. Default Value: "
- + DEFAULT_PASS1_NUM_REDUCE_TASKS).withShortName("nr").create();
+ Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false).withArgument(
+ abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create()).withDescription(
+ "(Optional)The minimum Log Likelihood Ratio(Float) Default is " + LLRReducer.DEFAULT_MIN_LLR)
+ .withShortName("ml").create();
+
+ Option numReduceTasksOpt = obuilder.withLongName("numReducers").withRequired(false).withArgument(
+ abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create()).withDescription(
+ "(Optional) Number of reduce tasks. Default Value: " + CollocDriver.DEFAULT_PASS1_NUM_REDUCE_TASKS)
+ .withShortName("nr").create();
- Option preprocessOpt = obuilder.withLongName("preprocess")
- .withRequired(false).withDescription(
+ Option preprocessOpt = obuilder.withLongName("preprocess").withRequired(false).withDescription(
"If set, input is SequenceFile<Text,Text> where the value is the document, "
- + " which will be tokenized using the specified analyzer.")
- .withShortName("p").create();
+ + " which will be tokenized using the specified analyzer.").withShortName("p").create();
- Option unigramOpt = obuilder
- .withLongName("unigram")
- .withRequired(false)
- .withDescription(
- "If set, unigrams will be emitted in the final output alongside collocations")
- .withShortName("u").create();
+ Option unigramOpt = obuilder.withLongName("unigram").withRequired(false).withDescription(
+ "If set, unigrams will be emitted in the final output alongside collocations").withShortName("u")
+ .create();
+
+ Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false).withDescription(
+ "If set, overwrite the output directory").withShortName("w").create();
+
+ Option analyzerNameOpt = obuilder.withLongName("analyzerName").withArgument(
+ abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The class name of the analyzer").withShortName("a").create();
+
+ Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
+ .create();
- Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(
- false).withDescription("If set, overwrite the output directory")
- .withShortName("w").create();
-
- Option analyzerNameOpt = obuilder.withLongName("analyzerName")
- .withArgument(
- abuilder.withName("analyzerName").withMinimum(1).withMaximum(1)
- .create()).withDescription("The class name of the analyzer")
- .withShortName("a").create();
-
- Option helpOpt = obuilder.withLongName("help").withDescription(
- "Print out help").withShortName("h").create();
-
- Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(
- outputOpt).withOption(maxNGramSizeOpt).withOption(overwriteOutput)
- .withOption(minSupportOpt).withOption(minLLROpt).withOption(
- numReduceTasksOpt).withOption(analyzerNameOpt).withOption(
- preprocessOpt).withOption(unigramOpt).withOption(helpOpt).create();
+ Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(
+ maxNGramSizeOpt).withOption(overwriteOutput).withOption(minSupportOpt).withOption(minLLROpt)
+ .withOption(numReduceTasksOpt).withOption(analyzerNameOpt).withOption(preprocessOpt).withOption(
+ unigramOpt).withOption(helpOpt).create();
try {
Parser parser = new Parser();
@@ -155,17 +134,16 @@
String input = cmdLine.getValue(inputOpt).toString();
String output = cmdLine.getValue(outputOpt).toString();
- int maxNGramSize = DEFAULT_MAX_NGRAM_SIZE;
+ int maxNGramSize = CollocDriver.DEFAULT_MAX_NGRAM_SIZE;
if (cmdLine.hasOption(maxNGramSizeOpt) == true) {
try {
- maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt)
- .toString());
+ maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
} catch (NumberFormatException ex) {
- log.warn("Could not parse ngram size option");
+ CollocDriver.log.warn("Could not parse ngram size option");
}
}
- log.info("Maximum n-gram size is: {}", maxNGramSize);
+ CollocDriver.log.info("Maximum n-gram size is: {}", maxNGramSize);
if (cmdLine.hasOption(overwriteOutput) == true) {
HadoopUtil.overwriteOutput(output);
@@ -174,28 +152,26 @@
int minSupport = CollocReducer.DEFAULT_MIN_SUPPORT;
;
if (cmdLine.hasOption(minSupportOpt)) {
- minSupport = Integer.parseInt(cmdLine.getValue(minSupportOpt)
- .toString());
+ minSupport = Integer.parseInt(cmdLine.getValue(minSupportOpt).toString());
}
- log.info("Minimum Support value: {}", minSupport);
+ CollocDriver.log.info("Minimum Support value: {}", minSupport);
float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
if (cmdLine.hasOption(minLLROpt)) {
minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
}
- log.info("Minimum LLR value: {}", minLLRValue);
+ CollocDriver.log.info("Minimum LLR value: {}", minLLRValue);
- int reduceTasks = DEFAULT_PASS1_NUM_REDUCE_TASKS;
+ int reduceTasks = CollocDriver.DEFAULT_PASS1_NUM_REDUCE_TASKS;
if (cmdLine.hasOption(numReduceTasksOpt)) {
- reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt)
- .toString());
+ reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
}
- log.info("Number of pass1 reduce tasks: {}", reduceTasks);
+ CollocDriver.log.info("Number of pass1 reduce tasks: {}", reduceTasks);
boolean emitUnigrams = cmdLine.hasOption(unigramOpt);
if (cmdLine.hasOption(preprocessOpt)) {
- log.info("Input will be preprocessed");
+ CollocDriver.log.info("Input will be preprocessed");
Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class;
if (cmdLine.hasOption(analyzerNameOpt)) {
@@ -206,26 +182,23 @@
analyzerClass.newInstance();
}
- String tokenizedPath =
- output + DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER;
+ String tokenizedPath = output + DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER;
- DocumentProcessor
- .tokenizeDocuments(input, analyzerClass, tokenizedPath);
+ DocumentProcessor.tokenizeDocuments(input, analyzerClass, tokenizedPath);
input = tokenizedPath;
} else {
- log.info("Input will NOT be preprocessed");
+ CollocDriver.log.info("Input will NOT be preprocessed");
}
// parse input and extract collocations
- long ngramCount = generateCollocations(input, output, emitUnigrams,
- maxNGramSize, reduceTasks, minSupport);
+ long ngramCount = CollocDriver.generateCollocations(input, output, emitUnigrams, maxNGramSize,
+ reduceTasks, minSupport);
// tally collocations and perform LLR calculation
- computeNGramsPruneByLLR(ngramCount, output, emitUnigrams, minLLRValue,
- reduceTasks);
+ CollocDriver.computeNGramsPruneByLLR(ngramCount, output, emitUnigrams, minLLRValue, reduceTasks);
} catch (OptionException e) {
- log.error("Exception", e);
+ CollocDriver.log.error("Exception", e);
CommandLineUtil.printHelp(group);
}
@@ -255,11 +228,11 @@
float minLLRValue,
int reduceTasks) throws IOException {
// parse input and extract collocations
- long ngramCount = generateCollocations(input, output, true, maxNGramSize,
- reduceTasks, minSupport);
+ long ngramCount = CollocDriver.generateCollocations(input, output, true, maxNGramSize, reduceTasks,
+ minSupport);
// tally collocations and perform LLR calculation
- computeNGramsPruneByLLR(ngramCount, output, true, minLLRValue, reduceTasks);
+ CollocDriver.computeNGramsPruneByLLR(ngramCount, output, true, minLLRValue, reduceTasks);
}
/**
@@ -284,7 +257,7 @@
conf.setBoolean(CollocDriver.EMIT_UNIGRAMS, emitUnigrams);
FileInputFormat.setInputPaths(conf, new Path(input));
- Path outPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
+ Path outPath = new Path(output, CollocDriver.SUBGRAM_OUTPUT_DIRECTORY);
FileOutputFormat.setOutputPath(conf, outPath);
conf.setInputFormat(SequenceFileInputFormat.class);
@@ -297,8 +270,7 @@
conf.setNumReduceTasks(reduceTasks);
RunningJob job = JobClient.runJob(conf);
- return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL)
- .getValue();
+ return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue();
}
/**
@@ -320,8 +292,8 @@
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(DoubleWritable.class);
- FileInputFormat.setInputPaths(conf, new Path(output, SUBGRAM_OUTPUT_DIRECTORY));
- Path outPath = new Path(output, NGRAM_OUTPUT_DIRECTORY);
+ FileInputFormat.setInputPaths(conf, new Path(output, CollocDriver.SUBGRAM_OUTPUT_DIRECTORY));
+ Path outPath = new Path(output, CollocDriver.NGRAM_OUTPUT_DIRECTORY);
FileOutputFormat.setOutputPath(conf, outPath);
conf.setMapperClass(IdentityMapper.class);
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java Sat Feb 13 17:55:56 2010
@@ -41,16 +41,14 @@
import org.slf4j.LoggerFactory;
/**
- * Runs pass 1 of the Collocation discovery job on input of
- * SequeceFile<Text,Text>, where the key is a document id and the value is the
- * document contents. . Delegates to NGramCollector to perform tokenization,
+ * Runs pass 1 of the Collocation discovery job on input of SequenceFile<Text,Text>, where the key is a
+ * document id and the value is the document contents. Delegates to NGramCollector to perform tokenization,
* ngram-creation and output collection.
*
* @see org.apache.mahout.text.SequenceFilesFromDirectory
* @see org.apache.mahout.utils.nlp.collocations.llr.colloc.NGramCollector
*/
-public class CollocMapper extends MapReduceBase implements
- Mapper<Text,StringTuple,Gram,Gram> {
+public class CollocMapper extends MapReduceBase implements Mapper<Text,StringTuple,Gram,Gram> {
public static final String MAX_SHINGLE_SIZE = "maxShingleSize";
public static final int DEFAULT_MAX_SHINGLE_SIZE = 2;
@@ -68,41 +66,36 @@
public void configure(JobConf job) {
super.configure(job);
- this.maxShingleSize = job.getInt(CollocMapper.MAX_SHINGLE_SIZE,
- DEFAULT_MAX_SHINGLE_SIZE);
+ this.maxShingleSize = job.getInt(CollocMapper.MAX_SHINGLE_SIZE, CollocMapper.DEFAULT_MAX_SHINGLE_SIZE);
- this.emitUnigrams = job.getBoolean(CollocDriver.EMIT_UNIGRAMS,
- CollocDriver.DEFAULT_EMIT_UNIGRAMS);
+ this.emitUnigrams = job.getBoolean(CollocDriver.EMIT_UNIGRAMS, CollocDriver.DEFAULT_EMIT_UNIGRAMS);
- if (log.isInfoEnabled()) {
- log.info("Max Ngram size is {}", this.maxShingleSize);
- log.info("Emit Unitgrams is {}", emitUnigrams);
+ if (CollocMapper.log.isInfoEnabled()) {
+ CollocMapper.log.info("Max Ngram size is {}", this.maxShingleSize);
+ CollocMapper.log.info("Emit Unitgrams is {}", emitUnigrams);
}
}
/**
* Collocation finder: pass 1 map phase.
*
- * Receives a token stream which gets passed through the ShingleFilter. The
- * ShingleFilter delivers ngrams of the appropriate size which are then
- * decomposed into head and tail subgrams which are collected in the following
- * manner
+ * Receives a token stream which gets passed through the ShingleFilter. The ShingleFilter delivers ngrams of
+ * the appropriate size which are then decomposed into head and tail subgrams which are collected in the
+ * following manner
*
* k:h_subgram v:ngram k:t_subgram v:ngram
*
- * The 'h_' or 't_' prefix is used to specify whether the subgram in question
- * is the head or tail of the ngram. In this implementation the head of the
- * ngram is a (n-1)gram, and the tail is a (1)gram.
+ * The 'h_' or 't_' prefix is used to specify whether the subgram in question is the head or tail of the
+ * ngram. In this implementation the head of the ngram is a (n-1)gram, and the tail is a (1)gram.
*
- * For example, given 'click and clack' and an ngram length of 3: k:'h_click
- * and' v:'click and clack' k;'t_clack' v:'click and clack'
+ * For example, given 'click and clack' and an ngram length of 3: k:'h_click and' v:'click and clack'
+ * k:'t_clack' v:'click and clack'
*
- * Also counts the total number of ngrams encountered and adds it to the
- * counter CollocDriver.Count.NGRAM_TOTAL
+ * Also counts the total number of ngrams encountered and adds it to the counter
+ * CollocDriver.Count.NGRAM_TOTAL
*
* @param r
- * The reader to read input from -- used to create a tokenstream from
- * the analyzer
+ * The reader to read input from -- used to create a tokenstream from the analyzer
*
* @param collector
* The collector to send output to
@@ -111,28 +104,21 @@
* Used to deliver the final ngram-count.
*
* @throws IOException
- * if there's a problem with the ShingleFilter reading data or the
- * collector collecting output.
+ * if there's a problem with the ShingleFilter reading data or the collector collecting output.
*/
@Override
- public void map(Text key,
- StringTuple value,
- final OutputCollector<Gram,Gram> collector,
- Reporter reporter) throws IOException {
+ public void map(Text key, StringTuple value, final OutputCollector<Gram,Gram> collector, Reporter reporter) throws IOException {
- ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value
- .getEntries().iterator()), maxShingleSize);
+ ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()),
+ maxShingleSize);
int count = 0; // ngram count
- OpenObjectIntHashMap<String> ngrams = new OpenObjectIntHashMap<String>(
- value.getEntries().size() * (maxShingleSize - 1));
- OpenObjectIntHashMap<String> unigrams = new OpenObjectIntHashMap<String>(
- value.getEntries().size());
+ OpenObjectIntHashMap<String> ngrams = new OpenObjectIntHashMap<String>(value.getEntries().size()
+ * (maxShingleSize - 1));
+ OpenObjectIntHashMap<String> unigrams = new OpenObjectIntHashMap<String>(value.getEntries().size());
do {
- String term = ((TermAttribute) sf.getAttribute(TermAttribute.class))
- .term();
- String type = ((TypeAttribute) sf.getAttribute(TypeAttribute.class))
- .type();
+ String term = ((TermAttribute) sf.getAttribute(TermAttribute.class)).term();
+ String type = ((TypeAttribute) sf.getAttribute(TypeAttribute.class)).type();
if ("shingle".equals(type)) {
count++;
if (ngrams.containsKey(term) == false) {
@@ -158,10 +144,8 @@
int i = term.lastIndexOf(' ');
if (i != -1) { // bigram, trigram etc
try {
- collector.collect(new Gram(term.substring(0, i), frequency, HEAD),
- ngram);
- collector.collect(new Gram(term.substring(i + 1), frequency, TAIL),
- ngram);
+ collector.collect(new Gram(term.substring(0, i), frequency, HEAD), ngram);
+ collector.collect(new Gram(term.substring(i + 1), frequency, TAIL), ngram);
} catch (IOException e) {
throw new RuntimeException(e);
}
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java Sat Feb 13 17:55:56 2010
@@ -31,11 +31,9 @@
import org.slf4j.LoggerFactory;
/**
- * Reducer for Pass 1 of the collocation identification job. Generates counts
- * for ngrams and subgrams.
+ * Reducer for Pass 1 of the collocation identification job. Generates counts for ngrams and subgrams.
*/
-public class CollocReducer extends MapReduceBase implements
- Reducer<Gram,Gram,Gram,Gram> {
+public class CollocReducer extends MapReduceBase implements Reducer<Gram,Gram,Gram,Gram> {
public static final String MIN_SUPPORT = "minSupport";
public static final int DEFAULT_MIN_SUPPORT = 2;
@@ -53,14 +51,13 @@
public void configure(JobConf job) {
super.configure(job);
- this.minSupport = job.getInt(MIN_SUPPORT, DEFAULT_MIN_SUPPORT);
+ this.minSupport = job.getInt(CollocReducer.MIN_SUPPORT, CollocReducer.DEFAULT_MIN_SUPPORT);
- this.emitUnigrams =
- job.getBoolean(CollocDriver.EMIT_UNIGRAMS, CollocDriver.DEFAULT_EMIT_UNIGRAMS);
+ this.emitUnigrams = job.getBoolean(CollocDriver.EMIT_UNIGRAMS, CollocDriver.DEFAULT_EMIT_UNIGRAMS);
- if (log.isInfoEnabled()) {
- log.info("Min support is {}", minSupport);
- log.info("Emit Unitgrams is {}", emitUnigrams);
+ if (CollocReducer.log.isInfoEnabled()) {
+ CollocReducer.log.info("Min support is {}", minSupport);
+ CollocReducer.log.info("Emit Unitgrams is {}", emitUnigrams);
}
}
@@ -68,20 +65,16 @@
/**
* collocation finder: pass 1 reduce phase:
*
- * given input from the mapper,
- * k:h_subgram v:ngram
- * k:t_subgram v:ngram
+ * given input from the mapper, k:h_subgram v:ngram k:t_subgram v:ngram
*
* count ngrams and subgrams.
*
* output is:
*
- * k:ngram:ngramfreq v:h_subgram:h_subgramfreq
- * k:ngram:ngramfreq v:t_subgram:t_subgramfreq
+ * k:ngram:ngramfreq v:h_subgram:h_subgramfreq k:ngram:ngramfreq v:t_subgram:t_subgramfreq
*
- * Each ngram's frequency is essentially counted twice, frequency should be
- * the same for the head and tail. Fix this to count only for the head and
- * move the count into the value?
+ * Each ngram's frequency is essentially counted twice, frequency should be the same for the head and tail.
+ * Fix this to count only for the head and move the count into the value?
*/
@Override
public void reduce(Gram subgramKey,
@@ -115,8 +108,9 @@
reporter.incrCounter(Skipped.LESS_THAN_MIN_SUPPORT, 1);
continue;
}
- if(subgramKey.getType() == Type.UNIGRAM)
+ if (subgramKey.getType() == Type.UNIGRAM) {
ngram.setType(subgramKey.getType());
+ }
output.collect(ngram, subgramKey);
}
}
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/Gram.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/Gram.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/Gram.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/Gram.java Sat Feb 13 17:55:56 2010
@@ -27,11 +27,10 @@
import org.apache.hadoop.io.WritableComparable;
/**
- * Writable for holding data generated from the collocation discovery jobs.
- * Depending on the job configuration gram may be one or more words. In some
- * contexts this is used to hold a complete ngram, while in others it holds a
- * part of an existing ngram (subgram). Tracks the frequency of the gram and its
- * position in the ngram in which is was found.
+ * Writable for holding data generated from the collocation discovery jobs. Depending on the job configuration
+ * gram may be one or more words. In some contexts this is used to hold a complete ngram, while in others it
+ * holds a part of an existing ngram (subgram). Tracks the frequency of the gram and its position in the ngram
+ * in which it was found.
*/
public class Gram implements WritableComparable<Gram> {
@@ -71,8 +70,7 @@
* @param gram
* the gram string
* @param type
- * whether the gram is at the head of its text unit or tail or
- * unigram
+ * whether the gram is at the head of its text unit or tail or unigram
*/
public Gram(String ngram, Type type) {
this(ngram, 1, type);
@@ -97,8 +95,7 @@
* @param frequency
* the gram frequency
* @param type
- * whether the gram is at the head of its text unit or tail or
- * unigram
+ * whether the gram is at the head of its text unit or tail or unigram
*/
public Gram(String ngram, int frequency, Type type) {
this.gram = ngram;
@@ -115,8 +112,7 @@
/**
* @param part
- * whether the gram is at the head of its text unit or tail or
- * unigram
+ * whether the gram is at the head of its text unit or tail or unigram
*/
public void setType(Type type) {
this.type = type;
@@ -162,9 +158,13 @@
frequency = in.readInt();
int typeValue = in.readUnsignedByte();
- if (typeValue == 0) type = Type.TAIL;
- else if (typeValue == 1) type = Type.HEAD;
- else type = Type.UNIGRAM;
+ if (typeValue == 0) {
+ type = Type.TAIL;
+ } else if (typeValue == 1) {
+ type = Type.HEAD;
+ } else {
+ type = Type.UNIGRAM;
+ }
Text data = new Text();
data.readFields(in);
@@ -175,9 +175,13 @@
public void write(DataOutput out) throws IOException {
out.writeInt(frequency);
- if (type == Type.TAIL) out.writeByte(0);
- else if (type == Type.HEAD) out.writeByte(1);
- else out.writeByte(2);
+ if (type == Type.TAIL) {
+ out.writeByte(0);
+ } else if (type == Type.HEAD) {
+ out.writeByte(1);
+ } else {
+ out.writeByte(2);
+ }
Text data = new Text(gram);
data.write(out);
@@ -214,8 +218,8 @@
public int hashCode() {
final int prime = 31;
int result = 1;
- result = prime * result + ((gram == null) ? 0 : gram.hashCode());
- result = prime * result + ((type == null) ? 0 : type.hashCode());
+ result = prime * result + (gram == null ? 0 : gram.hashCode());
+ result = prime * result + (type == null ? 0 : type.hashCode());
return result;
}
@@ -224,23 +228,36 @@
*/
@Override
public boolean equals(Object obj) {
- if (this == obj) return true;
- if (obj == null) return false;
- if (getClass() != obj.getClass()) return false;
+ if (this == obj) {
+ return true;
+ }
+ if (obj == null) {
+ return false;
+ }
+ if (getClass() != obj.getClass()) {
+ return false;
+ }
Gram other = (Gram) obj;
if (gram == null) {
- if (other.gram != null) return false;
- } else if (!gram.equals(other.gram)) return false;
+ if (other.gram != null) {
+ return false;
+ }
+ } else if (!gram.equals(other.gram)) {
+ return false;
+ }
if (type == null) {
- if (other.type != null) return false;
- } else if (!type.equals(other.type)) return false;
+ if (other.type != null) {
+ return false;
+ }
+ } else if (!type.equals(other.type)) {
+ return false;
+ }
return true;
}
@Override
public String toString() {
- return "'" + gram + "'["
- + (type == Type.UNIGRAM ? "u" : (type == Type.HEAD ? "h" : "t")) + "]:"
+ return "'" + gram + "'[" + (type == Type.UNIGRAM ? "u" : type == Type.HEAD ? "h" : "t") + "]:"
+ frequency;
}
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducer.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducer.java Sat Feb 13 17:55:56 2010
@@ -33,11 +33,10 @@
import org.slf4j.LoggerFactory;
/**
- * Reducer for pass 2 of the collocation discovery job. Collects ngram and
- * sub-ngram frequencies and performs the Log-likelihood ratio calculation.
+ * Reducer for pass 2 of the collocation discovery job. Collects ngram and sub-ngram frequencies and performs
+ * the Log-likelihood ratio calculation.
*/
-public class LLRReducer extends MapReduceBase implements
- Reducer<Gram,Gram,Text,DoubleWritable> {
+public class LLRReducer extends MapReduceBase implements Reducer<Gram,Gram,Text,DoubleWritable> {
public static enum Skipped {
EXTRA_HEAD,
@@ -80,16 +79,15 @@
public void configure(JobConf job) {
super.configure(job);
- this.ngramTotal = job.getLong(NGRAM_TOTAL, -1);
- this.minLLRValue = job.getFloat(MIN_LLR, DEFAULT_MIN_LLR);
+ this.ngramTotal = job.getLong(LLRReducer.NGRAM_TOTAL, -1);
+ this.minLLRValue = job.getFloat(LLRReducer.MIN_LLR, LLRReducer.DEFAULT_MIN_LLR);
- this.emitUnigrams = job.getBoolean(CollocDriver.EMIT_UNIGRAMS,
- CollocDriver.DEFAULT_EMIT_UNIGRAMS);
+ this.emitUnigrams = job.getBoolean(CollocDriver.EMIT_UNIGRAMS, CollocDriver.DEFAULT_EMIT_UNIGRAMS);
- if (log.isInfoEnabled()) {
- log.info("NGram Total is {}", ngramTotal);
- log.info("Min LLR value is {}", minLLRValue);
- log.info("Emit Unitgrams is {}", emitUnigrams);
+ if (LLRReducer.log.isInfoEnabled()) {
+ LLRReducer.log.info("NGram Total is {}", ngramTotal);
+ LLRReducer.log.info("Min LLR value is {}", minLLRValue);
+ LLRReducer.log.info("Emit Unitgrams is {}", emitUnigrams);
}
if (ngramTotal == -1) {
@@ -98,17 +96,13 @@
}
/**
- * Perform LLR calculation, input is: k:ngram:ngramFreq
- * v:(h_|t_)subgram:subgramfreq N = ngram total
+ * Perform LLR calculation, input is: k:ngram:ngramFreq v:(h_|t_)subgram:subgramfreq N = ngram total
*
- * Each ngram will have 2 subgrams, a head and a tail, referred to as A and B
- * respectively below.
+ * Each ngram will have 2 subgrams, a head and a tail, referred to as A and B respectively below.
*
- * A+ B: number of times a+b appear together: ngramFreq A+!B: number of times
- * A appears without B: hSubgramFreq - ngramFreq !A+ B: number of times B
- * appears without A: tSubgramFreq - ngramFreq !A+!B: number of times neither
- * A or B appears (in that order): N - (subgramFreqA + subgramFreqB -
- * ngramFreq)
+ * A+ B: number of times a+b appear together: ngramFreq A+!B: number of times A appears without B:
+ * hSubgramFreq - ngramFreq !A+ B: number of times B appears without A: tSubgramFreq - ngramFreq !A+!B:
+ * number of times neither A nor B appears (in that order): N - (subgramFreqA + subgramFreqB - ngramFreq)
*/
@Override
public void reduce(Gram key,
@@ -132,10 +126,10 @@
while (values.hasNext()) {
Gram value = values.next();
- int pos = (value.getType() == Type.HEAD ? 0 : 1);
+ int pos = value.getType() == Type.HEAD ? 0 : 1;
if (gramFreq[pos] != -1) {
- log.warn("Extra {} for {}, skipping", value.getType(), ngram);
+ LLRReducer.log.warn("Extra {} for {}, skipping", value.getType(), ngram);
if (value.getType() == Type.HEAD) {
reporter.incrCounter(Skipped.EXTRA_HEAD, 1);
} else {
@@ -149,11 +143,11 @@
}
if (gramFreq[0] == -1) {
- log.warn("Missing head for {}, skipping.", ngram);
+ LLRReducer.log.warn("Missing head for {}, skipping.", ngram);
reporter.incrCounter(Skipped.MISSING_HEAD, 1);
return;
} else if (gramFreq[1] == -1) {
- log.warn("Missing tail for {}, skipping", ngram);
+ LLRReducer.log.warn("Missing tail for {}, skipping", ngram);
reporter.incrCounter(Skipped.MISSING_TAIL, 1);
return;
}
@@ -161,8 +155,7 @@
int k11 = ngram.getFrequency(); /* a&b */
int k12 = gramFreq[0] - ngram.getFrequency(); /* a&!b */
int k21 = gramFreq[1] - ngram.getFrequency(); /* !b&a */
- int k22 = (int) (ngramTotal - (gramFreq[0] + gramFreq[1] - ngram
- .getFrequency())); /* !a&!b */
+ int k22 = (int) (ngramTotal - (gramFreq[0] + gramFreq[1] - ngram.getFrequency())); /* !a&!b */
try {
double llr = ll.logLikelihoodRatio(k11, k12, k21, k22);
@@ -175,18 +168,16 @@
output.collect(t, dd);
} catch (IllegalArgumentException ex) {
reporter.incrCounter(Skipped.LLR_CALCULATION_ERROR, 1);
- log.error("Problem calculating LLR ratio: " + ex.getMessage());
- log.error("NGram: " + ngram);
- log.error("HEAD: " + gram[0] + ":" + gramFreq[0]);
- log.error("TAIL: " + gram[1] + ":" + gramFreq[1]);
- log.error("k11: " + k11 + " k12: " + k12 + " k21: " + k21 + " k22: "
- + k22);
+ LLRReducer.log.error("Problem calculating LLR ratio: " + ex.getMessage());
+ LLRReducer.log.error("NGram: " + ngram);
+ LLRReducer.log.error("HEAD: " + gram[0] + ":" + gramFreq[0]);
+ LLRReducer.log.error("TAIL: " + gram[1] + ":" + gramFreq[1]);
+ LLRReducer.log.error("k11: " + k11 + " k12: " + k12 + " k21: " + k21 + " k22: " + k22);
}
}
/**
- * provide interface so the input to the llr calculation can be captured for
- * validation in unit testing
+ * provide interface so the input to the llr calculation can be captured for validation in unit testing
*/
public static interface LLCallback {
public double logLikelihoodRatio(int k11, int k12, int k21, int k22);
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollector.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollector.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollector.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollector.java Sat Feb 13 17:55:56 2010
@@ -38,9 +38,8 @@
import org.slf4j.LoggerFactory;
/**
- * Performs tokenization, ngram generation + collection for the first pass of
- * the LLR collocation discovery job. Factors this code out of the mappers so
- * that different input formats can be supported.
+ * Performs tokenization, ngram generation + collection for the first pass of the LLR collocation discovery
+ * job. Factors this code out of the mappers so that different input formats can be supported.
*
* @see org.apache.mahout.utils.nlp.collocations.llr.colloc.CollocMapperTextFile
*/
@@ -53,12 +52,10 @@
NGRAM_TOTAL;
}
- private static final Logger log = LoggerFactory
- .getLogger(NGramCollector.class);
+ private static final Logger log = LoggerFactory.getLogger(NGramCollector.class);
/**
- * An analyzer to perform tokenization. A ShingleFilter will be wrapped around
- * its output to create ngrams
+ * An analyzer to perform tokenization. A ShingleFilter will be wrapped around its output to create ngrams
*/
private Analyzer a;
@@ -70,12 +67,10 @@
/**
* Configure the NGramCollector.
*
- * Reads NGramCollector.ANALYZER_CLASS and instantiates that class if it is
- * provided. Otherwise a lucene StandardAnalyzer will be used that is set to
- * be compatible to LUCENE_24.
+ * Reads NGramCollector.ANALYZER_CLASS and instantiates that class if it is provided. Otherwise a lucene
+ * StandardAnalyzer will be used that is set to be compatible to LUCENE_24.
*
- * Reads NGramCollector.MAX_SHINGLE_SIZE and uses this as the parameter to the
- * ShingleFilter.
+ * Reads NGramCollector.MAX_SHINGLE_SIZE and uses this as the parameter to the ShingleFilter.
*
* @param job
*/
@@ -104,33 +99,30 @@
this.maxShingleSize = job.getInt(NGramCollector.MAX_SHINGLE_SIZE, 2);
- if (log.isInfoEnabled()) {
- log.info("Analyzer is {}", this.a.toString());
- log.info("Max Ngram size is {}", this.maxShingleSize);
+ if (NGramCollector.log.isInfoEnabled()) {
+ NGramCollector.log.info("Analyzer is {}", this.a.toString());
+ NGramCollector.log.info("Max Ngram size is {}", this.maxShingleSize);
}
}
/**
- * Receives a document and uses a lucene analyzer to tokenize them. The
- * ShingleFilter delivers ngrams of the appropriate size which aren then
- * decomposed into head and tail subgrams which are collected in the following
- * manner
+ * Receives a document and uses a lucene analyzer to tokenize it. The ShingleFilter delivers ngrams of the
+ * appropriate size which are then decomposed into head and tail subgrams which are collected in the
+ * following manner
*
* k:h_subgram v:ngram k:t_subgram v:ngram
*
- * The 'h_' or 't_' prefix is used to specify whether the subgram in question
- * is the head or tail of the ngram. In this implementation the head of the
- * ngram is a (n-1)gram, and the tail is a (1)gram.
+ * The 'h_' or 't_' prefix is used to specify whether the subgram in question is the head or tail of the
+ * ngram. In this implementation the head of the ngram is a (n-1)gram, and the tail is a (1)gram.
*
- * For example, given 'click and clack' and an ngram length of 3: k:'h_click
- * and' v:'clack and clack' k;'t_clack' v:'click and clack'
+ * For example, given 'click and clack' and an ngram length of 3: k:'h_click and' v:'click and clack'
+ * k:'t_clack' v:'click and clack'
*
- * Also counts the total number of ngrams encountered and adds it to the
- * counter CollocDriver.Count.NGRAM_TOTAL
+ * Also counts the total number of ngrams encountered and adds it to the counter
+ * CollocDriver.Count.NGRAM_TOTAL
*
* @param r
- * The reader to read input from -- used to create a tokenstream from
- * the analyzer
+ * The reader to read input from -- used to create a tokenstream from the analyzer
*
* @param collector
* The collector to send output to
@@ -139,12 +131,9 @@
* Used to deliver the final ngram-count.
*
* @throws IOException
- * if there's a problem with the ShingleFilter reading data or the
- * collector collecting output.
+ * if there's a problem with the ShingleFilter reading data or the collector collecting output.
*/
- public void collectNgrams(Reader r,
- OutputCollector<Gram,Gram> collector,
- Reporter reporter) throws IOException {
+ public void collectNgrams(Reader r, OutputCollector<Gram,Gram> collector, Reporter reporter) throws IOException {
TokenStream st = a.tokenStream("text", r);
ShingleFilter sf = new ShingleFilter(st, maxShingleSize);
@@ -152,10 +141,8 @@
int count = 0; // ngram count
do {
- String term = ((TermAttribute) sf.getAttribute(TermAttribute.class))
- .term();
- String type = ((TypeAttribute) sf.getAttribute(TypeAttribute.class))
- .type();
+ String term = ((TermAttribute) sf.getAttribute(TermAttribute.class)).term();
+ String type = ((TypeAttribute) sf.getAttribute(TypeAttribute.class)).type();
if ("shingle".equals(type)) {
count++;