You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/10/17 11:08:10 UTC
svn commit: r1185047 - /incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java
Author: joern
Date: Mon Oct 17 09:08:10 2011
New Revision: 1185047
URL: http://svn.apache.org/viewvc?rev=1185047&view=rev
Log:
OPENNLP-327 Added option to only use all-letter tokens.
Modified:
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java?rev=1185047&r1=1185046&r2=1185047&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java Mon Oct 17 09:08:10 2011
@@ -21,17 +21,37 @@ package opennlp.tools.doccat;
import java.util.ArrayList;
import java.util.Collection;
+import opennlp.tools.util.featuregen.StringPattern;
+
/**
* Generates a feature for each word in a document.
*/
public class BagOfWordsFeatureGenerator implements FeatureGenerator {
+ private boolean useOnlyAllLetterTokens = false;
+
+ public BagOfWordsFeatureGenerator() {
+ }
+
+ BagOfWordsFeatureGenerator(boolean useOnlyAllLetterTokens) {
+ this.useOnlyAllLetterTokens = useOnlyAllLetterTokens;
+ }
+
public Collection<String> extractFeatures(String[] text) {
Collection<String> bagOfWords = new ArrayList<String>(text.length);
for (int i = 0; i < text.length; i++) {
- bagOfWords.add("bow=" + text[i]);
+
+ if (useOnlyAllLetterTokens) {
+ StringPattern pattern = StringPattern.recognize(text[i]);
+
+ if (pattern.isAllLetter())
+ bagOfWords.add("bow=" + text[i]);
+ }
+ else {
+ bagOfWords.add("bow=" + text[i]);
+ }
}
return bagOfWords;