You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/04/20 10:41:16 UTC

[29/50] [abbrv] opennlp git commit: OPENNLP-994: Remove deprecated methods from the Document Categorizer, this closes apache/opennlp#133

OPENNLP-994: Remove deprecated methods from the Document Categorizer, this closes apache/opennlp#133


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/c6ecbf24
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/c6ecbf24
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/c6ecbf24

Branch: refs/heads/parser_regression
Commit: c6ecbf243c7ab63f83d7c2267e052552bc6672f9
Parents: ede6901
Author: smarthi <sm...@apache.org>
Authored: Mon Feb 27 17:23:40 2017 -0500
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Thu Apr 20 12:40:22 2017 +0200

----------------------------------------------------------------------
 .../doccat/DoccatCrossValidatorTool.java        |   7 +-
 .../tools/cmdline/doccat/DoccatTool.java        |  11 +-
 .../tools/cmdline/doccat/DoccatTrainerTool.java |   5 +-
 .../opennlp/tools/doccat/DoccatFactory.java     |  93 +----------------
 .../tools/doccat/DocumentCategorizer.java       |  54 ++--------
 .../doccat/DocumentCategorizerEvaluator.java    |   2 +-
 .../tools/doccat/DocumentCategorizerME.java     | 101 ++-----------------
 .../opennlp/tools/doccat/DocumentSample.java    |   6 --
 .../formats/LeipzigDoccatSampleStream.java      |  19 ++--
 .../tools/doccat/DocumentCategorizerMETest.java |  18 ++--
 .../tools/doccat/DocumentCategorizerNBTest.java |  17 ++--
 .../tools/doccat/DocumentSampleTest.java        |   4 +-
 .../doccat/AbstractDocumentCategorizer.java     |  29 +++---
 .../java/opennlp/uima/util/AnnotatorUtil.java   |   6 +-
 14 files changed, 66 insertions(+), 306 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/c6ecbf24/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java
index f0f1712..a73aba7 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java
@@ -36,7 +36,6 @@ import opennlp.tools.doccat.DoccatEvaluationMonitor;
 import opennlp.tools.doccat.DoccatFactory;
 import opennlp.tools.doccat.DocumentSample;
 import opennlp.tools.doccat.FeatureGenerator;
-import opennlp.tools.tokenize.Tokenizer;
 import opennlp.tools.util.eval.EvaluationMonitor;
 import opennlp.tools.util.model.ModelUtil;
 
@@ -84,16 +83,12 @@ public final class DoccatCrossValidatorTool extends
     FeatureGenerator[] featureGenerators = DoccatTrainerTool
         .createFeatureGenerators(params.getFeatureGenerators());
 
-    Tokenizer tokenizer = DoccatTrainerTool.createTokenizer(params
-        .getTokenizer());
-
     DoccatEvaluationMonitor[] listenersArr = listeners
         .toArray(new DoccatEvaluationMonitor[listeners.size()]);
 
     DoccatCrossValidator validator;
     try {
-      DoccatFactory factory = DoccatFactory.create(params.getFactory(),
-          tokenizer, featureGenerators);
+      DoccatFactory factory = DoccatFactory.create(params.getFactory(), featureGenerators);
       validator = new DoccatCrossValidator(params.getLang(), mlParams,
           factory, listenersArr);
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/c6ecbf24/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTool.java
index a01d354..49a640c 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTool.java
@@ -28,6 +28,7 @@ import opennlp.tools.cmdline.SystemInputStreamFactory;
 import opennlp.tools.doccat.DoccatModel;
 import opennlp.tools.doccat.DocumentCategorizerME;
 import opennlp.tools.doccat.DocumentSample;
+import opennlp.tools.tokenize.WhitespaceTokenizer;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.ParagraphStream;
 import opennlp.tools.util.PlainTextByLineStream;
@@ -36,7 +37,7 @@ public class DoccatTool extends BasicCmdLineTool {
 
   @Override
   public String getShortDescription() {
-    return "learnable document categorizer";
+    return "learned document categorizer";
   }
 
   @Override
@@ -53,7 +54,7 @@ public class DoccatTool extends BasicCmdLineTool {
 
       DoccatModel model = new DoccatModelLoader().load(new File(args[0]));
 
-      DocumentCategorizerME doccat = new DocumentCategorizerME(model);
+      DocumentCategorizerME documentCategorizerME = new DocumentCategorizerME(model);
 
       /*
        * moved initialization to the try block to catch new IOException
@@ -68,10 +69,10 @@ public class DoccatTool extends BasicCmdLineTool {
             new SystemInputStreamFactory(), SystemInputStreamFactory.encoding()));
         String document;
         while ((document = documentStream.read()) != null) {
-          String[] tokens = model.getFactory().getTokenizer().tokenize(document);
+          String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(document);
 
-          double[] prob = doccat.categorize(tokens);
-          String category = doccat.getBestCategory(prob);
+          double[] prob = documentCategorizerME.categorize(tokens);
+          String category = documentCategorizerME.getBestCategory(prob);
 
           DocumentSample sample = new DocumentSample(category, tokens);
           System.out.println(sample.toString());

http://git-wip-us.apache.org/repos/asf/opennlp/blob/c6ecbf24/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java
index 6ef5d88..8ebb5a8 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java
@@ -66,12 +66,9 @@ public class DoccatTrainerTool
     FeatureGenerator[] featureGenerators = createFeatureGenerators(params
         .getFeatureGenerators());
 
-    Tokenizer tokenizer = createTokenizer(params.getTokenizer());
-
     DoccatModel model;
     try {
-      DoccatFactory factory = DoccatFactory.create(params.getFactory(),
-          tokenizer, featureGenerators);
+      DoccatFactory factory = DoccatFactory.create(params.getFactory(), featureGenerators);
       model = DocumentCategorizerME.train(params.getLang(), sampleStream,
           mlParams, factory);
     } catch (IOException e) {

http://git-wip-us.apache.org/repos/asf/opennlp/blob/c6ecbf24/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java
index a6c815b..babab7c 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java
@@ -22,8 +22,6 @@ import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 
-import opennlp.tools.tokenize.Tokenizer;
-import opennlp.tools.tokenize.WhitespaceTokenizer;
 import opennlp.tools.util.BaseToolFactory;
 import opennlp.tools.util.InvalidFormatException;
 import opennlp.tools.util.ext.ExtensionLoader;
@@ -34,47 +32,17 @@ import opennlp.tools.util.ext.ExtensionLoader;
 public class DoccatFactory extends BaseToolFactory {
 
   private static final String FEATURE_GENERATORS = "doccat.featureGenerators";
-  private static final String TOKENIZER_NAME = "doccat.tokenizer";
 
   private FeatureGenerator[] featureGenerators;
-  private Tokenizer tokenizer;
 
   /**
    * Creates a {@link DoccatFactory} that provides the default implementation of
    * the resources.
    */
-  public DoccatFactory() {
-    this.tokenizer = WhitespaceTokenizer.INSTANCE;
-  }
+  public DoccatFactory() {}
 
   public DoccatFactory(final FeatureGenerator[] featureGenerators) {
-    this.tokenizer = WhitespaceTokenizer.INSTANCE;
-    this.featureGenerators = featureGenerators;
-  }
-
-  /**
-   * Creates a {@link DoccatFactory}. Use this constructor to programmatically
-   * create a factory.
-   *
-   * @deprecated will be removed after 1.7.1 release. Don't use it.
-   * @param tokenizer         the tokenizer
-   * @param featureGenerators the feature generators
-   */
-  @Deprecated
-  public DoccatFactory(Tokenizer tokenizer, FeatureGenerator[] featureGenerators) {
-    this.init(tokenizer, featureGenerators);
-  }
-
-  /**
-   * @deprecated will be removed after 1.7.1 release. Don't use it.
-   * @param tokenizer the tokenizer
-   * @param featureGenerators feature generators
-   */
-  @Deprecated
-  protected void init(Tokenizer tokenizer, FeatureGenerator[] featureGenerators) {
-
     this.featureGenerators = featureGenerators;
-    this.tokenizer = tokenizer;
   }
 
   protected void init(FeatureGenerator[] featureGenerators) {
@@ -85,11 +53,6 @@ public class DoccatFactory extends BaseToolFactory {
   public Map<String, String> createManifestEntries() {
     Map<String, String> manifestEntries = super.createManifestEntries();
 
-    if (getTokenizer() != null) {
-      manifestEntries.put(TOKENIZER_NAME, getTokenizer().getClass()
-          .getCanonicalName());
-    }
-
     if (getFeatureGenerators() != null) {
       manifestEntries.put(FEATURE_GENERATORS, featureGeneratorsAsString());
     }
@@ -115,31 +78,6 @@ public class DoccatFactory extends BaseToolFactory {
     // nothing to validate
   }
 
-  /**
-   * @deprecated will be removed after 1.7.1 release. Don't use it.
-   */
-  @Deprecated
-  public static DoccatFactory create(String subclassName, Tokenizer tokenizer,
-      FeatureGenerator[] featureGenerators) throws InvalidFormatException {
-    if (subclassName == null) {
-      // will create the default factory
-      return new DoccatFactory(tokenizer, featureGenerators);
-    }
-    try {
-      DoccatFactory theFactory = ExtensionLoader.instantiateExtension(
-          DoccatFactory.class, subclassName);
-      theFactory.init(tokenizer, featureGenerators);
-      return theFactory;
-    } catch (Exception e) {
-      String msg = "Could not instantiate the " + subclassName
-          + ". The initialization throw an exception.";
-      System.err.println(msg);
-      e.printStackTrace();
-      throw new InvalidFormatException(msg, e);
-    }
-
-  }
-
   public static DoccatFactory create(String subclassName, FeatureGenerator[] featureGenerators)
       throws InvalidFormatException {
     if (subclassName == null) {
@@ -192,33 +130,4 @@ public class DoccatFactory extends BaseToolFactory {
     this.featureGenerators = featureGenerators;
   }
 
-  /**
-   * @deprecated will be removed after 1.7.1 release. Don't use it.
-   */
-  @Deprecated
-  public Tokenizer getTokenizer() {
-    if (this.tokenizer == null) {
-      if (artifactProvider != null) {
-        String className = artifactProvider.getManifestProperty(TOKENIZER_NAME);
-        if (className != null) {
-          this.tokenizer = ExtensionLoader.instantiateExtension(
-              Tokenizer.class, className);
-        }
-      }
-      if (this.tokenizer == null) { // could not load using artifact provider
-        this.tokenizer = WhitespaceTokenizer.INSTANCE;
-      }
-    }
-    return tokenizer;
-  }
-
-  /**
-   * @deprecated will be removed after 1.7.1 release. Don't use it.
-   * @param tokenizer tokenizer
-   */
-  @Deprecated
-  public void setTokenizer(Tokenizer tokenizer) {
-    this.tokenizer = tokenizer;
-  }
-
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/c6ecbf24/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java
index 88bf8f9..b180549 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java
@@ -27,23 +27,21 @@ import java.util.SortedMap;
 public interface DocumentCategorizer {
 
   /**
-   * Categorizes the given text, provided in separate tokens.
+   * Categorize the given text provided as tokens along with
+   * the provided extra information
    *
    * @param text the tokens of text to categorize
+   * @param extraInformation extra information
    * @return per category probabilities
    */
-  double[] categorize(String[] text);
+  double[] categorize(String[] text, Map<String, Object> extraInformation);
 
   /**
    * Categorizes the given text, provided in separate tokens.
-   *
-   * @param text             the tokens of text to categorize
-   * @param extraInformation optional extra information to pass for evaluation
+   * @param text the tokens of text to categorize
    * @return per category probabilities
-   * @deprecated will be removed after 1.7.1 release. Don't use it.
    */
-  @Deprecated
-  double[] categorize(String[] text, Map<String, Object> extraInformation);
+  double[] categorize(String[] text);
 
   /**
    * get the best category from previously generated outcome probabilities
@@ -77,25 +75,6 @@ public interface DocumentCategorizer {
   int getNumberOfCategories();
 
   /**
-   * categorize a piece of text
-   *
-   * @param documentText the text to categorize
-   * @return the probabilities of each category (sum up to 1)
-   * @deprecated will be removed after 1.7.1 release. Don't use it.
-   */
-  @Deprecated
-  double[] categorize(String documentText);
-
-  /**
-   * categorize a piece of text, providing extra metadata.
-   *
-   * @param documentText     the text to categorize
-   * @param extraInformation extra metadata
-   * @return the probabilities of each category (sum up to 1)
-   */
-  double[] categorize(String documentText, Map<String, Object> extraInformation);
-
-  /**
    * get the name of the category associated with the given probabilties
    *
    * @param results the probabilities of each category
@@ -108,16 +87,6 @@ public interface DocumentCategorizer {
    *
    * @param text the input text to classify
    * @return a map with the score as a key. The value is a Set of categories with the score.
-   * @deprecated will be removed after 1.7.1 release. Don't use it.
-   */
-  @Deprecated
-  Map<String, Double> scoreMap(String text);
-
-  /**
-   * Returns a map in which the key is the category name and the value is the score
-   *
-   * @param text the input text to classify
-   * @return a map with the score as a key. The value is a Set of categories with the score.
    */
   Map<String, Double> scoreMap(String[] text);
 
@@ -127,17 +96,6 @@ public interface DocumentCategorizer {
    *
    * @param text the input text to classify
    * @return a map with the score as a key. The value is a Set of categories with the score.
-   * @deprecated will be removed after 1.7.1 release. Don't use it.
-   */
-  @Deprecated
-  SortedMap<Double, Set<String>> sortedScoreMap(String text);
-
-  /**
-   * Get a map of the scores sorted in ascending aorder together with their associated categories.
-   * Many categories can have the same score, hence the Set as value
-   *
-   * @param text the input text to classify
-   * @return a map with the score as a key. The value is a Set of categories with the score.
    */
   SortedMap<Double, Set<String>> sortedScoreMap(String[] text);
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/c6ecbf24/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java
index 63e0768..c501280 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java
@@ -59,7 +59,7 @@ public class DocumentCategorizerEvaluator extends Evaluator<DocumentSample> {
 
     String[] document = sample.getText();
 
-    double[] probs = categorizer.categorize(document, sample.getExtraInformation());
+    double[] probs = categorizer.categorize(document);
 
     String cat = categorizer.getBestCategory(probs);
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/c6ecbf24/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
index e743b9d..9dc41d7 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
@@ -29,8 +29,6 @@ import java.util.TreeMap;
 import opennlp.tools.ml.EventTrainer;
 import opennlp.tools.ml.TrainerFactory;
 import opennlp.tools.ml.model.MaxentModel;
-import opennlp.tools.tokenize.SimpleTokenizer;
-import opennlp.tools.tokenize.Tokenizer;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.TrainingParameters;
 
@@ -48,22 +46,6 @@ public class DocumentCategorizerME implements DocumentCategorizer {
   private DocumentCategorizerContextGenerator mContextGenerator;
 
   /**
-   * Initializes the current instance with a doccat model and custom feature
-   * generation. The feature generation must be identical to the configuration
-   * at training time.
-   *
-   * @param model             the doccat model
-   * @param featureGenerators the feature generators
-   * @deprecated train a {@link DoccatModel} with a specific
-   * {@link DoccatFactory} to customize the {@link FeatureGenerator}s
-   */
-  @Deprecated
-  public DocumentCategorizerME(DoccatModel model, FeatureGenerator... featureGenerators) {
-    this.model = model;
-    this.mContextGenerator = new DocumentCategorizerContextGenerator(featureGenerators);
-  }
-
-  /**
    * Initializes the current instance with a doccat model. Default feature
    * generation is used.
    *
@@ -75,6 +57,13 @@ public class DocumentCategorizerME implements DocumentCategorizer {
         .getFactory().getFeatureGenerators());
   }
 
+  /**
+   * Categorize the given text provided as tokens along with
+   * the provided extra information
+   *
+   * @param text text tokens to categorize
+   * @param extraInformation additional information
+   */
   @Override
   public double[] categorize(String[] text, Map<String, Object> extraInformation) {
     return model.getMaxentModel().eval(
@@ -83,58 +72,15 @@ public class DocumentCategorizerME implements DocumentCategorizer {
 
   /**
    * Categorizes the given text.
+   *
    * @param text the text to categorize
    */
+  @Override
   public double[] categorize(String[] text) {
     return this.categorize(text, Collections.emptyMap());
   }
 
   /**
-   * Categorizes the given text. The Tokenizer is obtained from
-   * {@link DoccatFactory#getTokenizer()} and defaults to
-   * {@link SimpleTokenizer}.
-   * @deprecated will be removed after 1.7.1 release. Don't use it.
-   */
-  @Deprecated
-  @Override
-  public double[] categorize(String documentText,
-      Map<String, Object> extraInformation) {
-    Tokenizer tokenizer = model.getFactory().getTokenizer();
-    return categorize(tokenizer.tokenize(documentText), extraInformation);
-  }
-
-  /**
-   * Categorizes the given text. The text is tokenized with the SimpleTokenizer
-   * before it is passed to the feature generation.
-   * @deprecated will be removed after 1.7.1 release. Don't use it.
-   */
-  @Deprecated
-  public double[] categorize(String documentText) {
-    Tokenizer tokenizer = model.getFactory().getTokenizer();
-    return categorize(tokenizer.tokenize(documentText), Collections.emptyMap());
-  }
-
-  /**
-   * Returns a map in which the key is the category name and the value is the score
-   *
-   * @param text the input text to classify
-   * @return the score map
-   * @deprecated will be removed after 1.7.1 release. Don't use it.
-   */
-  @Deprecated
-  public Map<String, Double> scoreMap(String text) {
-    Map<String, Double> probDist = new HashMap<>();
-
-    double[] categorize = categorize(text);
-    int catSize = getNumberOfCategories();
-    for (int i = 0; i < catSize; i++) {
-      String category = getCategory(i);
-      probDist.put(category, categorize[getIndex(category)]);
-    }
-    return probDist;
-  }
-
-  /**
    * Returns a map in which the key is the category name and the value is the score
    *
    * @param text the input text to classify
@@ -160,35 +106,6 @@ public class DocumentCategorizerME implements DocumentCategorizer {
    *
    * @param text the input text to classify
    * @return the sorted score map
-   * @deprecated will be removed after 1.7.1 release. Don't use it.
-   */
-  @Deprecated
-  @Override
-  public SortedMap<Double, Set<String>> sortedScoreMap(String text) {
-    SortedMap<Double, Set<String>> descendingMap = new TreeMap<>();
-    double[] categorize = categorize(text);
-    int catSize = getNumberOfCategories();
-    for (int i = 0; i < catSize; i++) {
-      String category = getCategory(i);
-      double score = categorize[getIndex(category)];
-      if (descendingMap.containsKey(score)) {
-        descendingMap.get(score).add(category);
-      } else {
-        Set<String> newset = new HashSet<>();
-        newset.add(category);
-        descendingMap.put(score, newset);
-      }
-    }
-    return descendingMap;
-  }
-
-  /**
-   * Returns a map with the score as a key in ascending order.
-   * The value is a Set of categories with the score.
-   * Many categories can have the same score, hence the Set as value
-   *
-   * @param text the input text to classify
-   * @return the sorted score map
    */
   @Override
   public SortedMap<Double, Set<String>> sortedScoreMap(String[] text) {

http://git-wip-us.apache.org/repos/asf/opennlp/blob/c6ecbf24/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java
index 3d107fa..adddc27 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java
@@ -24,8 +24,6 @@ import java.util.List;
 import java.util.Map;
 import java.util.Objects;
 
-import opennlp.tools.tokenize.WhitespaceTokenizer;
-
 /**
  * Class which holds a classified document and its category.
  */
@@ -35,10 +33,6 @@ public class DocumentSample {
   private final List<String> text;
   private final Map<String, Object> extraInformation;
 
-  public DocumentSample(String category, String text) {
-    this(category, WhitespaceTokenizer.INSTANCE.tokenize(text));
-  }
-
   public DocumentSample(String category, String[] text) {
     this(category, text, null);
   }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/c6ecbf24/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
index 1ca0484..8ed0036 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
@@ -20,6 +20,9 @@ package opennlp.tools.formats;
 import java.io.IOException;
 import java.io.PrintStream;
 import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
 
 import opennlp.tools.doccat.DocumentSample;
 import opennlp.tools.tokenize.SimpleTokenizer;
@@ -36,7 +39,7 @@ import opennlp.tools.util.PlainTextByLineStream;
  * <p>
  * The input text is tokenized with the {@link SimpleTokenizer}. The input text classified
  * by the language model must also be tokenized by the {@link SimpleTokenizer} to produce
- * exactly the same tokenization during testing and training.
+ * exactly the same tokenization during testing and training.
  */
 public class LeipzigDoccatSampleStream extends
     FilterObjectStream<String, DocumentSample> {
@@ -79,10 +82,8 @@ public class LeipzigDoccatSampleStream extends
   }
 
   public DocumentSample read() throws IOException {
-
     int count = 0;
-
-    StringBuilder sampleText = new StringBuilder();
+    List<String> tokensList = new ArrayList<>();
 
     String line;
     while (count < sentencesPerDocument && (line = samples.read()) != null) {
@@ -94,17 +95,13 @@ public class LeipzigDoccatSampleStream extends
       }
 
       // Always skip first token, that is the sentence number!
-      for (int i = 1; i < tokens.length; i++) {
-        sampleText.append(tokens[i]);
-        sampleText.append(' ');
-      }
+      tokensList.addAll(Arrays.asList(tokens).subList(1, tokens.length));
 
       count++;
     }
 
-
-    if (sampleText.length() > 0) {
-      return new DocumentSample(language, sampleText.toString());
+    if (tokensList.size() > 0) {
+      return new DocumentSample(language, tokensList.toArray(new String[tokensList.size()]));
     }
 
     return null;

http://git-wip-us.apache.org/repos/asf/opennlp/blob/c6ecbf24/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java
index 6389530..220df87 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java
@@ -42,27 +42,23 @@ public class DocumentCategorizerMETest {
         new DocumentSample("0", new String[]{"x", "y", "z", "7", "8"}));
 
     TrainingParameters params = new TrainingParameters();
-    params.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(100));
-    params.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(0));
+    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    params.put(TrainingParameters.CUTOFF_PARAM, "0");
 
     DoccatModel model = DocumentCategorizerME.train("x-unspecified", samples,
             params, new DoccatFactory());
 
     DocumentCategorizer doccat = new DocumentCategorizerME(model);
 
-    double[] aProbs = doccat.categorize("a");
+    double[] aProbs = doccat.categorize(new String[]{"a"});
     Assert.assertEquals("1", doccat.getBestCategory(aProbs));
 
-    double[] bProbs = doccat.categorize("x");
+    double[] bProbs = doccat.categorize(new String[]{"x"});
     Assert.assertEquals("0", doccat.getBestCategory(bProbs));
 
     //test to make sure sorted map's last key is cat 1 because it has the highest score.
-    SortedMap<Double, Set<String>> sortedScoreMap = doccat.sortedScoreMap("a");
-    for (String cat : sortedScoreMap.get(sortedScoreMap.lastKey())) {
-      Assert.assertEquals("1", cat);
-      break;
-    }
-    System.out.println("");
-
+    SortedMap<Double, Set<String>> sortedScoreMap = doccat.sortedScoreMap(new String[]{"a"});
+    Set<String> cat = sortedScoreMap.get(sortedScoreMap.lastKey());
+    Assert.assertEquals(1, cat.size());
   }
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/c6ecbf24/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerNBTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerNBTest.java b/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerNBTest.java
index de3f098..0847690 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerNBTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerNBTest.java
@@ -44,8 +44,8 @@ public class DocumentCategorizerNBTest {
         new DocumentSample("0", new String[]{"x", "y", "z", "7", "8"}));
 
     TrainingParameters params = new TrainingParameters();
-    params.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(100));
-    params.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(0));
+    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    params.put(TrainingParameters.CUTOFF_PARAM, "0");
     params.put(AbstractTrainer.ALGORITHM_PARAM, NaiveBayesTrainer.NAIVE_BAYES_VALUE);
 
     DoccatModel model = DocumentCategorizerME.train("x-unspecified", samples,
@@ -53,19 +53,16 @@ public class DocumentCategorizerNBTest {
 
     DocumentCategorizer doccat = new DocumentCategorizerME(model);
 
-    double[] aProbs = doccat.categorize("a");
+    double[] aProbs = doccat.categorize(new String[]{"a"});
     Assert.assertEquals("1", doccat.getBestCategory(aProbs));
 
-    double[] bProbs = doccat.categorize("x");
+    double[] bProbs = doccat.categorize(new String[]{"x"});
     Assert.assertEquals("0", doccat.getBestCategory(bProbs));
 
     //test to make sure sorted map's last key is cat 1 because it has the highest score.
-    SortedMap<Double, Set<String>> sortedScoreMap = doccat.sortedScoreMap("a");
-    for (String cat : sortedScoreMap.get(sortedScoreMap.lastKey())) {
-      Assert.assertEquals("1", cat);
-      break;
-    }
-    System.out.println("");
+    SortedMap<Double, Set<String>> sortedScoreMap = doccat.sortedScoreMap(new String[]{"a"});
+    Set<String> cat = sortedScoreMap.get(sortedScoreMap.lastKey());
+    Assert.assertEquals(1, cat.size());
 
   }
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/c6ecbf24/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentSampleTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentSampleTest.java b/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentSampleTest.java
index 232158b..8cf8fef 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentSampleTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentSampleTest.java
@@ -31,11 +31,11 @@ public class DocumentSampleTest {
   }
 
   public static DocumentSample createGoldSample() {
-    return new DocumentSample("aCategory", "a small text");
+    return new DocumentSample("aCategory", new String[]{"a", "small", "text"});
   }
 
   public static DocumentSample createPredSample() {
-    return new DocumentSample("anotherCategory", "a small text");
+    return new DocumentSample("anotherCategory", new String[]{"a", "small", "text"});
   }
 
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/c6ecbf24/opennlp-uima/src/main/java/opennlp/uima/doccat/AbstractDocumentCategorizer.java
----------------------------------------------------------------------
diff --git a/opennlp-uima/src/main/java/opennlp/uima/doccat/AbstractDocumentCategorizer.java b/opennlp-uima/src/main/java/opennlp/uima/doccat/AbstractDocumentCategorizer.java
index db9c075..4b49dca 100644
--- a/opennlp-uima/src/main/java/opennlp/uima/doccat/AbstractDocumentCategorizer.java
+++ b/opennlp-uima/src/main/java/opennlp/uima/doccat/AbstractDocumentCategorizer.java
@@ -17,12 +17,17 @@
 
 package opennlp.uima.doccat;
 
+import java.util.ArrayList;
+import java.util.List;
+
 import org.apache.uima.UimaContext;
 import org.apache.uima.analysis_component.CasAnnotator_ImplBase;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.FSIterator;
 import org.apache.uima.cas.Type;
 import org.apache.uima.cas.TypeSystem;
+import org.apache.uima.cas.text.AnnotationFS;
 import org.apache.uima.resource.ResourceAccessException;
 import org.apache.uima.resource.ResourceInitializationException;
 import org.apache.uima.util.Level;
@@ -72,29 +77,25 @@ abstract class AbstractDocumentCategorizer extends CasAnnotator_ImplBase {
     mCategorizer = new DocumentCategorizerME(model);
   }
 
-  public void typeSystemInit(TypeSystem typeSystem)
-      throws AnalysisEngineProcessException {
+  public void typeSystemInit(TypeSystem typeSystem) throws AnalysisEngineProcessException {
     mTokenType = AnnotatorUtil.getRequiredTypeParameter(context, typeSystem,
-        UimaUtil.SENTENCE_TYPE_PARAMETER);
+        UimaUtil.TOKEN_TYPE_PARAMETER);
   }
 
   protected abstract void setBestCategory(CAS cas, String bestCategory);
 
   public void process(CAS cas) {
 
-    double[] result;
-
-    if (mTokenType != null) {
-      // TODO:
-      // count tokens
-      // create token array
-      // pass array to doccat
-      // create result annotation
-      result = mCategorizer.categorize(cas.getDocumentText());
-    } else {
-      result = mCategorizer.categorize(cas.getDocumentText());
+    FSIterator<AnnotationFS> tokenAnnotations = cas.getAnnotationIndex(mTokenType).iterator();
+    List<String> tokensList = new ArrayList<>();
+
+    while (tokenAnnotations.hasNext()) {
+      tokensList.add(tokenAnnotations.next().getCoveredText());
     }
 
+    double[] result =
+        mCategorizer.categorize(tokensList.toArray(new String[tokensList.size()]));
+
     String bestCategory = mCategorizer.getBestCategory(result);
 
     setBestCategory(cas, bestCategory);

http://git-wip-us.apache.org/repos/asf/opennlp/blob/c6ecbf24/opennlp-uima/src/main/java/opennlp/uima/util/AnnotatorUtil.java
----------------------------------------------------------------------
diff --git a/opennlp-uima/src/main/java/opennlp/uima/util/AnnotatorUtil.java b/opennlp-uima/src/main/java/opennlp/uima/util/AnnotatorUtil.java
index 8847107..730d6be 100644
--- a/opennlp-uima/src/main/java/opennlp/uima/util/AnnotatorUtil.java
+++ b/opennlp-uima/src/main/java/opennlp/uima/util/AnnotatorUtil.java
@@ -329,8 +329,7 @@ public final class AnnotatorUtil {
     } else {
       throw new ResourceInitializationException(
           ExceptionMessages.MESSAGE_CATALOG,
-          ExceptionMessages.WRONG_PARAMETER_TYPE, new Object[] {parameter,
-          "String array"});
+          ExceptionMessages.WRONG_PARAMETER_TYPE, new Object[] {parameter, "String array"});
     }
   }
 
@@ -443,8 +442,7 @@ public final class AnnotatorUtil {
     if (inResource == null) {
       throw new ResourceInitializationException(
           ExceptionMessages.MESSAGE_CATALOG,
-          ExceptionMessages.IO_ERROR_MODEL_READING, new Object[] {name
-          + " could not be found!"});
+          ExceptionMessages.IO_ERROR_MODEL_READING, new Object[] {name + " could not be found!"});
     }
 
     return inResource;