You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2016/12/23 17:47:23 UTC

[6/7] opennlp git commit: OPENNLP-871: Cleanup for Java 8 and remove deprecated tokenizer code

OPENNLP-871: Cleanup for Java 8 and remove deprecated tokenizer code

Closes #17


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/c49a87ab
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/c49a87ab
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/c49a87ab

Branch: refs/heads/897
Commit: c49a87abdb6c07a123a2234f4ec5ca3d21306d69
Parents: 99323ad
Author: smarthi <sm...@apache.org>
Authored: Thu Dec 22 22:39:33 2016 -0500
Committer: Kottmann <jo...@apache.org>
Committed: Fri Dec 23 16:46:42 2016 +0100

----------------------------------------------------------------------
 .../tools/cmdline/chunker/ChunkerMETool.java    |  6 +-
 .../opennlp/tools/dictionary/Dictionary.java    |  7 +-
 .../tools/tokenize/TokenizerCrossValidator.java | 36 --------
 .../tools/tokenize/TokenizerFactory.java        | 22 ++---
 .../opennlp/tools/tokenize/TokenizerME.java     | 89 --------------------
 .../opennlp/tools/tokenize/TokenizerModel.java  | 53 ------------
 .../opennlp/tools/eval/Conll00ChunkerEval.java  |  3 +-
 7 files changed, 13 insertions(+), 203 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/c49a87ab/opennlp-tools/src/main/java/opennlp/tools/cmdline/chunker/ChunkerMETool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/chunker/ChunkerMETool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/chunker/ChunkerMETool.java
index 5b87c9e..b511a0b 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/chunker/ChunkerMETool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/chunker/ChunkerMETool.java
@@ -69,11 +69,9 @@ public class ChunkerMETool extends BasicCmdLineTool {
             continue;
           }
 
-          String[] chunks = chunker.chunk(posSample.getSentence(),
-                  posSample.getTags());
+          String[] chunks = chunker.chunk(posSample.getSentence(), posSample.getTags());
 
-          System.out.println(new ChunkSample(posSample.getSentence(),
-                  posSample.getTags(), chunks).nicePrint());
+          System.out.println(new ChunkSample(posSample.getSentence(), posSample.getTags(), chunks).nicePrint());
 
           perfMon.incrementCounter();
         }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/c49a87ab/opennlp-tools/src/main/java/opennlp/tools/dictionary/Dictionary.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/dictionary/Dictionary.java b/opennlp-tools/src/main/java/opennlp/tools/dictionary/Dictionary.java
index 58b7a6e..4961741 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/dictionary/Dictionary.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/dictionary/Dictionary.java
@@ -31,7 +31,6 @@ import java.util.StringTokenizer;
 import opennlp.tools.dictionary.serializer.Attributes;
 import opennlp.tools.dictionary.serializer.DictionarySerializer;
 import opennlp.tools.dictionary.serializer.Entry;
-import opennlp.tools.dictionary.serializer.EntryInserter;
 import opennlp.tools.util.StringList;
 import opennlp.tools.util.StringUtil;
 
@@ -113,11 +112,7 @@ public class Dictionary implements Iterable<StringList> {
    * @throws IOException
    */
   public Dictionary(InputStream in) throws IOException {
-    isCaseSensitive = DictionarySerializer.create(in, new EntryInserter() {
-      public void insert(Entry entry) {
-        put(entry.getTokens());
-      }
-    });
+    isCaseSensitive = DictionarySerializer.create(in, entry -> put(entry.getTokens()));
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/opennlp/blob/c49a87ab/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerCrossValidator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerCrossValidator.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerCrossValidator.java
index 3ca3c1d..fe9e4c6 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerCrossValidator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerCrossValidator.java
@@ -18,13 +18,10 @@
 package opennlp.tools.tokenize;
 
 import java.io.IOException;
-
-import opennlp.tools.dictionary.Dictionary;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.TrainingParameters;
 import opennlp.tools.util.eval.CrossValidationPartitioner;
 import opennlp.tools.util.eval.FMeasure;
-import opennlp.tools.util.model.ModelUtil;
 
 public class TokenizerCrossValidator {
 
@@ -42,39 +39,6 @@ public class TokenizerCrossValidator {
   }
 
   /**
-   * @deprecated use
-   *             {@link #TokenizerCrossValidator(TrainingParameters, TokenizerFactory, TokenizerEvaluationMonitor...)}
-   *             instead and pass in a {@link TokenizerFactory}
-   */
-  public TokenizerCrossValidator(String language, Dictionary abbreviations,
-      boolean alphaNumericOptimization, TrainingParameters params,
-      TokenizerEvaluationMonitor ... listeners) {
-    this(params, new TokenizerFactory(language, abbreviations,
-        alphaNumericOptimization, null), listeners);
-  }
-
-  /**
-   * @deprecated use
-   *             {@link #TokenizerCrossValidator(TrainingParameters, TokenizerFactory, TokenizerEvaluationMonitor...)}
-   *             instead and pass in a {@link TokenizerFactory}
-   */
-  public TokenizerCrossValidator(String language, boolean alphaNumericOptimization) {
-    this(language, alphaNumericOptimization, ModelUtil.createDefaultTrainingParameters());
-  }
-
-  /**
-   * @deprecated use
-   *             {@link #TokenizerCrossValidator(TrainingParameters, TokenizerFactory, TokenizerEvaluationMonitor...)}
-   *             instead and pass in a {@link TokenizerFactory}
-   */
-  public TokenizerCrossValidator(String language,
-      boolean alphaNumericOptimization, TrainingParameters params,
-      TokenizerEvaluationMonitor ... listeners) {
-    this(language, null, alphaNumericOptimization, params, listeners);
-  }
-
-
-  /**
    * Starts the evaluation.
    *
    * @param samples

http://git-wip-us.apache.org/repos/asf/opennlp/blob/c49a87ab/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java
index f9e789a..ffa793b 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java
@@ -37,7 +37,7 @@ public class TokenizerFactory extends BaseToolFactory {
 
   private String languageCode;
   private Dictionary abbreviationDictionary;
-  private Boolean useAlphaNumericOptimization;
+  private Boolean useAlphaNumericOptimization = false;
   private Pattern alphaNumericPattern;
 
   private static final String ABBREVIATIONS_ENTRY_NAME = "abbreviations.dictionary";
@@ -112,13 +112,11 @@ public class TokenizerFactory extends BaseToolFactory {
   public Map<String, String> createManifestEntries() {
     Map<String, String> manifestEntries = super.createManifestEntries();
 
-    manifestEntries.put(USE_ALPHA_NUMERIC_OPTIMIZATION,
-        Boolean.toString(isUseAlphaNumericOptmization()));
+    manifestEntries.put(USE_ALPHA_NUMERIC_OPTIMIZATION, Boolean.toString(isUseAlphaNumericOptmization()));
 
     // alphanumeric pattern is optional
     if (getAlphaNumericPattern() != null)
-      manifestEntries.put(ALPHA_NUMERIC_PATTERN, getAlphaNumericPattern()
-          .pattern());
+      manifestEntries.put(ALPHA_NUMERIC_PATTERN, getAlphaNumericPattern().pattern());
 
     return manifestEntries;
   }
@@ -167,9 +165,8 @@ public class TokenizerFactory extends BaseToolFactory {
    */
   public Pattern getAlphaNumericPattern() {
     if (this.alphaNumericPattern == null) {
-      if (artifactProvider != null) {
-        String prop = this.artifactProvider
-            .getManifestProperty(ALPHA_NUMERIC_PATTERN);
+      if (this.artifactProvider != null) {
+        String prop = this.artifactProvider.getManifestProperty(ALPHA_NUMERIC_PATTERN);
         if (prop != null) {
           this.alphaNumericPattern = Pattern.compile(prop);
         }
@@ -189,8 +186,8 @@ public class TokenizerFactory extends BaseToolFactory {
    * @return true if the alpha numeric optimization is enabled, otherwise false
    */
   public boolean isUseAlphaNumericOptmization() {
-    if (this.useAlphaNumericOptimization == null && artifactProvider != null) {
-      this.useAlphaNumericOptimization = Boolean.valueOf(artifactProvider
+    if (artifactProvider != null) {
+      this.useAlphaNumericOptimization = Boolean.valueOf(this.artifactProvider
           .getManifestProperty(USE_ALPHA_NUMERIC_OPTIMIZATION));
     }
     return this.useAlphaNumericOptimization;
@@ -203,8 +200,7 @@ public class TokenizerFactory extends BaseToolFactory {
    */
   public Dictionary getAbbreviationDictionary() {
     if (this.abbreviationDictionary == null && artifactProvider != null) {
-      this.abbreviationDictionary = artifactProvider
-          .getArtifact(ABBREVIATIONS_ENTRY_NAME);
+      this.abbreviationDictionary = this.artifactProvider.getArtifact(ABBREVIATIONS_ENTRY_NAME);
     }
     return this.abbreviationDictionary;
   }
@@ -215,7 +211,7 @@ public class TokenizerFactory extends BaseToolFactory {
    * @return the language code
    */
   public String getLanguageCode() {
-    if (this.languageCode == null && artifactProvider != null) {
+    if (this.languageCode == null && this.artifactProvider != null) {
       this.languageCode = this.artifactProvider.getLanguage();
     }
     return this.languageCode;

http://git-wip-us.apache.org/repos/asf/opennlp/blob/c49a87ab/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
index 4c4c638..491b6fa 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
@@ -34,7 +34,6 @@ import opennlp.tools.tokenize.lang.Factory;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.Span;
 import opennlp.tools.util.TrainingParameters;
-import opennlp.tools.util.model.ModelUtil;
 
 /**
  * A Tokenizer for converting raw text into separated tokens.  It uses
@@ -254,94 +253,6 @@ public class TokenizerME extends AbstractTokenizer {
   }
 
   /**
-   * Trains a model for the {@link TokenizerME}.
-   *
-   * @param languageCode the language of the natural text
-   * @param samples the samples used for the training.
-   * @param useAlphaNumericOptimization - if true alpha numerics are skipped
-   * @param mlParams the machine learning train parameters
-   *
-   * @return the trained {@link TokenizerModel}
-   *
-   * @throws IOException it throws an {@link IOException} if an {@link IOException}
-   * is thrown during IO operations on a temp file which is created during training.
-   * Or if reading from the {@link ObjectStream} fails.
-   *
-   * @deprecated Use
-   *    {@link #train(ObjectStream, TokenizerFactory, TrainingParameters)}
-   *    and pass in a {@link TokenizerFactory}
-   */
-  public static TokenizerModel train(String languageCode, ObjectStream<TokenSample> samples,
-      boolean useAlphaNumericOptimization, TrainingParameters mlParams) throws IOException {
-    return train(languageCode, samples, null, useAlphaNumericOptimization,
-        mlParams);
-  }
-
-  /**
-   * Trains a model for the {@link TokenizerME}.
-   *
-   * @param languageCode the language of the natural text
-   * @param samples the samples used for the training.
-   * @param abbreviations an abbreviations dictionary
-   * @param useAlphaNumericOptimization - if true alpha numerics are skipped
-   * @param mlParams the machine learning train parameters
-   *
-   * @return the trained {@link TokenizerModel}
-   *
-   * @throws IOException it throws an {@link IOException} if an {@link IOException}
-   * is thrown during IO operations on a temp file which is created during training.
-   * Or if reading from the {@link ObjectStream} fails.
-   *
-   * @deprecated Use
-   *    {@link #train(ObjectStream, TokenizerFactory, TrainingParameters)}
-   *    and pass in a {@link TokenizerFactory}
-   */
-  public static TokenizerModel train(String languageCode,
-      ObjectStream<TokenSample> samples, Dictionary abbreviations,
-      boolean useAlphaNumericOptimization, TrainingParameters mlParams)
-      throws IOException {
-    Factory factory = new Factory();
-
-    Map<String, String> manifestInfoEntries = new HashMap<>();
-
-    ObjectStream<Event> eventStream = new TokSpanEventStream(samples,
-        useAlphaNumericOptimization, factory.getAlphanumeric(languageCode),
-        factory.createTokenContextGenerator(languageCode,
-            getAbbreviations(abbreviations)));
-
-    EventTrainer trainer = TrainerFactory.getEventTrainer(
-            mlParams.getSettings(), manifestInfoEntries);
-
-    MaxentModel maxentModel = trainer.train(eventStream);
-
-    return new TokenizerModel(languageCode, maxentModel, abbreviations,
-        useAlphaNumericOptimization, manifestInfoEntries);
-  }
-
-
-  /**
-   * Trains a model for the {@link TokenizerME} with a default cutoff of 5 and 100 iterations.
-   *
-   * @param languageCode the language of the natural text
-   * @param samples the samples used for the training.
-   * @param useAlphaNumericOptimization - if true alpha numerics are skipped
-   *
-   * @return the trained {@link TokenizerModel}
-   *
-   * @throws IOException it throws an {@link IOException} if an {@link IOException}
-   * is thrown during IO operations on a temp file which is
-   *
-   * @deprecated Use
-   *    {@link #train(ObjectStream, TokenizerFactory, TrainingParameters)}
-   *    and pass in a {@link TokenizerFactory}
-   */
-  public static TokenizerModel train(String languageCode, ObjectStream<TokenSample> samples,
-      boolean useAlphaNumericOptimization) throws IOException {
-    return train(samples, TokenizerFactory.create(null, languageCode, null, useAlphaNumericOptimization, null),
-      ModelUtil.createDefaultTrainingParameters());
-  }
-
-  /**
    * Returns the value of the alpha-numeric optimization flag.
    *
    * @return true if the tokenizer should use alpha-numeric optimization, false otherwise.

http://git-wip-us.apache.org/repos/asf/opennlp/blob/c49a87ab/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java
index e63b946..ed84b4e 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java
@@ -66,59 +66,6 @@ public final class TokenizerModel extends BaseModel {
   /**
    * Initializes the current instance.
    *
-   * @param language the language the tokenizer should use
-   * @param tokenizerMaxentModel the statistical model of the tokenizer
-   * @param abbreviations the dictionary containing the abbreviations
-   * @param useAlphaNumericOptimization if true alpha numeric optimization is enabled, otherwise not
-   * @param manifestInfoEntries the additional meta data which should be written into manifest
-   *
-   * @deprecated Use
-   *             {@link TokenizerModel#TokenizerModel(MaxentModel, Map, TokenizerFactory)}
-   *             instead and pass in a {@link TokenizerFactory}.
-   */
-  public TokenizerModel(String language, MaxentModel tokenizerMaxentModel,
-      Dictionary abbreviations, boolean useAlphaNumericOptimization,
-      Map<String, String> manifestInfoEntries) {
-    this(tokenizerMaxentModel, manifestInfoEntries,
-        new TokenizerFactory(language, abbreviations, useAlphaNumericOptimization, null));
-  }
-
-  /**
-   * Initializes the current instance.
-   *
-   * @param language the language the tokenizer should use
-   * @param tokenizerMaxentModel the statistical model of the tokenizer
-   * @param useAlphaNumericOptimization if true alpha numeric optimization is enabled, otherwise not
-   * @param manifestInfoEntries the additional meta data which should be written into manifest
-   *
-   * @deprecated Use
-   *             {@link TokenizerModel#TokenizerModel(MaxentModel, Map, TokenizerFactory)}
-   *             instead and pass in a {@link TokenizerFactory}.
-   */
-  public TokenizerModel(String language, AbstractModel tokenizerMaxentModel,
-      boolean useAlphaNumericOptimization, Map<String, String> manifestInfoEntries) {
-    this(language, tokenizerMaxentModel, null, useAlphaNumericOptimization, manifestInfoEntries);
-  }
-
-  /**
-   * Initializes the current instance.
-   *
-   * @param language the language the tokenizer should use
-   * @param tokenizerMaxentModel the statistical model of the tokenizer
-   * @param useAlphaNumericOptimization if true alpha numeric optimization is enabled, otherwise not
-   *
-   * @deprecated Use
-   *             {@link TokenizerModel#TokenizerModel(MaxentModel, Map, TokenizerFactory)}
-   *             instead and pass in a {@link TokenizerFactory}.
-   */
-  public TokenizerModel(String language, AbstractModel tokenizerMaxentModel,
-      boolean useAlphaNumericOptimization) {
-    this(language, tokenizerMaxentModel, useAlphaNumericOptimization, null);
-  }
-
-  /**
-   * Initializes the current instance.
-   *
    * @param in the Input Stream to load the model from
    *
    * @throws IOException if reading from the stream fails in anyway

http://git-wip-us.apache.org/repos/asf/opennlp/blob/c49a87ab/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java
index 3b8e060..0ff3002 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java
@@ -59,8 +59,7 @@ public class Conll00ChunkerEval {
       double expectedFMeasure) throws IOException {
 
     ObjectStream<ChunkSample> samples = new ChunkSampleStream(
-        new PlainTextByLineStream(new MarkableFileInputStreamFactory(testData),
-            "UTF-8"));
+        new PlainTextByLineStream(new MarkableFileInputStreamFactory(testData),"UTF-8"));
 
     ChunkerEvaluator evaluator = new ChunkerEvaluator(new ChunkerME(model));
     evaluator.evaluate(samples);