You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2012/03/24 00:00:13 UTC

svn commit: r1304647 - in /opennlp/trunk/opennlp-tools/src: main/java/opennlp/tools/tokenize/ test/java/opennlp/tools/tokenize/

Author: colen
Date: Fri Mar 23 23:00:12 2012
New Revision: 1304647

URL: http://svn.apache.org/viewvc?rev=1304647&view=rev
Log:
OPENNLP-482: Changed TokenizerModel and TokenizerME to support factories. Implemented TokenizerFactory

Added:
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java   (with props)
    opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/tokenize/DummyTokenizerFactory.java   (with props)
    opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java   (with props)
Modified:
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java?rev=1304647&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java Fri Mar 23 23:00:12 2012
@@ -0,0 +1,252 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.lang.reflect.Constructor;
+import java.util.Collections;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import opennlp.tools.dictionary.Dictionary;
+import opennlp.tools.tokenize.lang.Factory;
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.model.ArtifactProvider;
+
+/**
+ * The factory that provides {@link Tokenizer} default implementations and
+ * resources. Users can extend this class if their application requires
+ * overriding the {@link TokenContextGenerator}, {@link Dictionary} etc.
+ */
+public class TokenizerFactory extends BaseToolFactory {
+
+  private String languageCode;
+  private Dictionary abbreviationDictionary;
+  private Boolean useAlphaNumericOptimization = null;
+  private Pattern alphaNumericPattern;
+
+  private static final String ABBREVIATIONS_ENTRY_NAME = "abbreviations.dictionary";
+  private static final String USE_ALPHA_NUMERIC_OPTIMIZATION = "useAlphaNumericOptimization";
+  private static final String ALPHA_NUMERIC_PATTERN = "alphaNumericPattern";
+
+  /**
+   * Creates a {@link TokenizerFactory} that provides the default implementation
+   * of the resources.
+   */
+  public TokenizerFactory() {
+  }
+
+  /**
+   * Creates a {@link TokenizerFactory} with an {@link ArtifactProvider} that
+   * will be used to retrieve artifacts. This constructor will try to get the
+   * language code, abbreviation dictionary etc from the
+   * {@link ArtifactProvider}.
+   * <p>
+   * Sub-classes should implement a constructor with this signature and call
+   * this constructor.
+   * <p>
+   * This will be used to load the factory from a serialized
+   * {@link TokenizerModel}.
+   */
+  public TokenizerFactory(ArtifactProvider artifactProvider) {
+    super(artifactProvider);
+  }
+
+  /**
+   * Creates a {@link TokenizerFactory}. Use this constructor to
+   * programmatically create a factory.
+   * 
+   * @param languageCode
+   *          the language of the natural text
+   * @param abbreviationDictionary
+   *          an abbreviations dictionary
+   * @param useAlphaNumericOptimization
+   *          if true alpha numerics are skipped
+   * @param alphaNumericPattern
+   *          null or a custom alphanumeric pattern (default is:
+   *          "^[A-Za-z0-9]+$", provided by {@link Factory#DEFAULT_ALPHANUMERIC})
+   */
+  public TokenizerFactory(String languageCode,
+      Dictionary abbreviationDictionary, boolean useAlphaNumericOptimization,
+      Pattern alphaNumericPattern) {
+    this.languageCode = languageCode;
+    this.useAlphaNumericOptimization = useAlphaNumericOptimization;
+    this.alphaNumericPattern = alphaNumericPattern;
+    this.abbreviationDictionary = abbreviationDictionary;
+  }
+
+  @Override
+  public void validateArtifactMap() throws InvalidFormatException {
+
+    if (this.artifactProvider.getManifestProperty(ALPHA_NUMERIC_PATTERN) == null)
+      throw new InvalidFormatException(ALPHA_NUMERIC_PATTERN
+          + " is a mandatory property!");
+
+    if (this.artifactProvider
+        .getManifestProperty(USE_ALPHA_NUMERIC_OPTIMIZATION) == null)
+      throw new InvalidFormatException(USE_ALPHA_NUMERIC_OPTIMIZATION
+          + " is a mandatory property!");
+
+    Object abbreviationsEntry = this.artifactProvider
+        .getArtifact(ABBREVIATIONS_ENTRY_NAME);
+
+    if (abbreviationsEntry != null
+        && !(abbreviationsEntry instanceof Dictionary)) {
+      throw new InvalidFormatException(
+          "Abbreviations dictionary has wrong type!");
+    }
+  }
+
+  @Override
+  public Map<String, Object> createArtifactMap() {
+    Map<String, Object> artifactMap = super.createArtifactMap();
+
+    // Abbreviations are optional
+    if (abbreviationDictionary != null)
+      artifactMap.put(ABBREVIATIONS_ENTRY_NAME, abbreviationDictionary);
+
+    return artifactMap;
+  }
+
+  @Override
+  public Map<String, String> createManifestEntries() {
+    Map<String, String> manifestEntries = super.createManifestEntries();
+
+    manifestEntries.put(USE_ALPHA_NUMERIC_OPTIMIZATION,
+        Boolean.toString(isUseAlphaNumericOptmization()));
+
+    // alphanumeric pattern is optional
+    if (getAlphaNumericPattern() != null)
+      manifestEntries.put(ALPHA_NUMERIC_PATTERN, getAlphaNumericPattern()
+          .pattern());
+
+    return manifestEntries;
+  }
+
+  /**
+   * Factory method the framework uses to create a new {@link TokenizerFactory}.
+   */
+  public static TokenizerFactory create(String subclassName,
+      String languageCode, Dictionary abbreviationDictionary,
+      boolean useAlphaNumericOptimization, Pattern alphaNumericPattern)
+      throws InvalidFormatException {
+    if (subclassName == null) {
+      // will create the default factory
+      return new TokenizerFactory(languageCode, abbreviationDictionary,
+          useAlphaNumericOptimization, alphaNumericPattern);
+    }
+    TokenizerFactory theFactory = null;
+    Class<? extends BaseToolFactory> factoryClass = loadSubclass(subclassName);
+    if (factoryClass != null) {
+      try {
+        Constructor<?> constructor = null;
+        constructor = factoryClass.getConstructor(String.class,
+            Dictionary.class, boolean.class, Pattern.class);
+        theFactory = (TokenizerFactory) constructor.newInstance(languageCode,
+            abbreviationDictionary, useAlphaNumericOptimization,
+            alphaNumericPattern);
+      } catch (NoSuchMethodException e) {
+        String msg = "Could not instantiate the "
+            + subclassName
+            + ". The mandatory constructor (String, Dictionary, boolean, Pattern) is missing.";
+        System.err.println(msg);
+        throw new IllegalArgumentException(msg);
+      } catch (Exception e) {
+        String msg = "Could not instantiate the "
+            + subclassName
+            + ". The constructor (String, Dictionary, boolean, Pattern) throw an exception.";
+        System.err.println(msg);
+        e.printStackTrace();
+        throw new InvalidFormatException(msg);
+      }
+    }
+    return theFactory;
+  }
+
+  /**
+   * Gets the alpha numeric pattern.
+   * 
+   * @return the user specified alpha numeric pattern or a default.
+   */
+  public Pattern getAlphaNumericPattern() {
+    if (this.alphaNumericPattern == null) {
+      if (artifactProvider != null) {
+        String prop = this.artifactProvider
+            .getManifestProperty(ALPHA_NUMERIC_PATTERN);
+        if (prop != null) {
+          this.alphaNumericPattern = Pattern.compile(prop);
+        }
+      } else {
+        // get from language dependent factory
+        Factory f = new Factory();
+        this.alphaNumericPattern = f.getAlphanumeric(languageCode);
+      }
+    }
+    return this.alphaNumericPattern;
+  }
+
+  /**
+   * Gets whether to use alphanumeric optimization.
+   */
+  public boolean isUseAlphaNumericOptmization() {
+    if (this.useAlphaNumericOptimization == null && artifactProvider != null) {
+      this.useAlphaNumericOptimization = Boolean.valueOf(artifactProvider
+          .getManifestProperty(USE_ALPHA_NUMERIC_OPTIMIZATION));
+    }
+    return this.useAlphaNumericOptimization;
+  }
+
+  /**
+   * Gets the abbreviation dictionary
+   * 
+   * @return null or the abbreviation dictionary
+   */
+  public Dictionary getAbbreviationDictionary() {
+    if (this.abbreviationDictionary == null && artifactProvider != null) {
+      this.abbreviationDictionary = artifactProvider
+          .getArtifact(ABBREVIATIONS_ENTRY_NAME);
+    }
+    return this.abbreviationDictionary;
+  }
+
+  /**
+   * Gets the language code
+   */
+  public String getLanguageCode() {
+    if (this.languageCode == null && artifactProvider != null) {
+      this.languageCode = this.artifactProvider.getLanguage();
+    }
+    return this.languageCode;
+  }
+
+  /**
+   * Gets the context generator
+   */
+  public TokenContextGenerator getContextGenerator() {
+    Factory f = new Factory();
+    Set<String> abbs = null;
+    Dictionary abbDict = getAbbreviationDictionary();
+    if (abbDict != null) {
+      abbs = abbDict.asStringSet();
+    } else {
+      abbs = Collections.emptySet();
+    }
+    return f.createTokenContextGenerator(getLanguageCode(), abbs);
+  }
+}

Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java?rev=1304647&r1=1304646&r2=1304647&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java Fri Mar 23 23:00:12 2012
@@ -210,6 +210,41 @@ public class TokenizerME extends Abstrac
     newTokens.toArray(spans);
     return spans;
   }
+  
+  /**
+   * Trains a model for the {@link TokenizerME}.
+   * 
+   * @param languageCode
+   *          the language of the natural text
+   * @param samples
+   *          the samples used for the training.
+   * @param factory
+   *          a {@link TokenizerFactory} to get resources from
+   * @param mlParams
+   *          the machine learning train parameters
+   * @return the trained {@link TokenizerModel}
+   * @throws IOException
+   *           if an I/O operation fails on the temporary file
+   *           that is created during training, or if reading
+   *           the training samples from the {@link ObjectStream}
+   *           fails.
+   */
+  public static TokenizerModel train(String languageCode,
+      ObjectStream<TokenSample> samples, TokenizerFactory factory,
+      TrainingParameters mlParams) throws IOException {
+
+    Map<String, String> manifestInfoEntries = new HashMap<String, String>();
+
+    EventStream eventStream = new TokSpanEventStream(samples,
+        factory.isUseAlphaNumericOptmization(),
+        factory.getAlphaNumericPattern(), factory.getContextGenerator());
+
+    AbstractModel maxentModel = TrainUtil.train(eventStream,
+        mlParams.getSettings(), manifestInfoEntries);
+
+    return new TokenizerModel(languageCode, maxentModel, manifestInfoEntries,
+        factory);
+  }
 
   /**
    * Trains a model for the {@link TokenizerME}.
@@ -225,6 +260,9 @@ public class TokenizerME extends Abstrac
    * is thrown during IO operations on a temp file which is created during training.
    * Or if reading from the {@link ObjectStream} fails.
    * 
+   * @deprecated Use 
+   *    {@link #train(String, ObjectStream, TokenizerFactory, TrainingParameters)} 
+   *    and pass in a {@link TokenizerFactory}
    */
   public static TokenizerModel train(String languageCode, ObjectStream<TokenSample> samples,
       boolean useAlphaNumericOptimization, TrainingParameters mlParams) throws IOException {
@@ -247,6 +285,9 @@ public class TokenizerME extends Abstrac
    * is thrown during IO operations on a temp file which is created during training.
    * Or if reading from the {@link ObjectStream} fails.
    * 
+   * @deprecated Use 
+   *    {@link #train(String, ObjectStream, TokenizerFactory, TrainingParameters)} 
+   *    and pass in a {@link TokenizerFactory}
    */
   public static TokenizerModel train(String languageCode,
       ObjectStream<TokenSample> samples, Dictionary abbreviations,
@@ -283,8 +324,9 @@ public class TokenizerME extends Abstrac
    * is thrown during IO operations on a temp file which is created during training.
    * Or if reading from the {@link ObjectStream} fails.
    * 
-   * @deprecated use {@link #train(String, ObjectStream, boolean, TrainingParameters)}
-   * instead and pass in a TrainingParameters object.
+   * @deprecated Use 
+   *    {@link #train(String, ObjectStream, TokenizerFactory, TrainingParameters)} 
+   *    and pass in a {@link TokenizerFactory}
    */
   @Deprecated
   public static TokenizerModel train(String languageCode, ObjectStream<TokenSample> samples,
@@ -308,6 +350,11 @@ public class TokenizerME extends Abstrac
    * 
    * @throws ObjectStreamException if reading from the {@link ObjectStream} fails
    * created during training.
+   * 
+   * 
+   * @deprecated Use 
+   *    {@link #train(String, ObjectStream, TokenizerFactory, TrainingParameters)} 
+   *    and pass in a {@link TokenizerFactory}
    */
   public static TokenizerModel train(String languageCode, ObjectStream<TokenSample> samples,
       boolean useAlphaNumericOptimization) throws IOException, ObjectStreamException {

Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java?rev=1304647&r1=1304646&r2=1304647&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java Fri Mar 23 23:00:12 2012
@@ -30,6 +30,7 @@ import opennlp.maxent.io.BinaryGISModelR
 import opennlp.model.AbstractModel;
 import opennlp.model.MaxentModel;
 import opennlp.tools.dictionary.Dictionary;
+import opennlp.tools.util.BaseToolFactory;
 import opennlp.tools.util.InvalidFormatException;
 import opennlp.tools.util.model.BaseModel;
 import opennlp.tools.util.model.ModelUtil;
@@ -45,31 +46,37 @@ public final class TokenizerModel extend
   private static final String COMPONENT_NAME = "TokenizerME";
   
   private static final String TOKENIZER_MODEL_ENTRY = "token.model";
-  private static final String ABBREVIATIONS_ENTRY_NAME = "abbreviations.dictionary";
 
-  private static final String USE_ALPHA_NUMERIC_OPTIMIZATION =
-      "useAlphaNumericOptimization";
+  /**
+   * Initializes the current instance.
+   * 
+   * @param languageCode the language of the natural text
+   * @param tokenizerModel the model
+   * @param manifestInfoEntries the manifest
+   * @param tokenizerFactory the factory
+   */
+  public TokenizerModel(String languageCode, AbstractModel tokenizerModel,
+      Map<String, String> manifestInfoEntries, TokenizerFactory tokenizerFactory) {
+    super(COMPONENT_NAME, languageCode, manifestInfoEntries, tokenizerFactory);
+    artifactMap.put(TOKENIZER_MODEL_ENTRY, tokenizerModel);
+    checkArtifactMap();
+  }
 
   /**
    * Initializes the current instance.
    *
    * @param tokenizerMaxentModel
    * @param useAlphaNumericOptimization
+   * 
+   * @deprecated Use
+   *             {@link TokenizerModel#TokenizerModel(String, AbstractModel, Map, TokenizerFactory)}
+   *             instead and pass in a {@link TokenizerFactory}.
    */
   public TokenizerModel(String language, AbstractModel tokenizerMaxentModel,
       Dictionary abbreviations, boolean useAlphaNumericOptimization,
       Map<String, String> manifestInfoEntries) {
-    super(COMPONENT_NAME, language, manifestInfoEntries);
-
-    artifactMap.put(TOKENIZER_MODEL_ENTRY, tokenizerMaxentModel);
-
-    setManifestProperty(USE_ALPHA_NUMERIC_OPTIMIZATION,
-        Boolean.toString(useAlphaNumericOptimization));
-
-    // Abbreviations are optional
-    if (abbreviations != null)
-      artifactMap.put(ABBREVIATIONS_ENTRY_NAME, abbreviations);
-    checkArtifactMap();
+    this(language, tokenizerMaxentModel, manifestInfoEntries, 
+        new TokenizerFactory(language, abbreviations, useAlphaNumericOptimization, null));
   }
 
   /**
@@ -79,6 +86,10 @@ public final class TokenizerModel extend
    * @param tokenizerMaxentModel
    * @param useAlphaNumericOptimization
    * @param manifestInfoEntries
+   * 
+   * @deprecated Use
+   *             {@link TokenizerModel#TokenizerModel(String, AbstractModel, Map, TokenizerFactory)}
+   *             instead and pass in a {@link TokenizerFactory}.
    */
   public TokenizerModel(String language, AbstractModel tokenizerMaxentModel,
       boolean useAlphaNumericOptimization, Map<String, String> manifestInfoEntries) {
@@ -91,6 +102,10 @@ public final class TokenizerModel extend
    * @param language
    * @param tokenizerMaxentModel
    * @param useAlphaNumericOptimization
+   * 
+   * @deprecated Use
+   *             {@link TokenizerModel#TokenizerModel(String, AbstractModel, Map, TokenizerFactory)}
+   *             instead and pass in a {@link TokenizerFactory}.
    */
   public TokenizerModel(String language, AbstractModel tokenizerMaxentModel,
       boolean useAlphaNumericOptimization) {
@@ -130,17 +145,15 @@ public final class TokenizerModel extend
     if (!isModelCompatible(getMaxentModel())) {
       throw new InvalidFormatException("The maxent model is not compatible with the tokenizer!");
     }
+  }
 
-    if (getManifestProperty(USE_ALPHA_NUMERIC_OPTIMIZATION) == null) {
-      throw new InvalidFormatException("The " + USE_ALPHA_NUMERIC_OPTIMIZATION + " parameter " +
-          "cannot be found!");
-    }
-    
-    Object abbreviationsEntry = artifactMap.get(ABBREVIATIONS_ENTRY_NAME);
+  public TokenizerFactory getFactory() {
+    return (TokenizerFactory) this.toolFactory;
+  }
 
-    if (abbreviationsEntry != null && !(abbreviationsEntry instanceof Dictionary)) {
-      throw new InvalidFormatException("Abbreviations dictionary has wrong type!");
-    }
+  @Override
+  protected Class<? extends BaseToolFactory> getDefaultFactory() {
+    return TokenizerFactory.class;
   }
 
   public AbstractModel getMaxentModel() {
@@ -148,13 +161,17 @@ public final class TokenizerModel extend
   }
   
   public Dictionary getAbbreviations() {
-    return (Dictionary) artifactMap.get(ABBREVIATIONS_ENTRY_NAME);
+    if (getFactory() != null) {
+      return getFactory().getAbbreviationDictionary();
+    }
+    return null;
   }
 
   public boolean useAlphaNumericOptimization() {
-    String optimization = getManifestProperty(USE_ALPHA_NUMERIC_OPTIMIZATION);
-
-    return Boolean.parseBoolean(optimization);
+    if (getFactory() != null) {
+      return getFactory().isUseAlphaNumericOptmization();
+    }
+    return false;
   }
 
   public static void main(String[] args) throws IOException {

Added: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/tokenize/DummyTokenizerFactory.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/tokenize/DummyTokenizerFactory.java?rev=1304647&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/tokenize/DummyTokenizerFactory.java (added)
+++ opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/tokenize/DummyTokenizerFactory.java Fri Mar 23 23:00:12 2012
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import opennlp.tools.dictionary.Dictionary;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.model.ArtifactProvider;
+import opennlp.tools.util.model.ArtifactSerializer;
+
+public class DummyTokenizerFactory extends TokenizerFactory {
+
+  private static final String DUMMY_DICT = "dummy";
+  private DummyDictionary dict;
+
+  public DummyTokenizerFactory(String languageCode,
+      Dictionary abbreviationDictionary, boolean useAlphaNumericOptimization,
+      Pattern alphaNumericPattern) {
+    super(languageCode, abbreviationDictionary, useAlphaNumericOptimization,
+        alphaNumericPattern);
+    this.dict = new DummyDictionary(abbreviationDictionary);
+  }
+
+  public DummyTokenizerFactory(ArtifactProvider provider) {
+    super(provider);
+  }
+
+  @Override
+  public DummyDictionary getAbbreviationDictionary() {
+    if (this.dict == null && artifactProvider != null) {
+      this.dict = artifactProvider.getArtifact(DUMMY_DICT);
+    }
+    return this.dict;
+  }
+
+  @Override
+  public TokenContextGenerator getContextGenerator() {
+    return new DummyContextGenerator(getAbbreviationDictionary().asStringSet());
+  }
+
+  @Override
+  @SuppressWarnings("rawtypes")
+  public Map<String, ArtifactSerializer> createArtifactSerializersMap() {
+    Map<String, ArtifactSerializer> serializers = super
+        .createArtifactSerializersMap();
+
+    serializers.put(DUMMY_DICT, new DummyDictionarySerializer());
+    return serializers;
+  }
+
+  @Override
+  public Map<String, Object> createArtifactMap() {
+    Map<String, Object> artifactMap = super.createArtifactMap();
+    if (this.dict != null)
+      artifactMap.put(DUMMY_DICT, this.dict);
+    return artifactMap;
+  }
+
+  static class DummyDictionarySerializer implements
+      ArtifactSerializer<DummyDictionary> {
+
+    public DummyDictionary create(InputStream in) throws IOException,
+        InvalidFormatException {
+      return new DummyDictionary(in);
+    }
+
+    public void serialize(DummyDictionary artifact, OutputStream out)
+        throws IOException {
+      artifact.serialize(out);
+    }
+  }
+
+  static class DummyDictionary extends Dictionary {
+    private Dictionary indict;
+
+    public DummyDictionary(Dictionary dict) {
+      this.indict = dict;
+    }
+
+    public DummyDictionary(InputStream in) throws IOException {
+      this.indict = new Dictionary(in);
+    }
+
+    public void serialize(OutputStream out) throws IOException {
+      indict.serialize(out);
+    }
+
+    public Set<String> asStringSet() {
+      return indict.asStringSet();
+    }
+  }
+
+  static class DummyContextGenerator extends DefaultTokenContextGenerator {
+
+    public DummyContextGenerator(Set<String> inducedAbbreviations) {
+      super(inducedAbbreviations);
+    }
+
+  }
+
+}

Propchange: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/tokenize/DummyTokenizerFactory.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java?rev=1304647&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java (added)
+++ opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java Fri Mar 23 23:00:12 2012
@@ -0,0 +1,226 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.regex.Pattern;
+
+import opennlp.tools.dictionary.Dictionary;
+import opennlp.tools.tokenize.DummyTokenizerFactory.DummyContextGenerator;
+import opennlp.tools.tokenize.DummyTokenizerFactory.DummyDictionary;
+import opennlp.tools.tokenize.lang.Factory;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.TrainingParameters;
+
+import org.junit.Test;
+
+/**
+ * Tests for the {@link TokenizerFactory} class.
+ */
+public class TokenizerFactoryTest {
+
+  private static ObjectStream<TokenSample> createSampleStream()
+      throws IOException {
+    InputStream in = TokenizerFactoryTest.class.getClassLoader()
+        .getResourceAsStream("opennlp/tools/tokenize/token.train");
+
+    return new TokenSampleStream(new PlainTextByLineStream(
+        new InputStreamReader(in)));
+  }
+
+  private static TokenizerModel train(TokenizerFactory factory)
+      throws IOException {
+    return TokenizerME.train(factory.getLanguageCode(), createSampleStream(),
+        factory, TrainingParameters.defaultParams());
+  }
+
+  static Dictionary loadAbbDictionary() throws IOException {
+    InputStream in = TokenizerFactoryTest.class.getClassLoader()
+        .getResourceAsStream("opennlp/tools/sentdetect/abb.xml");
+
+    return new Dictionary(in);
+  }
+
+  @Test
+  public void testDefault() throws IOException {
+
+    Dictionary dic = loadAbbDictionary();
+    final String lang = "es";
+
+    TokenizerModel model = train(new TokenizerFactory(lang, dic, false, null));
+
+    TokenizerFactory factory = model.getFactory();
+    assertTrue(factory.getAbbreviationDictionary() instanceof Dictionary);
+    assertTrue(factory.getContextGenerator() instanceof DefaultTokenContextGenerator);
+
+    assertEquals(Factory.DEFAULT_ALPHANUMERIC, factory.getAlphaNumericPattern()
+        .pattern());
+    assertEquals(lang, factory.getLanguageCode());
+    assertEquals(lang, model.getLanguage());
+    assertFalse(factory.isUseAlphaNumericOptmization());
+
+    ByteArrayOutputStream out = new ByteArrayOutputStream();
+    model.serialize(out);
+    ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
+
+    TokenizerModel fromSerialized = new TokenizerModel(in);
+
+    factory = fromSerialized.getFactory();
+    assertTrue(factory.getAbbreviationDictionary() instanceof Dictionary);
+    assertTrue(factory.getContextGenerator() instanceof DefaultTokenContextGenerator);
+
+    assertEquals(Factory.DEFAULT_ALPHANUMERIC, factory.getAlphaNumericPattern()
+        .pattern());
+    assertEquals(lang, factory.getLanguageCode());
+    assertEquals(lang, model.getLanguage());
+    assertFalse(factory.isUseAlphaNumericOptmization());
+  }
+
+  @Test
+  public void testNullDict() throws IOException {
+
+    Dictionary dic = null;
+    final String lang = "es";
+
+    TokenizerModel model = train(new TokenizerFactory(lang, dic, false, null));
+
+    TokenizerFactory factory = model.getFactory();
+    assertNull(factory.getAbbreviationDictionary());
+    assertTrue(factory.getContextGenerator() instanceof DefaultTokenContextGenerator);
+
+    assertEquals(Factory.DEFAULT_ALPHANUMERIC, factory.getAlphaNumericPattern()
+        .pattern());
+    assertEquals(lang, factory.getLanguageCode());
+    assertEquals(lang, model.getLanguage());
+    assertFalse(factory.isUseAlphaNumericOptmization());
+
+    ByteArrayOutputStream out = new ByteArrayOutputStream();
+    model.serialize(out);
+    ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
+
+    TokenizerModel fromSerialized = new TokenizerModel(in);
+
+    factory = fromSerialized.getFactory();
+    assertNull(factory.getAbbreviationDictionary());
+    assertTrue(factory.getContextGenerator() instanceof DefaultTokenContextGenerator);
+
+    assertEquals(Factory.DEFAULT_ALPHANUMERIC, factory.getAlphaNumericPattern()
+        .pattern());
+    assertEquals(lang, factory.getLanguageCode());
+    assertEquals(lang, model.getLanguage());
+    assertFalse(factory.isUseAlphaNumericOptmization());
+  }
+
+  @Test
+  public void testCustomPatternAndAlphaOpt() throws IOException {
+
+    Dictionary dic = null;
+    final String lang = "es";
+    String pattern = "^[0-9A-Za-z]+$";
+
+    TokenizerModel model = train(new TokenizerFactory(lang, dic, true,
+        Pattern.compile(pattern)));
+
+    TokenizerFactory factory = model.getFactory();
+    assertNull(factory.getAbbreviationDictionary());
+    assertTrue(factory.getContextGenerator() instanceof DefaultTokenContextGenerator);
+
+    assertEquals(pattern, factory.getAlphaNumericPattern().pattern());
+    assertEquals(lang, factory.getLanguageCode());
+    assertEquals(lang, model.getLanguage());
+    assertTrue(factory.isUseAlphaNumericOptmization());
+
+    ByteArrayOutputStream out = new ByteArrayOutputStream();
+    model.serialize(out);
+    ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
+
+    TokenizerModel fromSerialized = new TokenizerModel(in);
+
+    factory = fromSerialized.getFactory();
+    assertNull(factory.getAbbreviationDictionary());
+    assertTrue(factory.getContextGenerator() instanceof DefaultTokenContextGenerator);
+
+    assertEquals(pattern, factory.getAlphaNumericPattern().pattern());
+    assertEquals(lang, factory.getLanguageCode());
+    assertEquals(lang, model.getLanguage());
+    assertTrue(factory.isUseAlphaNumericOptmization());
+  }
+
+  @Test
+  public void testDummyFactory() throws IOException {
+
+    Dictionary dic = loadAbbDictionary();
+    final String lang = "es";
+    String pattern = "^[0-9A-Za-z]+$";
+
+    TokenizerModel model = train(new DummyTokenizerFactory(lang, dic, true,
+        Pattern.compile(pattern)));
+
+    TokenizerFactory factory = model.getFactory();
+    assertTrue(factory.getAbbreviationDictionary() instanceof DummyDictionary);
+    assertTrue(factory.getContextGenerator() instanceof DummyContextGenerator);
+
+    assertEquals(pattern, factory.getAlphaNumericPattern().pattern());
+    assertEquals(lang, factory.getLanguageCode());
+    assertEquals(lang, model.getLanguage());
+    assertTrue(factory.isUseAlphaNumericOptmization());
+
+    ByteArrayOutputStream out = new ByteArrayOutputStream();
+    model.serialize(out);
+    ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
+
+    TokenizerModel fromSerialized = new TokenizerModel(in);
+
+    factory = fromSerialized.getFactory();
+    assertTrue(factory.getAbbreviationDictionary() instanceof DummyDictionary);
+    assertTrue(factory.getContextGenerator() instanceof DummyContextGenerator);
+
+    assertEquals(pattern, factory.getAlphaNumericPattern().pattern());
+    assertEquals(lang, factory.getLanguageCode());
+    assertEquals(lang, model.getLanguage());
+    assertTrue(factory.isUseAlphaNumericOptmization());
+  }
+
+  @Test
+  public void testCreateDummyFactory() throws IOException {
+    Dictionary dic = loadAbbDictionary();
+    final String lang = "es";
+    String pattern = "^[0-9A-Za-z]+$";
+
+    TokenizerFactory factory = TokenizerFactory.create(
+        DummyTokenizerFactory.class.getCanonicalName(), lang, dic, true,
+        Pattern.compile(pattern));
+
+    assertTrue(factory.getAbbreviationDictionary() instanceof DummyDictionary);
+    assertTrue(factory.getContextGenerator() instanceof DummyContextGenerator);
+
+    assertEquals(pattern, factory.getAlphaNumericPattern().pattern());
+    assertEquals(lang, factory.getLanguageCode());
+    assertTrue(factory.isUseAlphaNumericOptmization());
+  }
+}

Propchange: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain