You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2012/03/24 00:00:13 UTC
svn commit: r1304647 - in /opennlp/trunk/opennlp-tools/src:
main/java/opennlp/tools/tokenize/ test/java/opennlp/tools/tokenize/
Author: colen
Date: Fri Mar 23 23:00:12 2012
New Revision: 1304647
URL: http://svn.apache.org/viewvc?rev=1304647&view=rev
Log:
OPENNLP-482: Changed TokenizerModel and TokenizerME to support factories. Implemented TokenizerFactory
Added:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java (with props)
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/tokenize/DummyTokenizerFactory.java (with props)
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java (with props)
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java
Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java?rev=1304647&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java Fri Mar 23 23:00:12 2012
@@ -0,0 +1,252 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.lang.reflect.Constructor;
+import java.util.Collections;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import opennlp.tools.dictionary.Dictionary;
+import opennlp.tools.tokenize.lang.Factory;
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.model.ArtifactProvider;
+
+/**
+ * The factory that provides {@link Tokenizer} default implementations and
+ * resources. Users can extend this class if their application requires
+ * overriding the {@link TokenContextGenerator}, {@link Dictionary} etc.
+ * <p>
+ * An instance is either configured programmatically through
+ * {@link #TokenizerFactory(String, Dictionary, boolean, Pattern)} or bound to
+ * an {@link ArtifactProvider}; in the latter case the getters read their
+ * values from the provider on first access and cache them in this instance.
+ */
+public class TokenizerFactory extends BaseToolFactory {
+
+  private String languageCode;
+  private Dictionary abbreviationDictionary;
+  // Boxed on purpose: null means "not resolved yet".
+  private Boolean useAlphaNumericOptimization = null;
+  private Pattern alphaNumericPattern;
+
+  private static final String ABBREVIATIONS_ENTRY_NAME = "abbreviations.dictionary";
+  private static final String USE_ALPHA_NUMERIC_OPTIMIZATION = "useAlphaNumericOptimization";
+  private static final String ALPHA_NUMERIC_PATTERN = "alphaNumericPattern";
+
+  /**
+   * Creates a {@link TokenizerFactory} that provides the default implementation
+   * of the resources.
+   */
+  public TokenizerFactory() {
+  }
+
+  /**
+   * Creates a {@link TokenizerFactory} with an {@link ArtifactProvider} that
+   * will be used to retrieve artifacts. The getters of this class will try to
+   * get the language code, abbreviation dictionary etc. from the
+   * {@link ArtifactProvider}.
+   * <p>
+   * Sub-classes should implement a constructor with this signature and call
+   * this constructor.
+   * <p>
+   * This will be used to load the factory from a serialized
+   * {@link TokenizerModel}.
+   *
+   * @param artifactProvider
+   *          the provider the artifacts and manifest properties are read from
+   */
+  public TokenizerFactory(ArtifactProvider artifactProvider) {
+    super(artifactProvider);
+  }
+
+  /**
+   * Creates a {@link TokenizerFactory}. Use this constructor to
+   * programmatically create a factory.
+   *
+   * @param languageCode
+   *          the language of the natural text
+   * @param abbreviationDictionary
+   *          an abbreviations dictionary, or null if there is none
+   * @param useAlphaNumericOptimization
+   *          if true alpha numerics are skipped
+   * @param alphaNumericPattern
+   *          null or a custom alphanumeric pattern (default is:
+   *          "^[A-Za-z0-9]+$", provided by {@link Factory#DEFAULT_ALPHANUMERIC})
+   */
+  public TokenizerFactory(String languageCode,
+      Dictionary abbreviationDictionary, boolean useAlphaNumericOptimization,
+      Pattern alphaNumericPattern) {
+    this.languageCode = languageCode;
+    this.useAlphaNumericOptimization = useAlphaNumericOptimization;
+    this.alphaNumericPattern = alphaNumericPattern;
+    this.abbreviationDictionary = abbreviationDictionary;
+  }
+
+  /**
+   * Checks the manifest properties and artifacts of the bound
+   * {@link ArtifactProvider}: both the alphanumeric pattern and the
+   * alphanumeric optimization flag must be present, and the abbreviations
+   * entry, if present, must be a {@link Dictionary}.
+   *
+   * @throws InvalidFormatException
+   *           if a mandatory property is missing or the abbreviations entry
+   *           has the wrong type
+   */
+  @Override
+  public void validateArtifactMap() throws InvalidFormatException {
+
+    if (this.artifactProvider.getManifestProperty(ALPHA_NUMERIC_PATTERN) == null) {
+      throw new InvalidFormatException(ALPHA_NUMERIC_PATTERN
+          + " is a mandatory property!");
+    }
+
+    if (this.artifactProvider
+        .getManifestProperty(USE_ALPHA_NUMERIC_OPTIMIZATION) == null) {
+      throw new InvalidFormatException(USE_ALPHA_NUMERIC_OPTIMIZATION
+          + " is a mandatory property!");
+    }
+
+    Object abbreviationsEntry = this.artifactProvider
+        .getArtifact(ABBREVIATIONS_ENTRY_NAME);
+
+    if (abbreviationsEntry != null
+        && !(abbreviationsEntry instanceof Dictionary)) {
+      throw new InvalidFormatException(
+          "Abbreviations dictionary has wrong type!");
+    }
+  }
+
+  /**
+   * Adds the abbreviation dictionary, if one was set, to the artifact map
+   * created by the super class.
+   *
+   * @return the artifact map
+   */
+  @Override
+  public Map<String, Object> createArtifactMap() {
+    Map<String, Object> artifactMap = super.createArtifactMap();
+
+    // Abbreviations are optional
+    if (abbreviationDictionary != null) {
+      artifactMap.put(ABBREVIATIONS_ENTRY_NAME, abbreviationDictionary);
+    }
+
+    return artifactMap;
+  }
+
+  /**
+   * Adds the alphanumeric optimization flag and the alphanumeric pattern to
+   * the manifest entries created by the super class.
+   *
+   * @return the manifest entries
+   */
+  @Override
+  public Map<String, String> createManifestEntries() {
+    Map<String, String> manifestEntries = super.createManifestEntries();
+
+    manifestEntries.put(USE_ALPHA_NUMERIC_OPTIMIZATION,
+        Boolean.toString(isUseAlphaNumericOptmization()));
+
+    // Only written when a pattern could be resolved; note that
+    // validateArtifactMap() requires this property when a model is loaded.
+    Pattern pattern = getAlphaNumericPattern();
+    if (pattern != null) {
+      manifestEntries.put(ALPHA_NUMERIC_PATTERN, pattern.pattern());
+    }
+
+    return manifestEntries;
+  }
+
+  /**
+   * Factory method the framework uses to create a new {@link TokenizerFactory}.
+   *
+   * @param subclassName
+   *          the name of the factory class to instantiate, or null to create
+   *          the default {@link TokenizerFactory}
+   * @param languageCode
+   *          the language of the natural text
+   * @param abbreviationDictionary
+   *          an abbreviations dictionary
+   * @param useAlphaNumericOptimization
+   *          if true alpha numerics are skipped
+   * @param alphaNumericPattern
+   *          null or a custom alphanumeric pattern
+   * @return the new factory, or null if loading the subclass yielded no class
+   * @throws InvalidFormatException
+   *           if the constructor of the subclass could not be invoked; the
+   *           original exception is attached as the cause
+   * @throws IllegalArgumentException
+   *           if the subclass lacks the mandatory
+   *           (String, Dictionary, boolean, Pattern) constructor
+   */
+  public static TokenizerFactory create(String subclassName,
+      String languageCode, Dictionary abbreviationDictionary,
+      boolean useAlphaNumericOptimization, Pattern alphaNumericPattern)
+      throws InvalidFormatException {
+    if (subclassName == null) {
+      // will create the default factory
+      return new TokenizerFactory(languageCode, abbreviationDictionary,
+          useAlphaNumericOptimization, alphaNumericPattern);
+    }
+    TokenizerFactory theFactory = null;
+    Class<? extends BaseToolFactory> factoryClass = loadSubclass(subclassName);
+    if (factoryClass != null) {
+      try {
+        Constructor<?> constructor = factoryClass.getConstructor(String.class,
+            Dictionary.class, boolean.class, Pattern.class);
+        theFactory = (TokenizerFactory) constructor.newInstance(languageCode,
+            abbreviationDictionary, useAlphaNumericOptimization,
+            alphaNumericPattern);
+      } catch (NoSuchMethodException e) {
+        String msg = "Could not instantiate the "
+            + subclassName
+            + ". The mandatory constructor (String, Dictionary, boolean, Pattern) is missing.";
+        System.err.println(msg);
+        // keep the reflective failure as the cause for the caller
+        throw new IllegalArgumentException(msg, e);
+      } catch (Exception e) {
+        String msg = "Could not instantiate the "
+            + subclassName
+            + ". The constructor (String, Dictionary, boolean, Pattern) throw an exception.";
+        System.err.println(msg);
+        // Attach the cause instead of only printing its stack trace, so the
+        // caller can see why the constructor failed.
+        InvalidFormatException failure = new InvalidFormatException(msg);
+        failure.initCause(e);
+        throw failure;
+      }
+    }
+    return theFactory;
+  }
+
+  /**
+   * Gets the alpha numeric pattern.
+   *
+   * @return the user specified alpha numeric pattern, the pattern stored in
+   *         the manifest of the bound {@link ArtifactProvider}, or the
+   *         language dependent default.
+   */
+  public Pattern getAlphaNumericPattern() {
+    if (this.alphaNumericPattern == null) {
+      if (artifactProvider != null) {
+        String prop = this.artifactProvider
+            .getManifestProperty(ALPHA_NUMERIC_PATTERN);
+        if (prop != null) {
+          this.alphaNumericPattern = Pattern.compile(prop);
+        }
+      }
+      // Not set by the user and not found in the manifest: get it from the
+      // language dependent factory rather than returning null.
+      if (this.alphaNumericPattern == null) {
+        Factory f = new Factory();
+        this.alphaNumericPattern = f.getAlphanumeric(getLanguageCode());
+      }
+    }
+    return this.alphaNumericPattern;
+  }
+
+  /**
+   * Gets whether to use alphanumeric optimization.
+   *
+   * @return the configured flag, the flag stored in the manifest of the bound
+   *         {@link ArtifactProvider}, or false if neither is available
+   */
+  public boolean isUseAlphaNumericOptmization() {
+    if (this.useAlphaNumericOptimization == null && artifactProvider != null) {
+      this.useAlphaNumericOptimization = Boolean.valueOf(artifactProvider
+          .getManifestProperty(USE_ALPHA_NUMERIC_OPTIMIZATION));
+    }
+    // The flag is still null for a factory created with the no-arg
+    // constructor; unboxing it directly would throw a NullPointerException.
+    return this.useAlphaNumericOptimization != null
+        && this.useAlphaNumericOptimization.booleanValue();
+  }
+
+  /**
+   * Gets the abbreviation dictionary
+   *
+   * @return null or the abbreviation dictionary
+   */
+  public Dictionary getAbbreviationDictionary() {
+    if (this.abbreviationDictionary == null && artifactProvider != null) {
+      this.abbreviationDictionary = artifactProvider
+          .getArtifact(ABBREVIATIONS_ENTRY_NAME);
+    }
+    return this.abbreviationDictionary;
+  }
+
+  /**
+   * Gets the language code
+   *
+   * @return the configured language code or, if none was set, the language
+   *         reported by the bound {@link ArtifactProvider}
+   */
+  public String getLanguageCode() {
+    if (this.languageCode == null && artifactProvider != null) {
+      this.languageCode = this.artifactProvider.getLanguage();
+    }
+    return this.languageCode;
+  }
+
+  /**
+   * Gets the context generator
+   *
+   * @return a context generator created by the language dependent
+   *         {@link Factory} for the language code and the abbreviations of
+   *         this factory (an empty set if there is no dictionary)
+   */
+  public TokenContextGenerator getContextGenerator() {
+    Factory f = new Factory();
+    Set<String> abbs = null;
+    Dictionary abbDict = getAbbreviationDictionary();
+    if (abbDict != null) {
+      abbs = abbDict.asStringSet();
+    } else {
+      abbs = Collections.emptySet();
+    }
+    return f.createTokenContextGenerator(getLanguageCode(), abbs);
+  }
+}
Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java?rev=1304647&r1=1304646&r2=1304647&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java Fri Mar 23 23:00:12 2012
@@ -210,6 +210,41 @@ public class TokenizerME extends Abstrac
newTokens.toArray(spans);
return spans;
}
+
+ /**
+  * Trains a {@link TokenizerModel} for the {@link TokenizerME}, taking the
+  * alphanumeric settings and the context generator from the given factory.
+  *
+  * @param languageCode
+  *          the language of the natural text
+  * @param samples
+  *          the samples used for the training.
+  * @param factory
+  *          a {@link TokenizerFactory} to get resources from
+  * @param mlParams
+  *          the machine learning train parameters
+  * @return the trained {@link TokenizerModel}
+  * @throws IOException
+  *           it throws an {@link IOException} if an {@link IOException} is
+  *           thrown during IO operations on a temp file which is created
+  *           during training. Or if reading from the {@link ObjectStream}
+  *           fails.
+  */
+ public static TokenizerModel train(String languageCode,
+     ObjectStream<TokenSample> samples, TokenizerFactory factory,
+     TrainingParameters mlParams) throws IOException {
+
+   // the event stream is configured entirely from the factory
+   EventStream events = new TokSpanEventStream(samples,
+       factory.isUseAlphaNumericOptmization(),
+       factory.getAlphaNumericPattern(), factory.getContextGenerator());
+
+   // handed to the trainer and afterwards to the model
+   Map<String, String> manifest = new HashMap<String, String>();
+
+   AbstractModel trained = TrainUtil.train(events, mlParams.getSettings(),
+       manifest);
+
+   return new TokenizerModel(languageCode, trained, manifest, factory);
+ }
/**
* Trains a model for the {@link TokenizerME}.
@@ -225,6 +260,9 @@ public class TokenizerME extends Abstrac
* is thrown during IO operations on a temp file which is created during training.
* Or if reading from the {@link ObjectStream} fails.
*
+ * @deprecated Use
+ * {@link #train(String, ObjectStream, TokenizerFactory, TrainingParameters)}
+ * and pass in a {@link TokenizerFactory}
*/
public static TokenizerModel train(String languageCode, ObjectStream<TokenSample> samples,
boolean useAlphaNumericOptimization, TrainingParameters mlParams) throws IOException {
@@ -247,6 +285,9 @@ public class TokenizerME extends Abstrac
* is thrown during IO operations on a temp file which is created during training.
* Or if reading from the {@link ObjectStream} fails.
*
+ * @deprecated Use
+ * {@link #train(String, ObjectStream, TokenizerFactory, TrainingParameters)}
+ * and pass in a {@link TokenizerFactory}
*/
public static TokenizerModel train(String languageCode,
ObjectStream<TokenSample> samples, Dictionary abbreviations,
@@ -283,8 +324,9 @@ public class TokenizerME extends Abstrac
* is thrown during IO operations on a temp file which is created during training.
* Or if reading from the {@link ObjectStream} fails.
*
- * @deprecated use {@link #train(String, ObjectStream, boolean, TrainingParameters)}
- * instead and pass in a TrainingParameters object.
+ * @deprecated Use
+ * {@link #train(String, ObjectStream, TokenizerFactory, TrainingParameters)}
+ * and pass in a {@link TokenizerFactory}
*/
@Deprecated
public static TokenizerModel train(String languageCode, ObjectStream<TokenSample> samples,
@@ -308,6 +350,11 @@ public class TokenizerME extends Abstrac
*
* @throws ObjectStreamException if reading from the {@link ObjectStream} fails
* created during training.
+ *
+ *
+ * @deprecated Use
+ * {@link #train(String, ObjectStream, TokenizerFactory, TrainingParameters)}
+ * and pass in a {@link TokenizerFactory}
*/
public static TokenizerModel train(String languageCode, ObjectStream<TokenSample> samples,
boolean useAlphaNumericOptimization) throws IOException, ObjectStreamException {
Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java?rev=1304647&r1=1304646&r2=1304647&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java Fri Mar 23 23:00:12 2012
@@ -30,6 +30,7 @@ import opennlp.maxent.io.BinaryGISModelR
import opennlp.model.AbstractModel;
import opennlp.model.MaxentModel;
import opennlp.tools.dictionary.Dictionary;
+import opennlp.tools.util.BaseToolFactory;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.model.BaseModel;
import opennlp.tools.util.model.ModelUtil;
@@ -45,31 +46,37 @@ public final class TokenizerModel extend
private static final String COMPONENT_NAME = "TokenizerME";
private static final String TOKENIZER_MODEL_ENTRY = "token.model";
- private static final String ABBREVIATIONS_ENTRY_NAME = "abbreviations.dictionary";
- private static final String USE_ALPHA_NUMERIC_OPTIMIZATION =
- "useAlphaNumericOptimization";
+ /**
+ * Initializes the current instance from a trained model and a
+ * {@link TokenizerFactory}. The model is stored in the artifact map under
+ * the {@code token.model} entry and {@code checkArtifactMap()} is then
+ * invoked.
+ *
+ * @param languageCode the language of the natural text
+ * @param tokenizerModel the trained model to store in this instance
+ * @param manifestInfoEntries the manifest entries, passed on to the super constructor
+ * @param tokenizerFactory the factory, passed on to the super constructor
+ */
+ public TokenizerModel(String languageCode, AbstractModel tokenizerModel,
+ Map<String, String> manifestInfoEntries, TokenizerFactory tokenizerFactory) {
+ super(COMPONENT_NAME, languageCode, manifestInfoEntries, tokenizerFactory);
+ artifactMap.put(TOKENIZER_MODEL_ENTRY, tokenizerModel);
+ checkArtifactMap();
+ }
/**
* Initializes the current instance.
*
* @param tokenizerMaxentModel
* @param useAlphaNumericOptimization
+ *
+ * @deprecated Use
+ * {@link TokenizerModel#TokenizerModel(String, AbstractModel, Map, TokenizerFactory)}
+ * instead and pass in a {@link TokenizerFactory}.
*/
public TokenizerModel(String language, AbstractModel tokenizerMaxentModel,
Dictionary abbreviations, boolean useAlphaNumericOptimization,
Map<String, String> manifestInfoEntries) {
- super(COMPONENT_NAME, language, manifestInfoEntries);
-
- artifactMap.put(TOKENIZER_MODEL_ENTRY, tokenizerMaxentModel);
-
- setManifestProperty(USE_ALPHA_NUMERIC_OPTIMIZATION,
- Boolean.toString(useAlphaNumericOptimization));
-
- // Abbreviations are optional
- if (abbreviations != null)
- artifactMap.put(ABBREVIATIONS_ENTRY_NAME, abbreviations);
- checkArtifactMap();
+ this(language, tokenizerMaxentModel, manifestInfoEntries,
+ new TokenizerFactory(language, abbreviations, useAlphaNumericOptimization, null));
}
/**
@@ -79,6 +86,10 @@ public final class TokenizerModel extend
* @param tokenizerMaxentModel
* @param useAlphaNumericOptimization
* @param manifestInfoEntries
+ *
+ * @deprecated Use
+ * {@link TokenizerModel#TokenizerModel(String, AbstractModel, Map, TokenizerFactory)}
+ * instead and pass in a {@link TokenizerFactory}.
*/
public TokenizerModel(String language, AbstractModel tokenizerMaxentModel,
boolean useAlphaNumericOptimization, Map<String, String> manifestInfoEntries) {
@@ -91,6 +102,10 @@ public final class TokenizerModel extend
* @param language
* @param tokenizerMaxentModel
* @param useAlphaNumericOptimization
+ *
+ * @deprecated Use
+ * {@link TokenizerModel#TokenizerModel(String, AbstractModel, Map, TokenizerFactory)}
+ * instead and pass in a {@link TokenizerFactory}.
*/
public TokenizerModel(String language, AbstractModel tokenizerMaxentModel,
boolean useAlphaNumericOptimization) {
@@ -130,17 +145,15 @@ public final class TokenizerModel extend
if (!isModelCompatible(getMaxentModel())) {
throw new InvalidFormatException("The maxent model is not compatible with the tokenizer!");
}
+ }
- if (getManifestProperty(USE_ALPHA_NUMERIC_OPTIMIZATION) == null) {
- throw new InvalidFormatException("The " + USE_ALPHA_NUMERIC_OPTIMIZATION + " parameter " +
- "cannot be found!");
- }
-
- Object abbreviationsEntry = artifactMap.get(ABBREVIATIONS_ENTRY_NAME);
+ /**
+ * Gets the tool factory of this model, cast to {@link TokenizerFactory}.
+ *
+ * @return the value of {@code toolFactory} as a {@link TokenizerFactory}
+ */
+ public TokenizerFactory getFactory() {
+ return (TokenizerFactory) this.toolFactory;
+ }
- if (abbreviationsEntry != null && !(abbreviationsEntry instanceof Dictionary)) {
- throw new InvalidFormatException("Abbreviations dictionary has wrong type!");
- }
+ @Override
+ protected Class<? extends BaseToolFactory> getDefaultFactory() {
+ return TokenizerFactory.class;
}
public AbstractModel getMaxentModel() {
@@ -148,13 +161,17 @@ public final class TokenizerModel extend
}
public Dictionary getAbbreviations() {
- return (Dictionary) artifactMap.get(ABBREVIATIONS_ENTRY_NAME);
+ if (getFactory() != null) {
+ return getFactory().getAbbreviationDictionary();
+ }
+ return null;
}
public boolean useAlphaNumericOptimization() {
- String optimization = getManifestProperty(USE_ALPHA_NUMERIC_OPTIMIZATION);
-
- return Boolean.parseBoolean(optimization);
+ if (getFactory() != null) {
+ return getFactory().isUseAlphaNumericOptmization();
+ }
+ return false;
}
public static void main(String[] args) throws IOException {
Added: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/tokenize/DummyTokenizerFactory.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/tokenize/DummyTokenizerFactory.java?rev=1304647&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/tokenize/DummyTokenizerFactory.java (added)
+++ opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/tokenize/DummyTokenizerFactory.java Fri Mar 23 23:00:12 2012
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import opennlp.tools.dictionary.Dictionary;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.model.ArtifactProvider;
+import opennlp.tools.util.model.ArtifactSerializer;
+
+/**
+ * A {@link TokenizerFactory} used by the tests. It wraps the abbreviation
+ * dictionary in a {@link DummyDictionary}, which is stored as an additional
+ * artifact under the {@code dummy} entry, and hands out a
+ * {@link DummyContextGenerator}.
+ */
+public class DummyTokenizerFactory extends TokenizerFactory {
+
+  private static final String DUMMY_DICT = "dummy";
+  private DummyDictionary dummyDictionary;
+
+  /**
+   * Creates the factory programmatically and wraps the given dictionary in a
+   * {@link DummyDictionary}.
+   */
+  public DummyTokenizerFactory(String languageCode,
+      Dictionary abbreviationDictionary, boolean useAlphaNumericOptimization,
+      Pattern alphaNumericPattern) {
+    super(languageCode, abbreviationDictionary, useAlphaNumericOptimization,
+        alphaNumericPattern);
+    this.dummyDictionary = new DummyDictionary(abbreviationDictionary);
+  }
+
+  /**
+   * Creates the factory on top of an {@link ArtifactProvider}.
+   */
+  public DummyTokenizerFactory(ArtifactProvider provider) {
+    super(provider);
+  }
+
+  /**
+   * Returns the wrapped dictionary; if it is not set yet and a provider is
+   * available it is first fetched from the {@code dummy} artifact.
+   */
+  @Override
+  public DummyDictionary getAbbreviationDictionary() {
+    if (this.dummyDictionary != null || artifactProvider == null) {
+      return this.dummyDictionary;
+    }
+    this.dummyDictionary = artifactProvider.getArtifact(DUMMY_DICT);
+    return this.dummyDictionary;
+  }
+
+  /**
+   * Returns a {@link DummyContextGenerator} built from the abbreviations of
+   * the wrapped dictionary.
+   */
+  @Override
+  public TokenContextGenerator getContextGenerator() {
+    Set<String> abbreviations = getAbbreviationDictionary().asStringSet();
+    return new DummyContextGenerator(abbreviations);
+  }
+
+  /**
+   * Registers the {@link DummyDictionarySerializer} for the {@code dummy}
+   * entry in addition to the serializers of the super class.
+   */
+  @Override
+  @SuppressWarnings("rawtypes")
+  public Map<String, ArtifactSerializer> createArtifactSerializersMap() {
+    Map<String, ArtifactSerializer> result = super
+        .createArtifactSerializersMap();
+    result.put(DUMMY_DICT, new DummyDictionarySerializer());
+    return result;
+  }
+
+  /**
+   * Adds the wrapped dictionary, if there is one, to the artifacts of the
+   * super class.
+   */
+  @Override
+  public Map<String, Object> createArtifactMap() {
+    Map<String, Object> result = super.createArtifactMap();
+    if (this.dummyDictionary != null) {
+      result.put(DUMMY_DICT, this.dummyDictionary);
+    }
+    return result;
+  }
+
+  /**
+   * Reads and writes {@link DummyDictionary} artifacts.
+   */
+  static class DummyDictionarySerializer implements
+      ArtifactSerializer<DummyDictionary> {
+
+    public DummyDictionary create(InputStream in) throws IOException,
+        InvalidFormatException {
+      return new DummyDictionary(in);
+    }
+
+    public void serialize(DummyDictionary artifact, OutputStream out)
+        throws IOException {
+      artifact.serialize(out);
+    }
+  }
+
+  /**
+   * A {@link Dictionary} that forwards serialization and the string set view
+   * to a wrapped dictionary.
+   */
+  static class DummyDictionary extends Dictionary {
+    private Dictionary delegate;
+
+    public DummyDictionary(Dictionary dict) {
+      this.delegate = dict;
+    }
+
+    public DummyDictionary(InputStream in) throws IOException {
+      this.delegate = new Dictionary(in);
+    }
+
+    public void serialize(OutputStream out) throws IOException {
+      delegate.serialize(out);
+    }
+
+    public Set<String> asStringSet() {
+      return delegate.asStringSet();
+    }
+  }
+
+  /**
+   * A {@link DefaultTokenContextGenerator} sub-class the tests can recognize
+   * by its type.
+   */
+  static class DummyContextGenerator extends DefaultTokenContextGenerator {
+
+    public DummyContextGenerator(Set<String> inducedAbbreviations) {
+      super(inducedAbbreviations);
+    }
+
+  }
+
+}
Propchange: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/tokenize/DummyTokenizerFactory.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java?rev=1304647&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java (added)
+++ opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java Fri Mar 23 23:00:12 2012
@@ -0,0 +1,226 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.regex.Pattern;
+
+import opennlp.tools.dictionary.Dictionary;
+import opennlp.tools.tokenize.DummyTokenizerFactory.DummyContextGenerator;
+import opennlp.tools.tokenize.DummyTokenizerFactory.DummyDictionary;
+import opennlp.tools.tokenize.lang.Factory;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.TrainingParameters;
+
+import org.junit.Test;
+
+/**
+ * Tests for the {@link TokenizerFactory} class.
+ */
+public class TokenizerFactoryTest {
+
+ private static ObjectStream<TokenSample> createSampleStream()
+ throws IOException {
+ InputStream in = TokenizerFactoryTest.class.getClassLoader()
+ .getResourceAsStream("opennlp/tools/tokenize/token.train");
+
+ return new TokenSampleStream(new PlainTextByLineStream(
+ new InputStreamReader(in)));
+ }
+
+ private static TokenizerModel train(TokenizerFactory factory)
+ throws IOException {
+ return TokenizerME.train(factory.getLanguageCode(), createSampleStream(),
+ factory, TrainingParameters.defaultParams());
+ }
+
+ static Dictionary loadAbbDictionary() throws IOException {
+ InputStream in = TokenizerFactoryTest.class.getClassLoader()
+ .getResourceAsStream("opennlp/tools/sentdetect/abb.xml");
+
+ return new Dictionary(in);
+ }
+
+ @Test
+ public void testDefault() throws IOException {
+
+ Dictionary dic = loadAbbDictionary();
+ final String lang = "es";
+
+ TokenizerModel model = train(new TokenizerFactory(lang, dic, false, null));
+
+ TokenizerFactory factory = model.getFactory();
+ assertTrue(factory.getAbbreviationDictionary() instanceof Dictionary);
+ assertTrue(factory.getContextGenerator() instanceof DefaultTokenContextGenerator);
+
+ assertEquals(Factory.DEFAULT_ALPHANUMERIC, factory.getAlphaNumericPattern()
+ .pattern());
+ assertEquals(lang, factory.getLanguageCode());
+ assertEquals(lang, model.getLanguage());
+ assertFalse(factory.isUseAlphaNumericOptmization());
+
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ model.serialize(out);
+ ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
+
+ TokenizerModel fromSerialized = new TokenizerModel(in);
+
+ factory = fromSerialized.getFactory();
+ assertTrue(factory.getAbbreviationDictionary() instanceof Dictionary);
+ assertTrue(factory.getContextGenerator() instanceof DefaultTokenContextGenerator);
+
+ assertEquals(Factory.DEFAULT_ALPHANUMERIC, factory.getAlphaNumericPattern()
+ .pattern());
+ assertEquals(lang, factory.getLanguageCode());
+ assertEquals(lang, model.getLanguage());
+ assertFalse(factory.isUseAlphaNumericOptmization());
+ }
+
+ @Test
+ public void testNullDict() throws IOException {
+
+ Dictionary dic = null;
+ final String lang = "es";
+
+ TokenizerModel model = train(new TokenizerFactory(lang, dic, false, null));
+
+ TokenizerFactory factory = model.getFactory();
+ assertNull(factory.getAbbreviationDictionary());
+ assertTrue(factory.getContextGenerator() instanceof DefaultTokenContextGenerator);
+
+ assertEquals(Factory.DEFAULT_ALPHANUMERIC, factory.getAlphaNumericPattern()
+ .pattern());
+ assertEquals(lang, factory.getLanguageCode());
+ assertEquals(lang, model.getLanguage());
+ assertFalse(factory.isUseAlphaNumericOptmization());
+
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ model.serialize(out);
+ ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
+
+ TokenizerModel fromSerialized = new TokenizerModel(in);
+
+ factory = fromSerialized.getFactory();
+ assertNull(factory.getAbbreviationDictionary());
+ assertTrue(factory.getContextGenerator() instanceof DefaultTokenContextGenerator);
+
+ assertEquals(Factory.DEFAULT_ALPHANUMERIC, factory.getAlphaNumericPattern()
+ .pattern());
+ assertEquals(lang, factory.getLanguageCode());
+ assertEquals(lang, model.getLanguage());
+ assertFalse(factory.isUseAlphaNumericOptmization());
+ }
+
+ @Test
+ public void testCustomPatternAndAlphaOpt() throws IOException {
+
+ Dictionary dic = null;
+ final String lang = "es";
+ String pattern = "^[0-9A-Za-z]+$";
+
+ TokenizerModel model = train(new TokenizerFactory(lang, dic, true,
+ Pattern.compile(pattern)));
+
+ TokenizerFactory factory = model.getFactory();
+ assertNull(factory.getAbbreviationDictionary());
+ assertTrue(factory.getContextGenerator() instanceof DefaultTokenContextGenerator);
+
+ assertEquals(pattern, factory.getAlphaNumericPattern().pattern());
+ assertEquals(lang, factory.getLanguageCode());
+ assertEquals(lang, model.getLanguage());
+ assertTrue(factory.isUseAlphaNumericOptmization());
+
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ model.serialize(out);
+ ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
+
+ TokenizerModel fromSerialized = new TokenizerModel(in);
+
+ factory = fromSerialized.getFactory();
+ assertNull(factory.getAbbreviationDictionary());
+ assertTrue(factory.getContextGenerator() instanceof DefaultTokenContextGenerator);
+
+ assertEquals(pattern, factory.getAlphaNumericPattern().pattern());
+ assertEquals(lang, factory.getLanguageCode());
+ assertEquals(lang, model.getLanguage());
+ assertTrue(factory.isUseAlphaNumericOptmization());
+ }
+
+ @Test
+ public void testDummyFactory() throws IOException {
+
+ Dictionary dic = loadAbbDictionary();
+ final String lang = "es";
+ String pattern = "^[0-9A-Za-z]+$";
+
+ TokenizerModel model = train(new DummyTokenizerFactory(lang, dic, true,
+ Pattern.compile(pattern)));
+
+ TokenizerFactory factory = model.getFactory();
+ assertTrue(factory.getAbbreviationDictionary() instanceof DummyDictionary);
+ assertTrue(factory.getContextGenerator() instanceof DummyContextGenerator);
+
+ assertEquals(pattern, factory.getAlphaNumericPattern().pattern());
+ assertEquals(lang, factory.getLanguageCode());
+ assertEquals(lang, model.getLanguage());
+ assertTrue(factory.isUseAlphaNumericOptmization());
+
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ model.serialize(out);
+ ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
+
+ TokenizerModel fromSerialized = new TokenizerModel(in);
+
+ factory = fromSerialized.getFactory();
+ assertTrue(factory.getAbbreviationDictionary() instanceof DummyDictionary);
+ assertTrue(factory.getContextGenerator() instanceof DummyContextGenerator);
+
+ assertEquals(pattern, factory.getAlphaNumericPattern().pattern());
+ assertEquals(lang, factory.getLanguageCode());
+ assertEquals(lang, model.getLanguage());
+ assertTrue(factory.isUseAlphaNumericOptmization());
+ }
+
+ @Test
+ public void testCreateDummyFactory() throws IOException {
+ Dictionary dic = loadAbbDictionary();
+ final String lang = "es";
+ String pattern = "^[0-9A-Za-z]+$";
+
+ TokenizerFactory factory = TokenizerFactory.create(
+ DummyTokenizerFactory.class.getCanonicalName(), lang, dic, true,
+ Pattern.compile(pattern));
+
+ assertTrue(factory.getAbbreviationDictionary() instanceof DummyDictionary);
+ assertTrue(factory.getContextGenerator() instanceof DummyContextGenerator);
+
+ assertEquals(pattern, factory.getAlphaNumericPattern().pattern());
+ assertEquals(lang, factory.getLanguageCode());
+ assertTrue(factory.isUseAlphaNumericOptmization());
+ }
+}
Propchange: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java
------------------------------------------------------------------------------
svn:mime-type = text/plain