You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/05/25 11:07:56 UTC
svn commit: r1127443 -
/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderModel.java
Author: joern
Date: Wed May 25 09:07:56 2011
New Revision: 1127443
URL: http://svn.apache.org/viewvc?rev=1127443&view=rev
Log:
OPENNLP-17 Added support for custom feature generator
Modified:
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderModel.java
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderModel.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderModel.java?rev=1127443&r1=1127442&r2=1127443&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderModel.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderModel.java Wed May 25 09:07:56 2011
@@ -18,17 +18,24 @@
package opennlp.tools.namefind;
+import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.io.OutputStream;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.List;
import java.util.Map;
import opennlp.model.AbstractModel;
import opennlp.model.MaxentModel;
import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
+import opennlp.tools.util.featuregen.FeatureGeneratorResourceProvider;
+import opennlp.tools.util.featuregen.GeneratorFactory;
import opennlp.tools.util.model.ArtifactSerializer;
import opennlp.tools.util.model.BaseModel;
+import opennlp.tools.util.model.ModelUtil;
/**
* The {@link TokenNameFinderModel} is the model used
@@ -38,11 +45,32 @@ import opennlp.tools.util.model.BaseMode
*/
public class TokenNameFinderModel extends BaseModel {
+  /**
+   * Unchecked exception which is thrown when the feature generator cannot be
+   * created from the generator descriptor stored in the model.
+   */
+  public static class FeatureGeneratorCreationError extends RuntimeException {
+
+    // RuntimeException is Serializable, so the stream version is declared
+    // explicitly instead of relying on the compiler-dependent default.
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * @param t the failure which prevented the creation of the feature generator
+     */
+    FeatureGeneratorCreationError(Throwable t) {
+      super(t);
+    }
+  }
+
+ private static class ByteArraySerializer implements ArtifactSerializer<byte[]> {
+
+ public byte[] create(InputStream in) throws IOException,
+ InvalidFormatException {
+
+ return ModelUtil.read(in);
+ }
+
+ public void serialize(byte[] artifact, OutputStream out) throws IOException {
+ out.write(artifact);
+ }
+ }
+
private static final String COMPONENT_NAME = "NameFinderME";
private static final String MAXENT_MODEL_ENTRY_NAME = "nameFinder.model";
-
+
+ private static final String GENERATOR_DESCRIPTOR_ENTRY_NAME = "generator.featuregen";
+
public TokenNameFinderModel(String languageCode, AbstractModel nameFinderModel,
- Map<String, Object> resources, Map<String, String> manifestInfoEntries) {
+ byte[] generatorDescriptor, Map<String, Object> resources, Map<String, String> manifestInfoEntries) {
super(COMPONENT_NAME, languageCode, manifestInfoEntries);
@@ -52,9 +80,14 @@ public class TokenNameFinderModel extend
artifactMap.put(MAXENT_MODEL_ENTRY_NAME, nameFinderModel);
+ // TODO: Null check ?!
+ if (generatorDescriptor != null && generatorDescriptor.length > 0)
+ artifactMap.put(GENERATOR_DESCRIPTOR_ENTRY_NAME, generatorDescriptor);
+
// The resource map must not contain key which are already taken
// like the name finder maxent model name
- if (resources.containsKey(MAXENT_MODEL_ENTRY_NAME)) {
+ if (resources.containsKey(MAXENT_MODEL_ENTRY_NAME) ||
+ resources.containsKey(GENERATOR_DESCRIPTOR_ENTRY_NAME)) {
throw new IllegalArgumentException();
}
@@ -63,6 +96,11 @@ public class TokenNameFinderModel extend
artifactMap.putAll(resources);
}
+  /**
+   * Creates a model without a generator descriptor. Delegates to the
+   * constructor which accepts a descriptor and passes <code>null</code> for it.
+   */
+  public TokenNameFinderModel(String languageCode, AbstractModel nameFinderModel,
+      Map<String, Object> resources, Map<String, String> manifestInfoEntries) {
+    this(languageCode, nameFinderModel, (byte[]) null, resources, manifestInfoEntries);
+  }
+
public TokenNameFinderModel(InputStream in) throws IOException, InvalidFormatException {
super(COMPONENT_NAME, in);
}
@@ -76,6 +114,67 @@ public class TokenNameFinderModel extend
return (AbstractModel) artifactMap.get(MAXENT_MODEL_ENTRY_NAME);
}
+ /**
+ * Creates the {@link AdaptiveFeatureGenerator}. Usually this
+ * is a set of generators contained in the {@link AggregatedFeatureGenerator}.
+ *
+ * Note:
+ * The generators are created on every call to this method.
+ *
+ * @return the feature generator or null if there is no descriptor in the model
+ */
+ public AdaptiveFeatureGenerator createFeatureGenerators() {
+
+ byte descriptorBytes[] = (byte[]) artifactMap.get(GENERATOR_DESCRIPTOR_ENTRY_NAME);
+
+ if (descriptorBytes != null) {
+ InputStream descriptorIn = new ByteArrayInputStream(descriptorBytes);
+
+ AdaptiveFeatureGenerator generator = null;
+ try {
+ generator = GeneratorFactory.create(descriptorIn, new FeatureGeneratorResourceProvider() {
+
+ public Object getResource(String key) {
+ return artifactMap.get(key);
+ }
+ });
+ } catch (InvalidFormatException e) {
+ // It is assumed that the creation of the feature generation does not
+ // fail after it succeeded once during model loading.
+
+ // But it might still be possible that such an exception is thrown,
+ // in this case the caller should not be forced to handle the exception
+ // and a Runtime Exception is thrown instead.
+
+ // If the re-creation of the feature generation fails it is assumed
+ // that this can only be caused by a programming mistake and therefore
+ // throwing a Runtime Exception is reasonable
+
+ throw new FeatureGeneratorCreationError(e);
+ } catch (IOException e) {
+ throw new IllegalStateException("Reading from mem cannot result in an I/O error");
+ }
+
+ return generator;
+ }
+ else {
+ return null;
+ }
+ }
+
+  /**
+   * Creates a new model which contains the artifacts of this model, with
+   * the feature generator descriptor replaced by the given one.
+   *
+   * @param descriptor the new generator descriptor; if it is null or empty
+   *     the new model contains no generator descriptor, which matches how
+   *     the constructor treats a null or empty descriptor
+   *
+   * @return the newly created model
+   */
+  public TokenNameFinderModel updateFeatureGenerator(byte[] descriptor) {
+
+    TokenNameFinderModel model = new TokenNameFinderModel(getLanguage(), getNameFinderModel(),
+        descriptor, Collections.<String, Object>emptyMap(), Collections.<String, String>emptyMap());
+
+    // TODO: Not so nice!
+    // The artifacts set up by the constructor are discarded and replaced
+    // with a copy of the artifacts of this model.
+    model.artifactMap.clear();
+    model.artifactMap.putAll(artifactMap);
+
+    if (descriptor != null && descriptor.length > 0) {
+      model.artifactMap.put(GENERATOR_DESCRIPTOR_ENTRY_NAME, descriptor);
+    }
+    else {
+      // Never store a null or empty descriptor: a null value would later be
+      // handed to the byte array serializer, and the copied entry of this
+      // model must not survive either.
+      model.artifactMap.remove(GENERATOR_DESCRIPTOR_ENTRY_NAME);
+    }
+
+    return model;
+  }
+
// TODO: Write test for this method
public static boolean isModelValid(MaxentModel model) {
@@ -119,6 +218,21 @@ public class TokenNameFinderModel extend
@Override
protected void createArtifactSerializers(Map<String, ArtifactSerializer> serializers) {
super.createArtifactSerializers(serializers);
+
+ serializers.put("featuregen", new ByteArraySerializer());
+ }
+
+  /**
+   * Creates the serializer map returned by
+   * {@link BaseModel#createArtifactSerializers()} and adds the byte array
+   * serializer under the "featuregen" key.
+   *
+   * @return the serializer map
+   */
+  public static Map<String, ArtifactSerializer> createArtifactSerializers() {
+
+    // TODO: Not so nice, the registration is duplicated because this code
+    // cannot be shared with the other create serializer method.
+    // Needs a redesign: static access to the default serializers is required,
+    // and it should be possible to extend them during runtime ?!
+
+    Map<String, ArtifactSerializer> defaultSerializers = BaseModel.createArtifactSerializers();
+    defaultSerializers.put("featuregen", new ByteArraySerializer());
+
+    return defaultSerializers;
}
protected void validateArtifactMap() throws InvalidFormatException {