You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ma...@apache.org on 2013/12/11 14:37:36 UTC
svn commit: r1550134 - in
/opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder:
./ impls/
Author: markg
Date: Wed Dec 11 13:37:36 2013
New Revision: 1550134
URL: http://svn.apache.org/r1550134
Log:
OPENNLP-607
Fixed many issues. Added default file-based impls for all interfaces, and created a util class wrapper to allow for easy use of the default implementations.
Added:
opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/DefaultModelBuilderUtil.java
opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/KnownEntityProvider.java
- copied, changed from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/KnownEntityProvider.java
opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/ModelGenerationValidator.java
- copied, changed from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelGenerationValidator.java
opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/ModelParameter.java
- copied, changed from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelParameter.java
opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/Modelable.java
- copied, changed from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/Modelable.java
opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/SemiSupervisedModelGenerator.java
- copied, changed from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SemiSupervisedModelGenerator.java
opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/SentenceProvider.java
- copied, changed from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SentenceProvider.java
opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/
opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/BaseModelBuilderParams.java
opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileKnownEntityProvider.java
- copied, changed from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileKnownEntityProvider.java
opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileModelValidatorImpl.java
- copied, changed from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileModelValidatorImpl.java
opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileSentenceProvider.java
- copied, changed from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileSentenceProvider.java
opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/GenericModelGenerator.java
- copied, changed from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/GenericModelGenerator.java
opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/GenericModelableImpl.java
- copied, changed from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/ModelableImpl.java
Added: opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/DefaultModelBuilderUtil.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/DefaultModelBuilderUtil.java?rev=1550134&view=auto
==============================================================================
--- opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/DefaultModelBuilderUtil.java (added)
+++ opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/DefaultModelBuilderUtil.java Wed Dec 11 13:37:36 2013
@@ -0,0 +1,117 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.modelbuilder;
+
+import java.io.File;
+import opennlp.modelbuilder.impls.BaseModelBuilderParams;
+import opennlp.modelbuilder.impls.FileKnownEntityProvider;
+import opennlp.modelbuilder.impls.FileModelValidatorImpl;
+import opennlp.modelbuilder.impls.FileSentenceProvider;
+import opennlp.modelbuilder.impls.GenericModelGenerator;
+import opennlp.modelbuilder.impls.GenericModelableImpl;
+
+/**
+ *
+ * Utilizes the file-based implementations to produce an NER model from
+ * user-defined data. The basic processing is as follows:
+ * read in the list of known entities
+ * annotate the sentences based on the list of known entities
+ * create a model from the annotations
+ * perform NER with the model on the sentences
+ * add the NER results to the annotations
+ * rebuild the model
+ * loop
+ * (repeated for the requested number of iterations)
+ */
+public class DefaultModelBuilderUtil {
+
+ /**
+ * Wires the default file-based implementations together and runs the model generator.
+ * @param sentences a file that contains one sentence per line.
+ * There should be at least 15K sentences
+ * consisting of a representative sample from
+ * user data
+ * @param knownEntities a file consisting of a simple list of
+ * unambiguous entities, one entry per line.
+ * For instance, if one was trying to build a
+ * person NER model then this file would be a
+ * list of person names that are unambiguous
+ * and are known to exist in the sentences
+ * file
+ * @param knownEntitiesBlacklist This file contains a list of known bad hits
+ * that the NER phase of this processing might
+ * catch early on, before the model iterates
+ * to maturity
+ * @param modelOutFile the location where the model will be
+ * written to
+ * @param annotatedSentenceOutFile where the annotated sentences produced by
+ * this process will be written to
+ * @param namedEntityType the type of entity... for example, person,
+ * location, organization...
+ * @param iterations how many times to repeat the iterative loop
+ * of annotation, model generation, and NER
+ */
+ public static void generateModel(File sentences, File knownEntities, File knownEntitiesBlacklist,
+ File modelOutFile, File annotatedSentenceOutFile, String namedEntityType, int iterations) {
+ SemiSupervisedModelGenerator modelGenerator = new GenericModelGenerator(); // NOTE(review): setParameters(params) is never called on the generator in this method - confirm intentional
+ BaseModelBuilderParams params = new BaseModelBuilderParams();
+ params.setAnnotatedTrainingDataFile(annotatedSentenceOutFile);
+ params.setSentenceFile(sentences);
+ params.setEntityType(namedEntityType);
+ params.setKnownEntitiesFile(knownEntities);
+ params.setModelFile(modelOutFile);
+ params.setKnownEntityBlacklist(knownEntitiesBlacklist);
+ /**
+ * Sentence providers feed this process with sentences derived from user
+ * data; this impl just reads line by line through a file.
+ */
+ SentenceProvider sentenceProvider = new FileSentenceProvider();
+ sentenceProvider.setParameters(params);
+ /**
+ * KnownEntityProviders provide a seed list of known entities... such as
+ * Barack Obama for person, or Germany for location. Obviously these would
+ * want to be prolific, unambiguous names.
+ */
+ KnownEntityProvider knownEntityProvider = new FileKnownEntityProvider();
+ knownEntityProvider.setParameters(params);
+ /**
+ * ModelGenerationValidators try to weed out bad hits by the iterations of
+ * the name finder. Since this is a recursive process, with each iteration
+ * the namefinder will get more and more greedy if bad entities are allowed
+ * in; this provides a mechanism for throwing out obviously bad hits. A good
+ * impl may be to make sure a location is actually within a noun phrase
+ * etc... users can make this as specific as they need for their data and
+ * their use case.
+ */
+ ModelGenerationValidator validator = new FileModelValidatorImpl();
+ validator.setParameters(params);
+ /**
+ * Modelables write and read the annotated sentences, as well as create and
+ * write the NER models.
+ */
+ Modelable modelable = new GenericModelableImpl();
+ modelable.setParameters(params);
+
+ /**
+ * The modelGenerator actually runs the process with a set number of
+ * iterations... could be better by actually calculating the diff between
+ * runs and stopping based on a threshold, but for extremely large sentence
+ * sets this may be too much.
+ */
+ modelGenerator.build(sentenceProvider, knownEntityProvider, validator, modelable, iterations);
+
+ }
+}
Copied: opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/KnownEntityProvider.java (from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/KnownEntityProvider.java)
URL: http://svn.apache.org/viewvc/opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/KnownEntityProvider.java?p2=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/KnownEntityProvider.java&p1=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/KnownEntityProvider.java&r1=1544512&r2=1550134&rev=1550134&view=diff
==============================================================================
--- opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/KnownEntityProvider.java (original)
+++ opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/KnownEntityProvider.java Wed Dec 11 13:37:36 2013
@@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package opennlp.modelbuilder.v2;
+package opennlp.modelbuilder;
import java.util.Set;
Copied: opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/ModelGenerationValidator.java (from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelGenerationValidator.java)
URL: http://svn.apache.org/viewvc/opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/ModelGenerationValidator.java?p2=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/ModelGenerationValidator.java&p1=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelGenerationValidator.java&r1=1544512&r2=1550134&rev=1550134&view=diff
==============================================================================
--- opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelGenerationValidator.java (original)
+++ opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/ModelGenerationValidator.java Wed Dec 11 13:37:36 2013
@@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package opennlp.modelbuilder.v2;
+package opennlp.modelbuilder;
import java.util.Collection;
Copied: opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/ModelParameter.java (from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelParameter.java)
URL: http://svn.apache.org/viewvc/opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/ModelParameter.java?p2=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/ModelParameter.java&p1=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelParameter.java&r1=1544512&r2=1550134&rev=1550134&view=diff
==============================================================================
--- opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelParameter.java (original)
+++ opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/ModelParameter.java Wed Dec 11 13:37:36 2013
@@ -13,14 +13,16 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package opennlp.modelbuilder.v2;
+package opennlp.modelbuilder;
-import java.util.Map;
+import opennlp.modelbuilder.impls.BaseModelBuilderParams;
/**
*
*/
-public interface ModelParameter {
+public interface ModelParameter<T extends BaseModelBuilderParams>{
- void setParameters(Map<String, String> params);
+ void setParameters(T params);
+
+
}
Copied: opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/Modelable.java (from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/Modelable.java)
URL: http://svn.apache.org/viewvc/opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/Modelable.java?p2=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/Modelable.java&p1=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/Modelable.java&r1=1544512&r2=1550134&rev=1550134&view=diff
==============================================================================
--- opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/Modelable.java (original)
+++ opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/Modelable.java Wed Dec 11 13:37:36 2013
@@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package opennlp.modelbuilder.v2;
+package opennlp.modelbuilder;
import java.util.Set;
import opennlp.tools.namefind.TokenNameFinderModel;
Copied: opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/SemiSupervisedModelGenerator.java (from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SemiSupervisedModelGenerator.java)
URL: http://svn.apache.org/viewvc/opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/SemiSupervisedModelGenerator.java?p2=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/SemiSupervisedModelGenerator.java&p1=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SemiSupervisedModelGenerator.java&r1=1544512&r2=1550134&rev=1550134&view=diff
==============================================================================
--- opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SemiSupervisedModelGenerator.java (original)
+++ opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/SemiSupervisedModelGenerator.java Wed Dec 11 13:37:36 2013
@@ -13,13 +13,15 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package opennlp.modelbuilder.v2;
+package opennlp.modelbuilder;
+
+import opennlp.modelbuilder.impls.BaseModelBuilderParams;
/**
*
*/
-public interface SemiSupervisedModelGenerator extends ModelParameter {
+public interface SemiSupervisedModelGenerator extends ModelParameter<BaseModelBuilderParams> {
void build(SentenceProvider sentenceProvider, KnownEntityProvider knownEntityProvider,
ModelGenerationValidator validator, Modelable modelable, int iterations);
Copied: opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/SentenceProvider.java (from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SentenceProvider.java)
URL: http://svn.apache.org/viewvc/opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/SentenceProvider.java?p2=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/SentenceProvider.java&p1=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SentenceProvider.java&r1=1544512&r2=1550134&rev=1550134&view=diff
==============================================================================
--- opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SentenceProvider.java (original)
+++ opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/SentenceProvider.java Wed Dec 11 13:37:36 2013
@@ -13,14 +13,15 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package opennlp.modelbuilder.v2;
+package opennlp.modelbuilder;
import java.util.Set;
+import opennlp.modelbuilder.impls.BaseModelBuilderParams;
/**
*
*/
-public interface SentenceProvider extends ModelParameter {
+public interface SentenceProvider extends ModelParameter<BaseModelBuilderParams> {
Set<String> getSentences();
}
Added: opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/BaseModelBuilderParams.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/BaseModelBuilderParams.java?rev=1550134&view=auto
==============================================================================
--- opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/BaseModelBuilderParams.java (added)
+++ opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/BaseModelBuilderParams.java Wed Dec 11 13:37:36 2013
@@ -0,0 +1,90 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.modelbuilder.impls;
+
+import java.io.File;
+import java.util.Map;
+
+/**
+ * Mutable holder for the files and settings handed to each model-builder
+ * component through setParameters; a plain bean that performs no validation.
+ */
+public class BaseModelBuilderParams {
+
+ private File modelFile; // DefaultModelBuilderUtil stores its modelOutFile argument (model output location) here
+ private File sentenceFile; // input sentences; opened by FileSentenceProvider
+ private File knownEntitiesFile; // seed entity list, one entry per line; opened by FileKnownEntityProvider
+ private File knownEntityBlacklist; // bad-hit list; FileModelValidatorImpl checks this for null before reading
+ private File annotatedTrainingDataFile; // DefaultModelBuilderUtil stores its annotatedSentenceOutFile argument here
+ private String entityType; // returned by FileKnownEntityProvider.getKnownEntitiesType()
+ private Map<String, String> additionalParams; // no default: stays null until setAdditionalParams is called
+
+ public File getModelFile() {
+ return modelFile;
+ }
+
+ public void setModelFile(File modelFile) {
+ this.modelFile = modelFile;
+ }
+
+ public File getSentenceFile() {
+ return sentenceFile;
+ }
+
+ public void setSentenceFile(File sentenceFile) {
+ this.sentenceFile = sentenceFile;
+ }
+
+ public File getKnownEntitiesFile() {
+ return knownEntitiesFile;
+ }
+
+ public void setKnownEntitiesFile(File knownEntitiesFile) {
+ this.knownEntitiesFile = knownEntitiesFile;
+ }
+
+ public File getKnownEntityBlacklist() {
+ return knownEntityBlacklist;
+ }
+
+ public void setKnownEntityBlacklist(File knownEntityBlacklist) {
+ this.knownEntityBlacklist = knownEntityBlacklist;
+ }
+
+ public Map<String, String> getAdditionalParams() {
+ return additionalParams; // returns the stored reference itself (no copy); null if never set
+ }
+
+ public void setAdditionalParams(Map<String, String> additionalParams) {
+ this.additionalParams = additionalParams;
+ }
+
+ public String getEntityType() {
+ return entityType;
+ }
+
+ public void setEntityType(String entityType) {
+ this.entityType = entityType;
+ }
+
+ public File getAnnotatedTrainingDataFile() {
+ return annotatedTrainingDataFile;
+ }
+
+ public void setAnnotatedTrainingDataFile(File annotatedTrainingDataFile) {
+ this.annotatedTrainingDataFile = annotatedTrainingDataFile;
+ }
+}
\ No newline at end of file
Copied: opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileKnownEntityProvider.java (from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileKnownEntityProvider.java)
URL: http://svn.apache.org/viewvc/opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileKnownEntityProvider.java?p2=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileKnownEntityProvider.java&p1=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileKnownEntityProvider.java&r1=1544512&r2=1550134&rev=1550134&view=diff
==============================================================================
--- opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileKnownEntityProvider.java (original)
+++ opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileKnownEntityProvider.java Wed Dec 11 13:37:36 2013
@@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package opennlp.modelbuilder.v2.impls;
+package opennlp.modelbuilder.impls;
import java.io.BufferedReader;
import java.io.FileInputStream;
@@ -22,21 +22,19 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
-import java.util.HashMap;
import java.util.HashSet;
-import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
-import opennlp.modelbuilder.v2.KnownEntityProvider;
+import opennlp.modelbuilder.KnownEntityProvider;
/**
*
*/
public class FileKnownEntityProvider implements KnownEntityProvider {
- private Map<String, String> params = new HashMap<String, String>();
+
Set<String> knownEntities = new HashSet<String>();
-
+ BaseModelBuilderParams params;
@Override
public Set<String> getKnownEntities() {
if (knownEntities.isEmpty()) {
@@ -45,10 +43,10 @@ public class FileKnownEntityProvider imp
BufferedReader br;
String line;
- fis = new FileInputStream(params.get("knownentityfile"));
+ fis = new FileInputStream(params.getKnownEntitiesFile());
br = new BufferedReader(new InputStreamReader(fis, Charset.forName("UTF-8")));
while ((line = br.readLine()) != null) {
- knownEntities.add(line.split("\t")[2]);
+ knownEntities.add(line);
}
// Done with the file
@@ -72,13 +70,13 @@ public class FileKnownEntityProvider imp
@Override
public String getKnownEntitiesType() {
- return params.get("knownentitytype");
+ return params.getEntityType();
}
@Override
- public void setParameters(Map<String, String> params) {
+ public void setParameters(BaseModelBuilderParams params) {
this.params = params;
}
}
Copied: opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileModelValidatorImpl.java (from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileModelValidatorImpl.java)
URL: http://svn.apache.org/viewvc/opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileModelValidatorImpl.java?p2=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileModelValidatorImpl.java&p1=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileModelValidatorImpl.java&r1=1544512&r2=1550134&rev=1550134&view=diff
==============================================================================
--- opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileModelValidatorImpl.java (original)
+++ opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileModelValidatorImpl.java Wed Dec 11 13:37:36 2013
@@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package opennlp.modelbuilder.v2.impls;
+package opennlp.modelbuilder.impls;
import java.io.BufferedReader;
import java.io.FileInputStream;
@@ -23,27 +23,22 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.Collection;
-import java.util.HashMap;
import java.util.HashSet;
-import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
-import java.util.regex.Pattern;
-import opennlp.modelbuilder.v2.ModelGenerationValidator;
+import opennlp.modelbuilder.ModelGenerationValidator;
/**
- *
+ *Validates NER results input before inclusion into the model
*/
public class FileModelValidatorImpl implements ModelGenerationValidator {
private Set<String> badentities = new HashSet<String>();
- private final double MIN_SCORE_FOR_TRAINING = 0.95d;
- private Object validationData;
- private Map<String, String> params = new HashMap<String, String>();
+ BaseModelBuilderParams params;
@Override
- public void setParameters(Map<String, String> params) {
+ public void setParameters(BaseModelBuilderParams params) {
this.params = params;
}
@@ -59,11 +54,11 @@ public class FileModelValidatorImpl impl
if (badentities.isEmpty()) {
getBlackList();
}
-
- Pattern p = Pattern.compile("[0-9]", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
- if (p.matcher(namedEntity).find()) {
- return false;
- }
+//
+// Pattern p = Pattern.compile("[0-9]", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
+// if (p.matcher(namedEntity).find()) {
+// return false;
+// }
Boolean b = true;
if (badentities.contains(namedEntity.toLowerCase())) {
b = false;
@@ -73,17 +68,20 @@ public class FileModelValidatorImpl impl
@Override
public Collection<String> getBlackList() {
+ if (params.getKnownEntityBlacklist() == null) {
+ return badentities;
+ }
if (!badentities.isEmpty()) {
try {
InputStream fis;
BufferedReader br;
String line;
- fis = new FileInputStream(params.get("blacklistfile"));
+ fis = new FileInputStream(params.getKnownEntityBlacklist());
br = new BufferedReader(new InputStreamReader(fis, Charset.forName("UTF-8")));
while ((line = br.readLine()) != null) {
badentities.add(line);
- }
+ }
br.close();
br = null;
fis = null;
Copied: opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileSentenceProvider.java (from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileSentenceProvider.java)
URL: http://svn.apache.org/viewvc/opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileSentenceProvider.java?p2=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileSentenceProvider.java&p1=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileSentenceProvider.java&r1=1544512&r2=1550134&rev=1550134&view=diff
==============================================================================
--- opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileSentenceProvider.java (original)
+++ opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileSentenceProvider.java Wed Dec 11 13:37:36 2013
@@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package opennlp.modelbuilder.v2.impls;
+package opennlp.modelbuilder.impls;
import java.io.BufferedReader;
import java.io.FileInputStream;
@@ -22,20 +22,18 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
-import java.util.HashMap;
import java.util.HashSet;
-import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
-import opennlp.modelbuilder.v2.SentenceProvider;
+import opennlp.modelbuilder.SentenceProvider;
/**
- *
+ * Provides user sentences via a simple text file
*/
public class FileSentenceProvider implements SentenceProvider {
- private Map<String, String> params = new HashMap<String, String>();
+ BaseModelBuilderParams params ;
Set<String> sentences = new HashSet<String>();
public Set<String> getSentences() {
@@ -45,7 +43,7 @@ public class FileSentenceProvider implem
BufferedReader br;
String line;
- fis = new FileInputStream(params.get("sentencesfile"));
+ fis = new FileInputStream(params.getSentenceFile());
br = new BufferedReader(new InputStreamReader(fis, Charset.forName("UTF-8")));
int i=0;
while ((line = br.readLine()) != null) {
@@ -66,7 +64,7 @@ public class FileSentenceProvider implem
return sentences;
}
- public void setParameters(Map<String, String> params) {
+ public void setParameters(BaseModelBuilderParams params) {
this.params = params;
}
}
Copied: opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/GenericModelGenerator.java (from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/GenericModelGenerator.java)
URL: http://svn.apache.org/viewvc/opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/GenericModelGenerator.java?p2=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/GenericModelGenerator.java&p1=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/GenericModelGenerator.java&r1=1544512&r2=1550134&rev=1550134&view=diff
==============================================================================
--- opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/GenericModelGenerator.java (original)
+++ opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/GenericModelGenerator.java Wed Dec 11 13:37:36 2013
@@ -13,24 +13,31 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package opennlp.modelbuilder.v2;
+package opennlp.modelbuilder.impls;
import java.util.HashMap;
import java.util.Map;
+import opennlp.modelbuilder.KnownEntityProvider;
+import opennlp.modelbuilder.ModelGenerationValidator;
+import opennlp.modelbuilder.Modelable;
+import opennlp.modelbuilder.SemiSupervisedModelGenerator;
+import opennlp.modelbuilder.SentenceProvider;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.util.Span;
/**
*
- *Generic impl
+ * Generic impl that handles all processing using the default file implementations
*/
-public class GenericModelGenerator implements SemiSupervisedModelGenerator{
- private Map<String, String> params = new HashMap<String, String>();
+public class GenericModelGenerator implements SemiSupervisedModelGenerator {
+
+ private Map<String, String> params = new HashMap<String, String>();
@Override
- public void setParameters(Map<String, String> params) {
- this.params = params;
+ public void setParameters(BaseModelBuilderParams params) {
+ this.params = params.getAdditionalParams();
}
+
@Override
public void build(SentenceProvider sentenceProvider, KnownEntityProvider knownEntityProvider,
ModelGenerationValidator validator, Modelable modelable, int iterations) {
@@ -47,12 +54,23 @@ public class GenericModelGenerator imple
}
}
}
+ if (sentenceProvider.getSentences().isEmpty()) {
+ System.out.println("No sentences in file");
+ return;
+ }
+ if (knownEntityProvider.getKnownEntities().isEmpty()) {
+ System.out.println("No known entities in file");
+ return;
+ }
System.out.println("\t\twriting annotated sentences....: ");
modelable.writeAnnotatedSentences();
+ System.out.println("\t\tbuilding model.... ");
modelable.buildModel(knownEntityProvider.getKnownEntitiesType());
+ System.out.println("\t\tmodel building complete.... ");
NameFinderME nf = new NameFinderME(modelable.getModel());
System.out.println("\t\tannotated sentences: " + modelable.getAnnotatedSentences().size());
- System.out.println("\tPerforming NER");
+ System.out.println("\tPerforming NER with new model");
+ System.out.println("\t\tPrinting NER Results. Add undesired results to the blacklist file and start over");
for (String sentence : sentenceProvider.getSentences()) {
if (!validator.validSentence(sentence)) {
continue;
@@ -65,10 +83,14 @@ public class GenericModelGenerator imple
String[] namedEntities = Span.spansToStrings(find, tokens);
for (String namedEntity : namedEntities) {
+ System.out.println("\t\t" + namedEntity);
if (validator.validNamedEntity(namedEntity)) {
+
knownEntityProvider.addKnownEntity(namedEntity);
modelable.addAnnotatedSentence(modelable.annotate(sentence, namedEntity, knownEntityProvider.getKnownEntitiesType()));
+ } else {
+ System.out.println("\t\t" + namedEntity + "...already blacklisted");
}
}
}
Copied: opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/GenericModelableImpl.java (from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/ModelableImpl.java)
URL: http://svn.apache.org/viewvc/opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/GenericModelableImpl.java?p2=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/GenericModelableImpl.java&p1=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/ModelableImpl.java&r1=1544512&r2=1550134&rev=1550134&view=diff
==============================================================================
--- opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/ModelableImpl.java (original)
+++ opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/GenericModelableImpl.java Wed Dec 11 13:37:36 2013
@@ -13,57 +13,43 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package opennlp.modelbuilder.v2.impls;
+package opennlp.modelbuilder.impls;
import java.io.BufferedOutputStream;
-import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.Charset;
-import java.util.HashMap;
import java.util.HashSet;
-import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
-import opennlp.modelbuilder.v2.Modelable;
+import opennlp.modelbuilder.Modelable;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.namefind.NameSampleDataStream;
import opennlp.tools.namefind.TokenNameFinderModel;
-import opennlp.tools.tokenize.TokenizerME;
-import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
/**
- *
+ * Creates annotations, writes them to a file, then trains a model and writes it to a file
*/
-public class ModelableImpl implements Modelable {
+public class GenericModelableImpl implements Modelable {
- private TokenizerModel tm;
- private TokenizerME wordBreaker;
- private String path = "c:\\temp\\opennlpmodels\\";
- private String trainingDataPath = "";
- private String modelOutPath = "";
private Set<String> annotatedSentences = new HashSet<String>();
- private Map<String, String> params = new HashMap<String, String>();
+ BaseModelBuilderParams params;
@Override
- public void setParameters(Map<String, String> params) {
+ public void setParameters(BaseModelBuilderParams params) {
this.params = params;
- path = params.get("modelablepath");
- trainingDataPath = path + "\\" + params.get("knownentitytype") + ".train";
- modelOutPath = path + "\\" + params.get("knownentitytype")+".model";
}
@Override
public String annotate(String sentence, String namedEntity, String entityType) {
String annotation = sentence.replace(namedEntity, " <START:" + entityType + "> " + namedEntity + " <END> ");
-
return annotation;
}
@@ -71,7 +57,7 @@ public class ModelableImpl implements Mo
public void writeAnnotatedSentences() {
try {
- FileWriter writer = new FileWriter(trainingDataPath, false);
+ FileWriter writer = new FileWriter(params.getAnnotatedTrainingDataFile(), false);
for (String s : annotatedSentences) {
writer.write(s.replace("\n", " ").trim() + "\n");
@@ -104,13 +90,13 @@ public class ModelableImpl implements Mo
System.out.println("\t\treading training data...");
Charset charset = Charset.forName("UTF-8");
ObjectStream<String> lineStream =
- new PlainTextByLineStream(new FileInputStream(trainingDataPath), charset);
+ new PlainTextByLineStream(new FileInputStream(params.getAnnotatedTrainingDataFile()), charset);
ObjectStream<NameSample> sampleStream = new NameSampleDataStream(lineStream);
TokenNameFinderModel model;
model = NameFinderME.train("en", entityType, sampleStream, null);
sampleStream.close();
- OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(new File(modelOutPath)));
+ OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(params.getModelFile()));
model.serialize(modelOut);
if (modelOut != null) {
modelOut.close();
@@ -126,9 +112,9 @@ public class ModelableImpl implements Mo
TokenNameFinderModel nerModel = null;
try {
- nerModel = new TokenNameFinderModel(new FileInputStream(new File(modelOutPath)));
+ nerModel = new TokenNameFinderModel(new FileInputStream(params.getModelFile()));
} catch (IOException ex) {
- Logger.getLogger(ModelableImpl.class.getName()).log(Level.SEVERE, null, ex);
+ Logger.getLogger(GenericModelableImpl.class.getName()).log(Level.SEVERE, null, ex);
}
return nerModel;
}