You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jz...@apache.org on 2023/01/20 15:24:39 UTC
[opennlp-sandbox] branch master updated: updates sandbox component 'mallet-addon' to be compatible with latest opennlp-tools release (#61)
This is an automated email from the ASF dual-hosted git repository.
jzemerick pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git
The following commit(s) were added to refs/heads/master by this push:
new f19a3b6 updates sandbox component 'mallet-addon' to be compatible with latest opennlp-tools release (#61)
f19a3b6 is described below
commit f19a3b67b210627f82d41cb5f665b2b99a2aac8a
Author: Martin Wiesner <ma...@users.noreply.github.com>
AuthorDate: Fri Jan 20 16:24:32 2023 +0100
updates sandbox component 'mallet-addon' to be compatible with latest opennlp-tools release (#61)
- adjusts opennlp-tools to 2.1.0
- adjusts parent project (org.apache.apache) to version 18
- adjusts Java language level to 11
- updates mallet to version 2.0.8 to mitigate several CVEs; excludes its transitive junit, jdom and bsh dependencies and declares newer jdom (1.1.3) and bsh (2.0b6) versions directly
- adjusts some array declarations to use Java style (e.g. `int[] values`) instead of C style (`int values[]`)
- improves resource handling of streams
- removes unused imports
---
mallet-addon/pom.xml | 44 +++++++++++++++++++---
.../java/opennlp/addons/mallet/CRFTrainer.java | 15 +++-----
.../opennlp/addons/mallet/ClassifierModel.java | 27 +++++++------
.../addons/mallet/ClassifierModelSerializer.java | 15 +++-----
.../java/opennlp/addons/mallet/MaxentTrainer.java | 18 ++-------
.../opennlp/addons/mallet/TransducerModel.java | 20 +++++-----
.../addons/mallet/TransducerModelSerializer.java | 9 ++---
7 files changed, 82 insertions(+), 66 deletions(-)
diff --git a/mallet-addon/pom.xml b/mallet-addon/pom.xml
index c5f2ca9..d1e134f 100644
--- a/mallet-addon/pom.xml
+++ b/mallet-addon/pom.xml
@@ -21,10 +21,17 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
-
+ <parent>
+ <groupId>org.apache</groupId>
+ <artifactId>apache</artifactId>
+ <!-- TODO OPENNLP-1452 once this is resolved, move to 29 as well. -->
+ <version>18</version>
+ <relativePath />
+ </parent>
+
<groupId>kottmann.opennlp</groupId>
<artifactId>mallet-addon</artifactId>
- <version>1.6.0-SNAPSHOT</version>
+ <version>2.1.1-SNAPSHOT</version>
<packaging>jar</packaging>
<name>Apache OpenNLP Mallet Addon</name>
@@ -33,13 +40,37 @@
<dependency>
<groupId>org.apache.opennlp</groupId>
<artifactId>opennlp-tools</artifactId>
- <version>1.6.1-SNAPSHOT</version>
+ <version>2.1.0</version>
</dependency>
<dependency>
<groupId>cc.mallet</groupId>
<artifactId>mallet</artifactId>
- <version>2.0.7</version>
+ <version>2.0.8</version>
+ <exclusions>
+ <exclusion>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.jdom</groupId>
+ <artifactId>jdom</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.beanshell</groupId>
+ <artifactId>bsh</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>org.jdom</groupId>
+ <artifactId>jdom</artifactId>
+ <version>1.1.3</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache-extras.beanshell</groupId>
+ <artifactId>bsh</artifactId>
+ <version>2.0b6</version>
</dependency>
</dependencies>
@@ -67,8 +98,9 @@
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
- <source>1.7</source>
- <target>1.7</target>
+ <source>11</source>
+ <target>11</target>
+ <compilerArgument>-Xlint</compilerArgument>
</configuration>
</plugin>
<plugin>
diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java b/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java
index 7e6de66..0700e2b 100644
--- a/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java
+++ b/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java
@@ -20,7 +20,6 @@
package opennlp.addons.mallet;
import java.io.IOException;
-import java.util.Map;
import java.util.regex.Pattern;
import opennlp.tools.ml.AbstractSequenceTrainer;
@@ -30,7 +29,6 @@ import opennlp.tools.ml.model.SequenceClassificationModel;
import opennlp.tools.ml.model.SequenceStream;
import cc.mallet.fst.CRF;
import cc.mallet.fst.CRFOptimizableByLabelLikelihood;
-import cc.mallet.fst.CRFTrainerByLabelLikelihood;
import cc.mallet.fst.CRFTrainerByValueGradients;
import cc.mallet.fst.Transducer;
import cc.mallet.optimize.Optimizable;
@@ -71,17 +69,17 @@ public class CRFTrainer extends AbstractSequenceTrainer {
int nameIndex = 0;
Sequence sequence;
while ((sequence = sequences.read()) != null) {
- FeatureVector featureVectors[] = new FeatureVector[sequence.getEvents().length];
- Label malletOutcomes[] = new Label[sequence.getEvents().length];
+ FeatureVector[] featureVectors = new FeatureVector[sequence.getEvents().length];
+ Label[] malletOutcomes = new Label[sequence.getEvents().length];
- Event events[] = sequence.getEvents();
+ Event[] events = sequence.getEvents();
for (int eventIndex = 0; eventIndex < events.length; eventIndex++) {
Event event = events[eventIndex];
- String features[] = event.getContext();
- int malletFeatures[] = new int[features.length];
+ String[] features = event.getContext();
+ int[] malletFeatures = new int[features.length];
for (int featureIndex = 0; featureIndex < features.length; featureIndex++) {
malletFeatures[featureIndex] = dataAlphabet.lookupIndex(
@@ -109,8 +107,7 @@ public class CRFTrainer extends AbstractSequenceTrainer {
CRF crf = new CRF(trainingData.getDataAlphabet(),
trainingData.getTargetAlphabet());
- String startStateName = crf.addOrderNStates(trainingData, getOrders(),
- (boolean[]) null,
+ String startStateName = crf.addOrderNStates(trainingData, getOrders(), null,
// default label
"other", Pattern.compile("other,*-cont"), // forbidden pattern
null, // allowed pattern
diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModel.java b/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModel.java
index 5f6661d..1426be9 100644
--- a/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModel.java
+++ b/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModel.java
@@ -35,18 +35,19 @@ import cc.mallet.types.LabelVector;
class ClassifierModel implements MaxentModel, SerializableArtifact {
- private Classifier classifer;
+ private final Classifier classifier;
public ClassifierModel(Classifier classifer) {
- this.classifer = classifer;
+ this.classifier = classifer;
}
- Classifier getClassifer() {
- return classifer;
+ Classifier getClassifier() {
+ return classifier;
}
-
+
+ @Override
public double[] eval(String[] features) {
- Alphabet dataAlphabet = classifer.getAlphabet();
+ Alphabet dataAlphabet = classifier.getAlphabet();
List<Integer> malletFeatureList = new ArrayList<>(features.length);
@@ -62,15 +63,15 @@ class ClassifierModel implements MaxentModel, SerializableArtifact {
malletFeatures[i] = malletFeatureList.get(i);
}
- FeatureVector fv = new FeatureVector(classifer.getAlphabet(),
+ FeatureVector fv = new FeatureVector(classifier.getAlphabet(),
malletFeatures);
Instance instance = new Instance(fv, null, null, null);
- Classification result = classifer.classify(instance);
+ Classification result = classifier.classify(instance);
LabelVector labeling = result.getLabelVector();
- LabelAlphabet targetAlphabet = classifer.getLabelAlphabet();
+ LabelAlphabet targetAlphabet = classifier.getLabelAlphabet();
double outcomes[] = new double[targetAlphabet.size()];
for (int i = 0; i < outcomes.length; i++) {
@@ -84,10 +85,12 @@ class ClassifierModel implements MaxentModel, SerializableArtifact {
return outcomes;
}
+ @Override
public double[] eval(String[] context, double[] probs) {
return eval(context);
}
+ @Override
public double[] eval(String[] context, float[] values) {
return eval(context);
}
@@ -109,17 +112,17 @@ class ClassifierModel implements MaxentModel, SerializableArtifact {
@Override
public String getOutcome(int i) {
- return classifer.getLabelAlphabet().lookupLabel(i).getEntry().toString();
+ return classifier.getLabelAlphabet().lookupLabel(i).getEntry().toString();
}
@Override
public int getIndex(String outcome) {
- return classifer.getLabelAlphabet().lookupIndex(outcome);
+ return classifier.getLabelAlphabet().lookupIndex(outcome);
}
@Override
public int getNumOutcomes() {
- return classifer.getLabelAlphabet().size();
+ return classifier.getLabelAlphabet().size();
}
@Override
diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModelSerializer.java b/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModelSerializer.java
index 9cfb6f2..f3b4806 100644
--- a/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModelSerializer.java
+++ b/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModelSerializer.java
@@ -26,20 +26,16 @@ import java.io.ObjectOutputStream;
import java.io.OutputStream;
import cc.mallet.classify.Classifier;
-import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.model.ArtifactSerializer;
// The standard method for saving classifiers in Mallet is through Java serialization.
-public class ClassifierModelSerializer implements
- ArtifactSerializer<ClassifierModel> {
+public class ClassifierModelSerializer implements ArtifactSerializer<ClassifierModel> {
@Override
- public ClassifierModel create(InputStream in) throws IOException,
- InvalidFormatException {
+ public ClassifierModel create(InputStream in) throws IOException {
- ObjectInputStream ois = new ObjectInputStream(in);
- try {
+ try ( ObjectInputStream ois = new ObjectInputStream(in)) {
Classifier classifier = (Classifier) ois.readObject();
return new ClassifierModel(classifier);
} catch (ClassNotFoundException e) {
@@ -48,10 +44,9 @@ public class ClassifierModelSerializer implements
}
@Override
- public void serialize(ClassifierModel artifact, OutputStream out)
- throws IOException {
+ public void serialize(ClassifierModel artifact, OutputStream out) throws IOException {
ObjectOutputStream oos = new ObjectOutputStream(out);
- oos.writeObject(artifact.getClassifer());
+ oos.writeObject(artifact.getClassifier());
oos.flush();
}
}
diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/MaxentTrainer.java b/mallet-addon/src/main/java/opennlp/addons/mallet/MaxentTrainer.java
index e9524a9..cfcb294 100644
--- a/mallet-addon/src/main/java/opennlp/addons/mallet/MaxentTrainer.java
+++ b/mallet-addon/src/main/java/opennlp/addons/mallet/MaxentTrainer.java
@@ -22,22 +22,12 @@ package opennlp.addons.mallet;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
-import java.util.Map;
import opennlp.tools.ml.AbstractEventTrainer;
import opennlp.tools.ml.model.DataIndexer;
import opennlp.tools.ml.model.MaxentModel;
-import cc.mallet.classify.C45Trainer;
import cc.mallet.classify.Classifier;
-import cc.mallet.classify.MaxEntGETrainer;
-import cc.mallet.classify.MaxEntL1Trainer;
-import cc.mallet.classify.MaxEntPRTrainer;
import cc.mallet.classify.MaxEntTrainer;
-import cc.mallet.classify.NaiveBayes;
-import cc.mallet.classify.NaiveBayesEMTrainer;
-import cc.mallet.classify.NaiveBayesTrainer;
-import cc.mallet.optimize.LimitedMemoryBFGS;
-import cc.mallet.optimize.Optimizer;
import cc.mallet.types.Alphabet;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.Instance;
@@ -61,13 +51,13 @@ public class MaxentTrainer extends AbstractEventTrainer {
Collection<Instance> instances = new ArrayList<>();
- String predLabels[] = indexer.getPredLabels();
+ String[] predLabels = indexer.getPredLabels();
- int outcomes[] = indexer.getOutcomeList();
+ int[] outcomes = indexer.getOutcomeList();
for (int contextIndex = 0; contextIndex < indexer.getContexts().length; contextIndex++) {
- int malletFeatures[] = new int[indexer.getContexts()[contextIndex].length];
- double weights[] = new double[indexer.getContexts()[contextIndex].length];
+ int[] malletFeatures = new int[indexer.getContexts()[contextIndex].length];
+ double[] weights = new double[indexer.getContexts()[contextIndex].length];
for (int featureIndex = 0; featureIndex < malletFeatures.length; featureIndex++) {
malletFeatures[featureIndex] = dataAlphabet.lookupIndex(
diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModel.java b/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModel.java
index e713d83..91afec3 100644
--- a/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModel.java
+++ b/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModel.java
@@ -36,7 +36,7 @@ import cc.mallet.types.Sequence;
public class TransducerModel<T> implements SequenceClassificationModel<T>, SerializableArtifact {
- private Transducer model;
+ private final Transducer model;
public TransducerModel(Transducer model) {
this.model = model;
@@ -45,7 +45,8 @@ public class TransducerModel<T> implements SequenceClassificationModel<T>, Seria
Transducer getModel() {
return model;
}
-
+
+ @Override
public opennlp.tools.util.Sequence bestSequence(T[] sequence,
Object[] additionalContext, BeamSearchContextGenerator<T> cg,
SequenceValidator<T> validator) {
@@ -59,7 +60,8 @@ public class TransducerModel<T> implements SequenceClassificationModel<T>, Seria
// TODO: How to implement min score filtering here?
return bestSequences(numSequences, sequence, additionalContext, cg, validator);
}
-
+
+ @Override
public opennlp.tools.util.Sequence[] bestSequences(int numSequences,
T[] sequence, Object[] additionalContext,
BeamSearchContextGenerator<T> cg, SequenceValidator<T> validator) {
@@ -67,16 +69,16 @@ public class TransducerModel<T> implements SequenceClassificationModel<T>, Seria
// TODO: CRF.getInputAlphabet
Alphabet dataAlphabet = model.getInputPipe().getAlphabet();
- FeatureVector featureVectors[] = new FeatureVector[sequence.length];
+ FeatureVector[] featureVectors = new FeatureVector[sequence.length];
// TODO:: The feature generator needs to get the detected sequence in the end
// to update the adaptive data!
- String prior[] = new String[sequence.length];
+ String[] prior = new String[sequence.length];
Arrays.fill(prior, "s"); // <- HACK, this will degrade performance!
// TODO: Put together a feature generator which doesn't fail if outcomes is null!
for (int i = 0; i < sequence.length; i++) {
- String features[] = cg.getContext(i, sequence, null, additionalContext);
+ String[] features = cg.getContext(i, sequence, null, additionalContext);
List<Integer> malletFeatureList = new ArrayList<>(features.length);
@@ -86,7 +88,7 @@ public class TransducerModel<T> implements SequenceClassificationModel<T>, Seria
}
}
- int malletFeatures[] = new int[malletFeatureList.size()];
+ int[] malletFeatures = new int[malletFeatureList.size()];
for (int k = 0; k < malletFeatureList.size(); k++) {
malletFeatures[k] = malletFeatureList.get(k);
}
@@ -97,7 +99,7 @@ public class TransducerModel<T> implements SequenceClassificationModel<T>, Seria
FeatureVectorSequence malletSequence = new FeatureVectorSequence(featureVectors);
- Sequence[] answers = null;
+ Sequence[] answers;
if (numSequences == 1) {
answers = new Sequence[1];
answers[0] = model.transduce(malletSequence);
@@ -136,7 +138,7 @@ public class TransducerModel<T> implements SequenceClassificationModel<T>, Seria
Alphabet targetAlphabet = model.getInputPipe().getTargetAlphabet();
- String outcomes[] = new String[targetAlphabet.size()];
+ String[] outcomes = new String[targetAlphabet.size()];
for (int i = 0; i < targetAlphabet.size(); i++) {
outcomes[i] = targetAlphabet.lookupObject(i).toString();
diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModelSerializer.java b/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModelSerializer.java
index b793ca2..6e05eab 100644
--- a/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModelSerializer.java
+++ b/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModelSerializer.java
@@ -32,10 +32,8 @@ import cc.mallet.fst.Transducer;
public class TransducerModelSerializer implements ArtifactSerializer<TransducerModel> {
@Override
- public TransducerModel create(InputStream in) throws IOException,
- InvalidFormatException {
- ObjectInputStream ois = new ObjectInputStream(in);
- try {
+ public TransducerModel create(InputStream in) throws IOException, InvalidFormatException {
+ try (ObjectInputStream ois = new ObjectInputStream(in)) {
Transducer classifier = (Transducer) ois.readObject();
return new TransducerModel(classifier);
} catch (ClassNotFoundException e) {
@@ -44,8 +42,7 @@ public class TransducerModelSerializer implements ArtifactSerializer<TransducerM
}
@Override
- public void serialize(TransducerModel artifact, OutputStream out)
- throws IOException {
+ public void serialize(TransducerModel artifact, OutputStream out) throws IOException {
ObjectOutputStream oos = new ObjectOutputStream(out);
oos.writeObject(artifact.getModel());
oos.flush();