You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jz...@apache.org on 2023/01/20 15:24:39 UTC

[opennlp-sandbox] branch master updated: updates sandbox component 'mallet-addon' to be compatible with latest opennlp-tools release (#61)

This is an automated email from the ASF dual-hosted git repository.

jzemerick pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git


The following commit(s) were added to refs/heads/master by this push:
     new f19a3b6  updates sandbox component 'mallet-addon' to be compatible with latest opennlp-tools release (#61)
f19a3b6 is described below

commit f19a3b67b210627f82d41cb5f665b2b99a2aac8a
Author: Martin Wiesner <ma...@users.noreply.github.com>
AuthorDate: Fri Jan 20 16:24:32 2023 +0100

    updates sandbox component 'mallet-addon' to be compatible with latest opennlp-tools release (#61)
    
    - adjusts opennlp-tools to 2.1.0
    - adjusts parent project (org.apache.apache) to version 18
    - adjusts Java language level to 11
    - updates mallet to version 2.0.8 to mitigate several CVEs; excludes its transitive junit, jdom and bsh dependencies and declares newer versions (jdom 1.1.3, bsh 2.0b6) explicitly to mitigate further CVEs
    - adjusts some array declarations to use Java style (e.g. `String[] features`) instead of C style (`String features[]`)
    - improves resource handling of streams
    - removes unused imports
---
 mallet-addon/pom.xml                               | 44 +++++++++++++++++++---
 .../java/opennlp/addons/mallet/CRFTrainer.java     | 15 +++-----
 .../opennlp/addons/mallet/ClassifierModel.java     | 27 +++++++------
 .../addons/mallet/ClassifierModelSerializer.java   | 15 +++-----
 .../java/opennlp/addons/mallet/MaxentTrainer.java  | 18 ++-------
 .../opennlp/addons/mallet/TransducerModel.java     | 20 +++++-----
 .../addons/mallet/TransducerModelSerializer.java   |  9 ++---
 7 files changed, 82 insertions(+), 66 deletions(-)

diff --git a/mallet-addon/pom.xml b/mallet-addon/pom.xml
index c5f2ca9..d1e134f 100644
--- a/mallet-addon/pom.xml
+++ b/mallet-addon/pom.xml
@@ -21,10 +21,17 @@
 
 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 	<modelVersion>4.0.0</modelVersion>
-	
+	<parent>
+		<groupId>org.apache</groupId>
+		<artifactId>apache</artifactId>
+		<!-- TODO OPENNLP-1452 once this is resolved, move to 29 as well. -->
+		<version>18</version>
+		<relativePath />
+	</parent>
+
 	<groupId>kottmann.opennlp</groupId>
 	<artifactId>mallet-addon</artifactId>
-	<version>1.6.0-SNAPSHOT</version>
+	<version>2.1.1-SNAPSHOT</version>
 
 	<packaging>jar</packaging>
 	<name>Apache OpenNLP Mallet Addon</name>
@@ -33,13 +40,37 @@
 		<dependency>
 			<groupId>org.apache.opennlp</groupId>
 			<artifactId>opennlp-tools</artifactId>
-			<version>1.6.1-SNAPSHOT</version>
+			<version>2.1.0</version>
 		</dependency>
 		
 		<dependency>
 			<groupId>cc.mallet</groupId>
 			<artifactId>mallet</artifactId>
-			<version>2.0.7</version>
+			<version>2.0.8</version>
+			<exclusions>
+				<exclusion>
+					<groupId>junit</groupId>
+					<artifactId>junit</artifactId>
+				</exclusion>
+				<exclusion>
+					<groupId>org.jdom</groupId>
+					<artifactId>jdom</artifactId>
+				</exclusion>
+				<exclusion>
+					<groupId>org.beanshell</groupId>
+					<artifactId>bsh</artifactId>
+				</exclusion>
+			</exclusions>
+		</dependency>
+		<dependency>
+			<groupId>org.jdom</groupId>
+			<artifactId>jdom</artifactId>
+			<version>1.1.3</version>
+		</dependency>
+		<dependency>
+			<groupId>org.apache-extras.beanshell</groupId>
+			<artifactId>bsh</artifactId>
+			<version>2.0b6</version>
 		</dependency>
 	</dependencies>
 
@@ -67,8 +98,9 @@
 				<groupId>org.apache.maven.plugins</groupId>
 				<artifactId>maven-compiler-plugin</artifactId>
 				<configuration>
-					<source>1.7</source>
-					<target>1.7</target>
+					<source>11</source>
+					<target>11</target>
+					<compilerArgument>-Xlint</compilerArgument>
 				</configuration>
 			</plugin>
 			<plugin>
diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java b/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java
index 7e6de66..0700e2b 100644
--- a/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java
+++ b/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java
@@ -20,7 +20,6 @@
 package opennlp.addons.mallet;
 
 import java.io.IOException;
-import java.util.Map;
 import java.util.regex.Pattern;
 
 import opennlp.tools.ml.AbstractSequenceTrainer;
@@ -30,7 +29,6 @@ import opennlp.tools.ml.model.SequenceClassificationModel;
 import opennlp.tools.ml.model.SequenceStream;
 import cc.mallet.fst.CRF;
 import cc.mallet.fst.CRFOptimizableByLabelLikelihood;
-import cc.mallet.fst.CRFTrainerByLabelLikelihood;
 import cc.mallet.fst.CRFTrainerByValueGradients;
 import cc.mallet.fst.Transducer;
 import cc.mallet.optimize.Optimizable;
@@ -71,17 +69,17 @@ public class CRFTrainer extends AbstractSequenceTrainer {
     int nameIndex = 0;
     Sequence sequence;
     while ((sequence = sequences.read()) != null) {
-      FeatureVector featureVectors[] = new FeatureVector[sequence.getEvents().length];
-      Label malletOutcomes[] = new Label[sequence.getEvents().length];
+      FeatureVector[] featureVectors = new FeatureVector[sequence.getEvents().length];
+      Label[] malletOutcomes = new Label[sequence.getEvents().length];
 
-      Event events[] = sequence.getEvents();
+      Event[] events = sequence.getEvents();
 
       for (int eventIndex = 0; eventIndex < events.length; eventIndex++) {
 
         Event event = events[eventIndex];
 
-        String features[] = event.getContext();
-        int malletFeatures[] = new int[features.length];
+        String[] features = event.getContext();
+        int[] malletFeatures = new int[features.length];
 
         for (int featureIndex = 0; featureIndex < features.length; featureIndex++) {
           malletFeatures[featureIndex] = dataAlphabet.lookupIndex(
@@ -109,8 +107,7 @@ public class CRFTrainer extends AbstractSequenceTrainer {
     CRF crf = new CRF(trainingData.getDataAlphabet(),
         trainingData.getTargetAlphabet());
 
-    String startStateName = crf.addOrderNStates(trainingData, getOrders(),
-        (boolean[]) null,
+    String startStateName = crf.addOrderNStates(trainingData, getOrders(), null,
         // default label
         "other", Pattern.compile("other,*-cont"), // forbidden pattern
         null, // allowed pattern
diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModel.java b/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModel.java
index 5f6661d..1426be9 100644
--- a/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModel.java
+++ b/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModel.java
@@ -35,18 +35,19 @@ import cc.mallet.types.LabelVector;
 
 class ClassifierModel implements MaxentModel, SerializableArtifact {
 
-  private Classifier classifer;
+  private final Classifier classifier;
 
   public ClassifierModel(Classifier classifer) {
-    this.classifer = classifer;
+    this.classifier = classifer;
   }
 
-  Classifier getClassifer() {
-    return classifer;
+  Classifier getClassifier() {
+    return classifier;
   }
-  
+
+  @Override
   public double[] eval(String[] features) {
-    Alphabet dataAlphabet = classifer.getAlphabet();
+    Alphabet dataAlphabet = classifier.getAlphabet();
 
     List<Integer> malletFeatureList = new ArrayList<>(features.length);
 
@@ -62,15 +63,15 @@ class ClassifierModel implements MaxentModel, SerializableArtifact {
       malletFeatures[i] = malletFeatureList.get(i);
     }
 
-    FeatureVector fv = new FeatureVector(classifer.getAlphabet(),
+    FeatureVector fv = new FeatureVector(classifier.getAlphabet(),
         malletFeatures);
     Instance instance = new Instance(fv, null, null, null);
 
-    Classification result = classifer.classify(instance);
+    Classification result = classifier.classify(instance);
 
     LabelVector labeling = result.getLabelVector();
 
-    LabelAlphabet targetAlphabet = classifer.getLabelAlphabet();
+    LabelAlphabet targetAlphabet = classifier.getLabelAlphabet();
 
     double outcomes[] = new double[targetAlphabet.size()];
     for (int i = 0; i < outcomes.length; i++) {
@@ -84,10 +85,12 @@ class ClassifierModel implements MaxentModel, SerializableArtifact {
     return outcomes;
   }
 
+  @Override
   public double[] eval(String[] context, double[] probs) {
     return eval(context);
   }
 
+  @Override
   public double[] eval(String[] context, float[] values) {
     return eval(context);
   }
@@ -109,17 +112,17 @@ class ClassifierModel implements MaxentModel, SerializableArtifact {
 
   @Override
   public String getOutcome(int i) {
-    return classifer.getLabelAlphabet().lookupLabel(i).getEntry().toString();
+    return classifier.getLabelAlphabet().lookupLabel(i).getEntry().toString();
   }
 
   @Override
   public int getIndex(String outcome) {
-    return classifer.getLabelAlphabet().lookupIndex(outcome);
+    return classifier.getLabelAlphabet().lookupIndex(outcome);
   }
 
   @Override
   public int getNumOutcomes() {
-    return classifer.getLabelAlphabet().size();
+    return classifier.getLabelAlphabet().size();
   }
 
   @Override
diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModelSerializer.java b/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModelSerializer.java
index 9cfb6f2..f3b4806 100644
--- a/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModelSerializer.java
+++ b/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModelSerializer.java
@@ -26,20 +26,16 @@ import java.io.ObjectOutputStream;
 import java.io.OutputStream;
 
 import cc.mallet.classify.Classifier;
-import opennlp.tools.util.InvalidFormatException;
 import opennlp.tools.util.model.ArtifactSerializer;
 
 // The standard method for saving classifiers in Mallet is through Java serialization.
 
-public class ClassifierModelSerializer implements
-    ArtifactSerializer<ClassifierModel> {
+public class ClassifierModelSerializer implements ArtifactSerializer<ClassifierModel> {
 
   @Override
-  public ClassifierModel create(InputStream in) throws IOException,
-      InvalidFormatException {
+  public ClassifierModel create(InputStream in) throws IOException {
 
-    ObjectInputStream ois = new ObjectInputStream(in);
-    try {
+    try ( ObjectInputStream ois = new ObjectInputStream(in)) {
       Classifier classifier = (Classifier) ois.readObject();
       return new ClassifierModel(classifier);
     } catch (ClassNotFoundException e) {
@@ -48,10 +44,9 @@ public class ClassifierModelSerializer implements
   }
 
   @Override
-  public void serialize(ClassifierModel artifact, OutputStream out)
-      throws IOException {
+  public void serialize(ClassifierModel artifact, OutputStream out) throws IOException {
     ObjectOutputStream oos = new ObjectOutputStream(out);
-    oos.writeObject(artifact.getClassifer());
+    oos.writeObject(artifact.getClassifier());
     oos.flush();
   }
 }
diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/MaxentTrainer.java b/mallet-addon/src/main/java/opennlp/addons/mallet/MaxentTrainer.java
index e9524a9..cfcb294 100644
--- a/mallet-addon/src/main/java/opennlp/addons/mallet/MaxentTrainer.java
+++ b/mallet-addon/src/main/java/opennlp/addons/mallet/MaxentTrainer.java
@@ -22,22 +22,12 @@ package opennlp.addons.mallet;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
-import java.util.Map;
 
 import opennlp.tools.ml.AbstractEventTrainer;
 import opennlp.tools.ml.model.DataIndexer;
 import opennlp.tools.ml.model.MaxentModel;
-import cc.mallet.classify.C45Trainer;
 import cc.mallet.classify.Classifier;
-import cc.mallet.classify.MaxEntGETrainer;
-import cc.mallet.classify.MaxEntL1Trainer;
-import cc.mallet.classify.MaxEntPRTrainer;
 import cc.mallet.classify.MaxEntTrainer;
-import cc.mallet.classify.NaiveBayes;
-import cc.mallet.classify.NaiveBayesEMTrainer;
-import cc.mallet.classify.NaiveBayesTrainer;
-import cc.mallet.optimize.LimitedMemoryBFGS;
-import cc.mallet.optimize.Optimizer;
 import cc.mallet.types.Alphabet;
 import cc.mallet.types.FeatureVector;
 import cc.mallet.types.Instance;
@@ -61,13 +51,13 @@ public class MaxentTrainer extends AbstractEventTrainer {
 
     Collection<Instance> instances = new ArrayList<>();
 
-    String predLabels[] = indexer.getPredLabels();
+    String[] predLabels = indexer.getPredLabels();
     
-    int outcomes[] = indexer.getOutcomeList();
+    int[] outcomes = indexer.getOutcomeList();
     for (int contextIndex = 0; contextIndex < indexer.getContexts().length; contextIndex++) {
 
-      int malletFeatures[] = new int[indexer.getContexts()[contextIndex].length];
-      double weights[] = new double[indexer.getContexts()[contextIndex].length];
+      int[] malletFeatures = new int[indexer.getContexts()[contextIndex].length];
+      double[] weights = new double[indexer.getContexts()[contextIndex].length];
 
       for (int featureIndex = 0; featureIndex < malletFeatures.length; featureIndex++) {
         malletFeatures[featureIndex] = dataAlphabet.lookupIndex(
diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModel.java b/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModel.java
index e713d83..91afec3 100644
--- a/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModel.java
+++ b/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModel.java
@@ -36,7 +36,7 @@ import cc.mallet.types.Sequence;
 
 public class TransducerModel<T> implements SequenceClassificationModel<T>, SerializableArtifact {
 
-  private Transducer model;
+  private final Transducer model;
 
   public TransducerModel(Transducer model) {
     this.model = model;
@@ -45,7 +45,8 @@ public class TransducerModel<T> implements SequenceClassificationModel<T>, Seria
   Transducer getModel() {
     return model;
   }
-  
+
+  @Override
   public opennlp.tools.util.Sequence bestSequence(T[] sequence,
       Object[] additionalContext, BeamSearchContextGenerator<T> cg,
       SequenceValidator<T> validator) {
@@ -59,7 +60,8 @@ public class TransducerModel<T> implements SequenceClassificationModel<T>, Seria
     // TODO: How to implement min score filtering here? 
     return bestSequences(numSequences, sequence, additionalContext, cg, validator);
   }
-  
+
+  @Override
   public opennlp.tools.util.Sequence[] bestSequences(int numSequences,
       T[] sequence, Object[] additionalContext,
       BeamSearchContextGenerator<T> cg, SequenceValidator<T> validator) {
@@ -67,16 +69,16 @@ public class TransducerModel<T> implements SequenceClassificationModel<T>, Seria
     // TODO: CRF.getInputAlphabet
     Alphabet dataAlphabet = model.getInputPipe().getAlphabet();
     
-    FeatureVector featureVectors[] = new FeatureVector[sequence.length];
+    FeatureVector[] featureVectors = new FeatureVector[sequence.length];
     
     // TODO:: The feature generator needs to get the detected sequence in the end
     // to update the adaptive data!
-    String prior[] = new String[sequence.length];
+    String[] prior = new String[sequence.length];
     Arrays.fill(prior, "s"); // <- HACK, this will degrade performance!
     
     // TODO: Put together a feature generator which doesn't fail if outcomes is null!
     for (int i = 0; i < sequence.length; i++) {
-      String features[] = cg.getContext(i, sequence, null, additionalContext);
+      String[] features = cg.getContext(i, sequence, null, additionalContext);
       
       List<Integer> malletFeatureList = new ArrayList<>(features.length);
       
@@ -86,7 +88,7 @@ public class TransducerModel<T> implements SequenceClassificationModel<T>, Seria
         }
       }
 
-      int malletFeatures[] = new int[malletFeatureList.size()];
+      int[] malletFeatures = new int[malletFeatureList.size()];
       for (int k = 0; k < malletFeatureList.size(); k++) {
         malletFeatures[k] = malletFeatureList.get(k);
       }
@@ -97,7 +99,7 @@ public class TransducerModel<T> implements SequenceClassificationModel<T>, Seria
     
     FeatureVectorSequence malletSequence = new FeatureVectorSequence(featureVectors);
     
-    Sequence[] answers = null;
+    Sequence[] answers;
     if (numSequences == 1) {
       answers = new Sequence[1];
       answers[0] = model.transduce(malletSequence);
@@ -136,7 +138,7 @@ public class TransducerModel<T> implements SequenceClassificationModel<T>, Seria
     
     Alphabet targetAlphabet = model.getInputPipe().getTargetAlphabet();
     
-    String outcomes[] = new String[targetAlphabet.size()];
+    String[] outcomes = new String[targetAlphabet.size()];
     
     for (int i = 0; i < targetAlphabet.size(); i++) {
       outcomes[i] = targetAlphabet.lookupObject(i).toString();
diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModelSerializer.java b/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModelSerializer.java
index b793ca2..6e05eab 100644
--- a/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModelSerializer.java
+++ b/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModelSerializer.java
@@ -32,10 +32,8 @@ import cc.mallet.fst.Transducer;
 public class TransducerModelSerializer implements ArtifactSerializer<TransducerModel> {
 
   @Override
-  public TransducerModel create(InputStream in) throws IOException,
-      InvalidFormatException {
-    ObjectInputStream ois = new ObjectInputStream(in);
-    try {
+  public TransducerModel create(InputStream in) throws IOException, InvalidFormatException {
+    try (ObjectInputStream ois = new ObjectInputStream(in)) {
       Transducer classifier = (Transducer) ois.readObject();
       return new TransducerModel(classifier);
     } catch (ClassNotFoundException e) {
@@ -44,8 +42,7 @@ public class TransducerModelSerializer implements ArtifactSerializer<TransducerM
   }
 
   @Override
-  public void serialize(TransducerModel artifact, OutputStream out)
-      throws IOException {
+  public void serialize(TransducerModel artifact, OutputStream out) throws IOException {
     ObjectOutputStream oos = new ObjectOutputStream(out);
     oos.writeObject(artifact.getModel());
     oos.flush();