You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ma...@apache.org on 2023/01/20 10:25:00 UTC

[opennlp-sandbox] branch migrate-mallet-addon-to-opennlp-tools-2_1_0 created (now f369e6c)

This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a change to branch migrate-mallet-addon-to-opennlp-tools-2_1_0
in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git


      at f369e6c  updates sandbox component 'mallet-addon' to be compatible with latest opennlp-tools release

This branch includes the following new commits:

     new f369e6c  updates sandbox component 'mallet-addon' to be compatible with latest opennlp-tools release

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  Any revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[opennlp-sandbox] 01/01: updates sandbox component 'mallet-addon' to be compatible with latest opennlp-tools release

Posted by ma...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch migrate-mallet-addon-to-opennlp-tools-2_1_0
in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git

commit f369e6c8b582d89e03baf564b95d4537cc7bf76e
Author: Martin Wiesner <ma...@hs-heilbronn.de>
AuthorDate: Fri Jan 20 11:24:53 2023 +0100

    updates sandbox component 'mallet-addon' to be compatible with latest opennlp-tools release
    
    - adjusts opennlp-tools to 2.1.0
    - adds the parent project (org.apache:apache) at version 18
    - adjusts Java language level to 11
    - updates mallet to version 2.0.8 and, to mitigate several CVEs, excludes its transitive junit, jdom and bsh dependencies, declaring newer jdom (1.1.3) and bsh (2.0b6) versions explicitly
    - adjusts some array declarations to comply with Java, not C, style
    - improves resource handling of streams
    - removes unused imports
---
 mallet-addon/pom.xml                               | 44 +++++++++++++++++++---
 .../java/opennlp/addons/mallet/CRFTrainer.java     | 15 +++-----
 .../opennlp/addons/mallet/ClassifierModel.java     | 27 +++++++------
 .../addons/mallet/ClassifierModelSerializer.java   | 15 +++-----
 .../java/opennlp/addons/mallet/MaxentTrainer.java  | 18 ++-------
 .../opennlp/addons/mallet/TransducerModel.java     | 20 +++++-----
 .../addons/mallet/TransducerModelSerializer.java   |  9 ++---
 7 files changed, 82 insertions(+), 66 deletions(-)

diff --git a/mallet-addon/pom.xml b/mallet-addon/pom.xml
index c5f2ca9..d1e134f 100644
--- a/mallet-addon/pom.xml
+++ b/mallet-addon/pom.xml
@@ -21,10 +21,17 @@
 
 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 	<modelVersion>4.0.0</modelVersion>
-	
+	<parent>
+		<groupId>org.apache</groupId>
+		<artifactId>apache</artifactId>
+		<!-- TODO OPENNLP-1452 once this is resolved, move to 29 as well. -->
+		<version>18</version>
+		<relativePath />
+	</parent>
+
 	<groupId>kottmann.opennlp</groupId>
 	<artifactId>mallet-addon</artifactId>
-	<version>1.6.0-SNAPSHOT</version>
+	<version>2.1.1-SNAPSHOT</version>
 
 	<packaging>jar</packaging>
 	<name>Apache OpenNLP Mallet Addon</name>
@@ -33,13 +40,37 @@
 		<dependency>
 			<groupId>org.apache.opennlp</groupId>
 			<artifactId>opennlp-tools</artifactId>
-			<version>1.6.1-SNAPSHOT</version>
+			<version>2.1.0</version>
 		</dependency>
 		
 		<dependency>
 			<groupId>cc.mallet</groupId>
 			<artifactId>mallet</artifactId>
-			<version>2.0.7</version>
+			<version>2.0.8</version>
+			<exclusions>
+				<exclusion>
+					<groupId>junit</groupId>
+					<artifactId>junit</artifactId>
+				</exclusion>
+				<exclusion>
+					<groupId>org.jdom</groupId>
+					<artifactId>jdom</artifactId>
+				</exclusion>
+				<exclusion>
+					<groupId>org.beanshell</groupId>
+					<artifactId>bsh</artifactId>
+				</exclusion>
+			</exclusions>
+		</dependency>
+		<dependency>
+			<groupId>org.jdom</groupId>
+			<artifactId>jdom</artifactId>
+			<version>1.1.3</version>
+		</dependency>
+		<dependency>
+			<groupId>org.apache-extras.beanshell</groupId>
+			<artifactId>bsh</artifactId>
+			<version>2.0b6</version>
 		</dependency>
 	</dependencies>
 
@@ -67,8 +98,9 @@
 				<groupId>org.apache.maven.plugins</groupId>
 				<artifactId>maven-compiler-plugin</artifactId>
 				<configuration>
-					<source>1.7</source>
-					<target>1.7</target>
+					<source>11</source>
+					<target>11</target>
+					<compilerArgument>-Xlint</compilerArgument>
 				</configuration>
 			</plugin>
 			<plugin>
diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java b/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java
index 7e6de66..0700e2b 100644
--- a/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java
+++ b/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java
@@ -20,7 +20,6 @@
 package opennlp.addons.mallet;
 
 import java.io.IOException;
-import java.util.Map;
 import java.util.regex.Pattern;
 
 import opennlp.tools.ml.AbstractSequenceTrainer;
@@ -30,7 +29,6 @@ import opennlp.tools.ml.model.SequenceClassificationModel;
 import opennlp.tools.ml.model.SequenceStream;
 import cc.mallet.fst.CRF;
 import cc.mallet.fst.CRFOptimizableByLabelLikelihood;
-import cc.mallet.fst.CRFTrainerByLabelLikelihood;
 import cc.mallet.fst.CRFTrainerByValueGradients;
 import cc.mallet.fst.Transducer;
 import cc.mallet.optimize.Optimizable;
@@ -71,17 +69,17 @@ public class CRFTrainer extends AbstractSequenceTrainer {
     int nameIndex = 0;
     Sequence sequence;
     while ((sequence = sequences.read()) != null) {
-      FeatureVector featureVectors[] = new FeatureVector[sequence.getEvents().length];
-      Label malletOutcomes[] = new Label[sequence.getEvents().length];
+      FeatureVector[] featureVectors = new FeatureVector[sequence.getEvents().length];
+      Label[] malletOutcomes = new Label[sequence.getEvents().length];
 
-      Event events[] = sequence.getEvents();
+      Event[] events = sequence.getEvents();
 
       for (int eventIndex = 0; eventIndex < events.length; eventIndex++) {
 
         Event event = events[eventIndex];
 
-        String features[] = event.getContext();
-        int malletFeatures[] = new int[features.length];
+        String[] features = event.getContext();
+        int[] malletFeatures = new int[features.length];
 
         for (int featureIndex = 0; featureIndex < features.length; featureIndex++) {
           malletFeatures[featureIndex] = dataAlphabet.lookupIndex(
@@ -109,8 +107,7 @@ public class CRFTrainer extends AbstractSequenceTrainer {
     CRF crf = new CRF(trainingData.getDataAlphabet(),
         trainingData.getTargetAlphabet());
 
-    String startStateName = crf.addOrderNStates(trainingData, getOrders(),
-        (boolean[]) null,
+    String startStateName = crf.addOrderNStates(trainingData, getOrders(), null,
         // default label
         "other", Pattern.compile("other,*-cont"), // forbidden pattern
         null, // allowed pattern
diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModel.java b/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModel.java
index 5f6661d..1426be9 100644
--- a/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModel.java
+++ b/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModel.java
@@ -35,18 +35,19 @@ import cc.mallet.types.LabelVector;
 
 class ClassifierModel implements MaxentModel, SerializableArtifact {
 
-  private Classifier classifer;
+  private final Classifier classifier;
 
   public ClassifierModel(Classifier classifer) {
-    this.classifer = classifer;
+    this.classifier = classifer;
   }
 
-  Classifier getClassifer() {
-    return classifer;
+  Classifier getClassifier() {
+    return classifier;
   }
-  
+
+  @Override
   public double[] eval(String[] features) {
-    Alphabet dataAlphabet = classifer.getAlphabet();
+    Alphabet dataAlphabet = classifier.getAlphabet();
 
     List<Integer> malletFeatureList = new ArrayList<>(features.length);
 
@@ -62,15 +63,15 @@ class ClassifierModel implements MaxentModel, SerializableArtifact {
       malletFeatures[i] = malletFeatureList.get(i);
     }
 
-    FeatureVector fv = new FeatureVector(classifer.getAlphabet(),
+    FeatureVector fv = new FeatureVector(classifier.getAlphabet(),
         malletFeatures);
     Instance instance = new Instance(fv, null, null, null);
 
-    Classification result = classifer.classify(instance);
+    Classification result = classifier.classify(instance);
 
     LabelVector labeling = result.getLabelVector();
 
-    LabelAlphabet targetAlphabet = classifer.getLabelAlphabet();
+    LabelAlphabet targetAlphabet = classifier.getLabelAlphabet();
 
     double outcomes[] = new double[targetAlphabet.size()];
     for (int i = 0; i < outcomes.length; i++) {
@@ -84,10 +85,12 @@ class ClassifierModel implements MaxentModel, SerializableArtifact {
     return outcomes;
   }
 
+  @Override
   public double[] eval(String[] context, double[] probs) {
     return eval(context);
   }
 
+  @Override
   public double[] eval(String[] context, float[] values) {
     return eval(context);
   }
@@ -109,17 +112,17 @@ class ClassifierModel implements MaxentModel, SerializableArtifact {
 
   @Override
   public String getOutcome(int i) {
-    return classifer.getLabelAlphabet().lookupLabel(i).getEntry().toString();
+    return classifier.getLabelAlphabet().lookupLabel(i).getEntry().toString();
   }
 
   @Override
   public int getIndex(String outcome) {
-    return classifer.getLabelAlphabet().lookupIndex(outcome);
+    return classifier.getLabelAlphabet().lookupIndex(outcome);
   }
 
   @Override
   public int getNumOutcomes() {
-    return classifer.getLabelAlphabet().size();
+    return classifier.getLabelAlphabet().size();
   }
 
   @Override
diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModelSerializer.java b/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModelSerializer.java
index 9cfb6f2..f3b4806 100644
--- a/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModelSerializer.java
+++ b/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModelSerializer.java
@@ -26,20 +26,16 @@ import java.io.ObjectOutputStream;
 import java.io.OutputStream;
 
 import cc.mallet.classify.Classifier;
-import opennlp.tools.util.InvalidFormatException;
 import opennlp.tools.util.model.ArtifactSerializer;
 
 // The standard method for saving classifiers in Mallet is through Java serialization.
 
-public class ClassifierModelSerializer implements
-    ArtifactSerializer<ClassifierModel> {
+public class ClassifierModelSerializer implements ArtifactSerializer<ClassifierModel> {
 
   @Override
-  public ClassifierModel create(InputStream in) throws IOException,
-      InvalidFormatException {
+  public ClassifierModel create(InputStream in) throws IOException {
 
-    ObjectInputStream ois = new ObjectInputStream(in);
-    try {
+    try ( ObjectInputStream ois = new ObjectInputStream(in)) {
       Classifier classifier = (Classifier) ois.readObject();
       return new ClassifierModel(classifier);
     } catch (ClassNotFoundException e) {
@@ -48,10 +44,9 @@ public class ClassifierModelSerializer implements
   }
 
   @Override
-  public void serialize(ClassifierModel artifact, OutputStream out)
-      throws IOException {
+  public void serialize(ClassifierModel artifact, OutputStream out) throws IOException {
     ObjectOutputStream oos = new ObjectOutputStream(out);
-    oos.writeObject(artifact.getClassifer());
+    oos.writeObject(artifact.getClassifier());
     oos.flush();
   }
 }
diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/MaxentTrainer.java b/mallet-addon/src/main/java/opennlp/addons/mallet/MaxentTrainer.java
index e9524a9..cfcb294 100644
--- a/mallet-addon/src/main/java/opennlp/addons/mallet/MaxentTrainer.java
+++ b/mallet-addon/src/main/java/opennlp/addons/mallet/MaxentTrainer.java
@@ -22,22 +22,12 @@ package opennlp.addons.mallet;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
-import java.util.Map;
 
 import opennlp.tools.ml.AbstractEventTrainer;
 import opennlp.tools.ml.model.DataIndexer;
 import opennlp.tools.ml.model.MaxentModel;
-import cc.mallet.classify.C45Trainer;
 import cc.mallet.classify.Classifier;
-import cc.mallet.classify.MaxEntGETrainer;
-import cc.mallet.classify.MaxEntL1Trainer;
-import cc.mallet.classify.MaxEntPRTrainer;
 import cc.mallet.classify.MaxEntTrainer;
-import cc.mallet.classify.NaiveBayes;
-import cc.mallet.classify.NaiveBayesEMTrainer;
-import cc.mallet.classify.NaiveBayesTrainer;
-import cc.mallet.optimize.LimitedMemoryBFGS;
-import cc.mallet.optimize.Optimizer;
 import cc.mallet.types.Alphabet;
 import cc.mallet.types.FeatureVector;
 import cc.mallet.types.Instance;
@@ -61,13 +51,13 @@ public class MaxentTrainer extends AbstractEventTrainer {
 
     Collection<Instance> instances = new ArrayList<>();
 
-    String predLabels[] = indexer.getPredLabels();
+    String[] predLabels = indexer.getPredLabels();
     
-    int outcomes[] = indexer.getOutcomeList();
+    int[] outcomes = indexer.getOutcomeList();
     for (int contextIndex = 0; contextIndex < indexer.getContexts().length; contextIndex++) {
 
-      int malletFeatures[] = new int[indexer.getContexts()[contextIndex].length];
-      double weights[] = new double[indexer.getContexts()[contextIndex].length];
+      int[] malletFeatures = new int[indexer.getContexts()[contextIndex].length];
+      double[] weights = new double[indexer.getContexts()[contextIndex].length];
 
       for (int featureIndex = 0; featureIndex < malletFeatures.length; featureIndex++) {
         malletFeatures[featureIndex] = dataAlphabet.lookupIndex(
diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModel.java b/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModel.java
index e713d83..91afec3 100644
--- a/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModel.java
+++ b/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModel.java
@@ -36,7 +36,7 @@ import cc.mallet.types.Sequence;
 
 public class TransducerModel<T> implements SequenceClassificationModel<T>, SerializableArtifact {
 
-  private Transducer model;
+  private final Transducer model;
 
   public TransducerModel(Transducer model) {
     this.model = model;
@@ -45,7 +45,8 @@ public class TransducerModel<T> implements SequenceClassificationModel<T>, Seria
   Transducer getModel() {
     return model;
   }
-  
+
+  @Override
   public opennlp.tools.util.Sequence bestSequence(T[] sequence,
       Object[] additionalContext, BeamSearchContextGenerator<T> cg,
       SequenceValidator<T> validator) {
@@ -59,7 +60,8 @@ public class TransducerModel<T> implements SequenceClassificationModel<T>, Seria
     // TODO: How to implement min score filtering here? 
     return bestSequences(numSequences, sequence, additionalContext, cg, validator);
   }
-  
+
+  @Override
   public opennlp.tools.util.Sequence[] bestSequences(int numSequences,
       T[] sequence, Object[] additionalContext,
       BeamSearchContextGenerator<T> cg, SequenceValidator<T> validator) {
@@ -67,16 +69,16 @@ public class TransducerModel<T> implements SequenceClassificationModel<T>, Seria
     // TODO: CRF.getInputAlphabet
     Alphabet dataAlphabet = model.getInputPipe().getAlphabet();
     
-    FeatureVector featureVectors[] = new FeatureVector[sequence.length];
+    FeatureVector[] featureVectors = new FeatureVector[sequence.length];
     
     // TODO:: The feature generator needs to get the detected sequence in the end
     // to update the adaptive data!
-    String prior[] = new String[sequence.length];
+    String[] prior = new String[sequence.length];
     Arrays.fill(prior, "s"); // <- HACK, this will degrade performance!
     
     // TODO: Put together a feature generator which doesn't fail if outcomes is null!
     for (int i = 0; i < sequence.length; i++) {
-      String features[] = cg.getContext(i, sequence, null, additionalContext);
+      String[] features = cg.getContext(i, sequence, null, additionalContext);
       
       List<Integer> malletFeatureList = new ArrayList<>(features.length);
       
@@ -86,7 +88,7 @@ public class TransducerModel<T> implements SequenceClassificationModel<T>, Seria
         }
       }
 
-      int malletFeatures[] = new int[malletFeatureList.size()];
+      int[] malletFeatures = new int[malletFeatureList.size()];
       for (int k = 0; k < malletFeatureList.size(); k++) {
         malletFeatures[k] = malletFeatureList.get(k);
       }
@@ -97,7 +99,7 @@ public class TransducerModel<T> implements SequenceClassificationModel<T>, Seria
     
     FeatureVectorSequence malletSequence = new FeatureVectorSequence(featureVectors);
     
-    Sequence[] answers = null;
+    Sequence[] answers;
     if (numSequences == 1) {
       answers = new Sequence[1];
       answers[0] = model.transduce(malletSequence);
@@ -136,7 +138,7 @@ public class TransducerModel<T> implements SequenceClassificationModel<T>, Seria
     
     Alphabet targetAlphabet = model.getInputPipe().getTargetAlphabet();
     
-    String outcomes[] = new String[targetAlphabet.size()];
+    String[] outcomes = new String[targetAlphabet.size()];
     
     for (int i = 0; i < targetAlphabet.size(); i++) {
       outcomes[i] = targetAlphabet.lookupObject(i).toString();
diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModelSerializer.java b/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModelSerializer.java
index b793ca2..6e05eab 100644
--- a/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModelSerializer.java
+++ b/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModelSerializer.java
@@ -32,10 +32,8 @@ import cc.mallet.fst.Transducer;
 public class TransducerModelSerializer implements ArtifactSerializer<TransducerModel> {
 
   @Override
-  public TransducerModel create(InputStream in) throws IOException,
-      InvalidFormatException {
-    ObjectInputStream ois = new ObjectInputStream(in);
-    try {
+  public TransducerModel create(InputStream in) throws IOException, InvalidFormatException {
+    try (ObjectInputStream ois = new ObjectInputStream(in)) {
       Transducer classifier = (Transducer) ois.readObject();
       return new TransducerModel(classifier);
     } catch (ClassNotFoundException e) {
@@ -44,8 +42,7 @@ public class TransducerModelSerializer implements ArtifactSerializer<TransducerM
   }
 
   @Override
-  public void serialize(TransducerModel artifact, OutputStream out)
-      throws IOException {
+  public void serialize(TransducerModel artifact, OutputStream out) throws IOException {
     ObjectOutputStream oos = new ObjectOutputStream(out);
     oos.writeObject(artifact.getModel());
     oos.flush();