You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2012/07/11 21:30:04 UTC

svn commit: r1360365 - in /opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools: postag/POSTaggerFactory.java util/model/ArtifactProvider.java util/model/BaseModel.java

Author: colen
Date: Wed Jul 11 19:30:04 2012
New Revision: 1360365

URL: http://svn.apache.org/viewvc?rev=1360365&view=rev
Log:
OPENNLP-521: Added a mechanism to validate the POS Tagger dictionary only when the model is created. Added a flag to BaseModel (exposed through ArtifactProvider) that indicates whether the model was loaded from a stream. This flag is used to decide whether the dictionary should be validated.

Modified:
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerFactory.java
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/model/ArtifactProvider.java
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java

Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerFactory.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerFactory.java?rev=1360365&r1=1360364&r2=1360365&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerFactory.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerFactory.java Wed Jul 11 19:30:04 2012
@@ -173,6 +173,33 @@ public class POSTaggerFactory extends Ba
     }
   }
 
+  protected void validatePOSDictionary(POSDictionary posDict,
+      AbstractModel posModel) throws InvalidFormatException {
+    Set<String> dictTags = new HashSet<String>();
+
+    for (String word : posDict) {
+      Collections.addAll(dictTags, posDict.getTags(word));
+    }
+
+    Set<String> modelTags = new HashSet<String>();
+
+    for (int i = 0; i < posModel.getNumOutcomes(); i++) {
+      modelTags.add(posModel.getOutcome(i));
+    }
+
+    if (!modelTags.containsAll(dictTags)) {
+      StringBuilder unknownTag = new StringBuilder();
+      for (String d : dictTags) {
+        if (!modelTags.contains(d)) {
+          unknownTag.append(d).append(" ");
+        }
+      }
+      throw new InvalidFormatException("Tag dictioinary contains tags "
+          + "which are unknown by the model! The unknown tags are: "
+          + unknownTag.toString());
+    }
+  }
+  
   @Override
   public void validateArtifactMap() throws InvalidFormatException {
     
@@ -183,36 +210,15 @@ public class POSTaggerFactory extends Ba
 
     if (tagdictEntry != null) {
       if (tagdictEntry instanceof POSDictionary) {
-        POSDictionary posDict = (POSDictionary) tagdictEntry;
-        
-        Set<String> dictTags = new HashSet<String>();
-        
-        for (String word : posDict) {
-          Collections.addAll(dictTags, posDict.getTags(word)); 
+        if(!this.artifactProvider.isLoadedFromSerialized()) {
+          AbstractModel posModel = this.artifactProvider
+              .getArtifact(POSModel.POS_MODEL_ENTRY_NAME);
+          POSDictionary posDict = (POSDictionary) tagdictEntry; 
+          validatePOSDictionary(posDict, posModel);
         }
-        
-        Set<String> modelTags = new HashSet<String>();
-        
-        AbstractModel posModel = this.artifactProvider
-            .getArtifact(POSModel.POS_MODEL_ENTRY_NAME);
-        
-        for  (int i = 0; i < posModel.getNumOutcomes(); i++) {
-          modelTags.add(posModel.getOutcome(i));
-        }
-        
-        if (!modelTags.containsAll(dictTags)) {
-          StringBuilder unknownTag = new StringBuilder();
-          for (String d : dictTags) {
-            if(!modelTags.contains(d)) {
-              unknownTag.append(d).append(" ");
-            }
-          }
-          throw new InvalidFormatException("Tag dictioinary contains tags " +
-                "which are unknown by the model! The unknown tags are: " + unknownTag.toString());
-        }
-      }
-      else {
-        throw new InvalidFormatException("Abbreviations dictionary has wrong type!");
+      } else {
+        throw new InvalidFormatException(
+            "POSTag dictionary has wrong type!");
       }
     }
 

Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/model/ArtifactProvider.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/model/ArtifactProvider.java?rev=1360365&r1=1360364&r2=1360365&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/model/ArtifactProvider.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/model/ArtifactProvider.java Wed Jul 11 19:30:04 2012
@@ -44,4 +44,13 @@ public interface ArtifactProvider {
    * @return the language code of this model
    */
   public String getLanguage();
+  
+  /**
+   * Indicates if this provider was loaded from a serialized model. It is useful,
+   * for example, while validating artifacts: you can skip the time-consuming
+   * ones if they were already validated during the serialization.
+   * 
+   * @return true if this model was loaded from a serialized model
+   */
+  public boolean isLoadedFromSerialized();
 }

Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java?rev=1360365&r1=1360364&r2=1360365&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java Wed Jul 11 19:30:04 2012
@@ -70,6 +70,8 @@ public abstract class BaseModel implemen
   private boolean subclassSerializersInitiated = false;
   private boolean finishedLoadingArtifacts = false;
   
+  private final boolean isLoadedFromSerialized;
+
   /**
    * Initializes the current instance. The sub-class constructor should call the
    * method {@link #checkArtifactMap()} to check the artifact map is OK.
@@ -104,6 +106,8 @@ public abstract class BaseModel implemen
   protected BaseModel(String componentName, String languageCode,
       Map<String, String> manifestInfoEntries, BaseToolFactory factory) {
 
+    isLoadedFromSerialized = false;
+
     if (componentName == null)
         throw new IllegalArgumentException("componentName must not be null!");
     
@@ -163,6 +167,8 @@ public abstract class BaseModel implemen
    */
   protected BaseModel(String componentName, InputStream in) throws IOException, InvalidFormatException {
 
+    this.isLoadedFromSerialized = true;
+
     if (componentName == null)
       throw new IllegalArgumentException("componentName must not be null!");
     
@@ -536,4 +542,8 @@ public abstract class BaseModel implemen
     }
     return output.toByteArray();
   }
+
+  public boolean isLoadedFromSerialized() {
+    return isLoadedFromSerialized;
+  }
 }