You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2014/01/01 19:05:13 UTC

svn commit: r1554661 - /opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java

Author: joern
Date: Wed Jan  1 18:05:13 2014
New Revision: 1554661

URL: http://svn.apache.org/r1554661
Log:
OPENNLP-581 the model now first reads the manifest.properties file and then proceeds with all the other artifacts

Modified:
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java

Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java?rev=1554661&r1=1554660&r2=1554661&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java Wed Jan  1 18:05:13 2014
@@ -48,6 +48,8 @@ import opennlp.tools.util.ext.ExtensionL
  */
 public abstract class BaseModel implements ArtifactProvider {
 
+  private static int MODEL_BUFFER_SIZE_LIMIT = Integer.MAX_VALUE;
+  
   protected static final String MANIFEST_ENTRY = "manifest.properties";
   protected static final String FACTORY_NAME = "factory";
   
@@ -72,8 +74,6 @@ public abstract class BaseModel implemen
   
   private final String componentName;
 
-  private Map<String, byte[]> leftoverArtifacts;
-
   private boolean subclassSerializersInitiated = false;
   private boolean finishedLoadingArtifacts = false;
   
@@ -177,9 +177,6 @@ public abstract class BaseModel implemen
   protected BaseModel(String componentName, InputStream in) throws IOException, InvalidFormatException {
     this(componentName, true);
     
-    if (in == null)
-        throw new IllegalArgumentException("in must not be null!");
-
     loadModel(in);
   }
 
@@ -199,7 +196,7 @@ public abstract class BaseModel implemen
   protected BaseModel(String componentName, URL modelURL) throws IOException, InvalidFormatException  {
     this(componentName, true);
     
-    InputStream in = modelURL.openStream();
+    InputStream in = new BufferedInputStream(modelURL.openStream());
 
     try {
       loadModel(in);
@@ -210,36 +207,55 @@ public abstract class BaseModel implemen
   }
 
   private void loadModel(InputStream in) throws IOException, InvalidFormatException {
+    
+    if (in == null) {
+      throw new IllegalArgumentException("in must not be null!");
+    }
+    
     createBaseArtifactSerializers(artifactSerializers);
 
+    if (!in.markSupported()) {
+      in = new BufferedInputStream(in);
+    }
+    
+    // TODO: Discuss this solution, the buffering should 
+    in.mark(MODEL_BUFFER_SIZE_LIMIT);
+    
     final ZipInputStream zip = new ZipInputStream(in);
     
-    // will read it in two steps, first using the known factories, latter the
-    // unknown.
-    leftoverArtifacts = new HashMap<String, byte[]>();
-
+    // The model package can contain artifacts which are serialized with 3rd party
+    // serializers which are configured in the manifest file. To be able to load
+    // the model the manifest must be read first, and afterwards all the artifacts 
+    // can be de-serialized.
+    
+    // The ordering of artifacts in a zip package is not guaranteed. The stream is first
+    // read until the manifest appears, reset, and read again to load all artifacts.
+    
+    boolean isSearchingForManifest = true;
+    
     ZipEntry entry;
-    while((entry = zip.getNextEntry()) != null ) {
-
-      String extension = getEntryExtension(entry.getName());
+    while((entry = zip.getNextEntry()) != null && isSearchingForManifest) {
 
-      ArtifactSerializer factory = artifactSerializers.get(extension);
-
-      if (factory == null) {
-        /* TODO: find a better solution, that would consume less memory */
-        byte[] bytes = toByteArray(zip);
-        leftoverArtifacts.put(entry.getName(), bytes);
-      } else {
+      if ("manifest.properties".equals(entry.getName())) {
+        // TODO: Probably better to use the serializer here directly!
+        ArtifactSerializer factory = artifactSerializers.get("properties");
         artifactMap.put(entry.getName(), factory.create(zip));
+        isSearchingForManifest = false;
       }
-      
+
       zip.closeEntry();
     }
-
+    
     initializeFactory();
     
     loadArtifactSerializers();
-    finishLoadingArtifacts();
+
+    // The InputStream should always be resettable because if markSupported() returns
+    // false it is wrapped beforehand into a BufferedInputStream
+    in.reset();
+    
+    finishLoadingArtifacts(in);
+    
     checkArtifactMap();
   }
   
@@ -282,41 +298,45 @@ public abstract class BaseModel implemen
   /**
    * Finish loading the artifacts now that it knows all serializers.
    */
-  private void finishLoadingArtifacts()
+  private void finishLoadingArtifacts(InputStream in)
       throws InvalidFormatException, IOException {
-    finishedLoadingArtifacts = true;
-    if (leftoverArtifacts == null || leftoverArtifacts.size() == 0) {
-      return;
-    }
-
+    
+    final ZipInputStream zip = new ZipInputStream(in);
+    
     Map<String, Object> artifactMap = new HashMap<String, Object>();
     
-    for (String entryName : leftoverArtifacts.keySet()) {
+    ZipEntry entry;
+    while((entry = zip.getNextEntry()) != null ) {
       
+      // Note: The manifest.properties file will be read here again,
+      // there should be no need to prevent that.
+      
+      String entryName = entry.getName();
       String extension = getEntryExtension(entryName);
 
-      if (leftoverArtifacts.containsKey(entryName)) {
-        ArtifactSerializer factory = artifactSerializers.get(extension);
+      ArtifactSerializer factory = artifactSerializers.get(extension);
 
-        if (factory == null) {
-          String artifactSerializerClazzName = 
-              getManifestProperty(SERIALIZER_CLASS_NAME_PREFIX + entryName);
+      String artifactSerializerClazzName = 
+          getManifestProperty(SERIALIZER_CLASS_NAME_PREFIX + entryName);
 
-          if (artifactSerializerClazzName != null) {
-            factory = ExtensionLoader.instantiateExtension(ArtifactSerializer.class, artifactSerializerClazzName);
-          }
-        }
-        
-        if (factory == null) {
-          throw new InvalidFormatException("Unknown artifact format: "
-              + extension);
-        } else {
-          artifactMap.put(entryName, factory.create(new ByteArrayInputStream(leftoverArtifacts.get(entryName))));
+      if (artifactSerializerClazzName != null) {
+        if (artifactSerializerClazzName != null) {
+          factory = ExtensionLoader.instantiateExtension(ArtifactSerializer.class, artifactSerializerClazzName);
         }
       }
+      
+      if (factory != null) {
+        artifactMap.put(entryName, factory.create(zip));
+      } else {
+        throw new InvalidFormatException("Unknown artifact format: " + extension);
+      }
+      
+      zip.closeEntry();
     }
-    this.leftoverArtifacts = null;
+
     this.artifactMap.putAll(artifactMap);
+    
+    finishedLoadingArtifacts = true;
   }
 
   /**
@@ -576,7 +596,8 @@ public abstract class BaseModel implemen
       
       ArtifactSerializer serializer = getArtifactSerializer(name);
 
-      if (serializer == null && artifact instanceof SerializableArtifact) {
+      // If the artifact is serializable, always use the serializer it provides
+      if (artifact instanceof SerializableArtifact) {
         
         SerializableArtifact serializableArtifact = (SerializableArtifact) artifact;
 
@@ -622,4 +643,27 @@ public abstract class BaseModel implemen
   public boolean isLoadedFromSerialized() {
     return isLoadedFromSerialized;
   }
+  
+  public static void main(String[] args) throws Exception {
+    
+    // create a stream which can be reset, enclose it in a buffered stream which supports resetting
+    InputStream in = new FileInputStream("annotation.conf");
+    
+    System.out.println("Is mark supported: " + in.markSupported());
+    
+    in = new BufferedInputStream(in);
+    
+    System.out.println("Is mark supported: " + in.markSupported());
+    
+    // mark with a read limit of 4096 bytes (Integer.MAX_VALUE would be the ~2 GB limit)
+    in.mark(4096);
+    
+    in.read();
+    
+    in.reset();
+    
+    // the mark support can be used to test if resetting is supported, we should use this test anyway
+    // to fail gracefully in the cross validators ...
+    
+  }
 }